# Modules & Configuration

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.utils import compute_class_weight
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from statistics import mean
from sklearn.model_selection import StratifiedKFold

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

from collections import Counter
import multiprocessing

In [2]:
# Configuration

# Pandas
pd.set_option("display.max_columns", None)
pd.set_option('display.max_colwidth', None)
pd.set_option("display.max_rows", None)
pd.set_option("display.expand_frame_repr", False)
pd.set_option("display.precision", 2)
pd.set_option('display.width', 1000)
pd.options.mode.chained_assignment = None

n_cpus = multiprocessing.cpu_count()

# Seaborn
sns.set()
color_blue = "#0089fa"
color_pink = "#ff0051"
colors = [color_blue, color_pink]
customPalette = sns.set_palette(sns.color_palette(colors))
customDiverging = sns.diverging_palette(207, 341, as_cmap=True)

# Avoid warnings
#warnings.filterwarnings('ignore')

print("Available CPUs: ", n_cpus)

Available CPUs:  16


# Data Loading

In [3]:
#Load CSV fie
CBC_file_dir = "data/ProcessedData-2021-Filtrados.csv"
CBC = pd.read_csv(CBC_file_dir, error_bad_lines=True)
CBC["Clase"] = CBC["Clase"].astype(int)
print('File loaded, shape:(%s, %s)' %((len(CBC), len(CBC.iloc[0]))))

File loaded, shape:(3295, 18)




  CBC = pd.read_csv(CBC_file_dir, error_bad_lines=True)


# Data Preprocessing

In [4]:
CBC = CBC[CBC['Clase'] != 2] 
#CBC['Clase'] = CBC['Clase'].replace(to_replace = 2, value = 1)
CBC['Clase'] = CBC['Clase'].replace(to_replace = 3, value = 1) 
CBC['Clase'] = CBC['Clase'].replace(to_replace = 4, value = 1) 

In [5]:
healthy = CBC.loc[CBC['Clase'] == 0]
diseased = CBC.loc[CBC['Clase'] == 1]

CBC = pd.concat([healthy,diseased])
print(CBC['Clase'].value_counts(sort=False))

0     863
1    1634
Name: Clase, dtype: int64


In [6]:
CBC = CBC.sample(frac = 1)

labels = CBC['Clase']
CBC = CBC.drop('Clase', axis=1)
CBC = CBC.drop('TipoClase', axis=1)
CBC.replace(0,CBC.mean(axis=0),inplace=True)

In [7]:
print(CBC.columns)

Index(['Edad', 'WBC', 'LYAB', 'MOAB', 'NEAB', 'EO', 'BA', 'HEMA', 'HGB', 'HTO', 'VCM', 'MCH', 'CHCM', 'RDW', 'PLT', 'VPM'], dtype='object')


In [8]:
CBC = CBC.to_numpy()

# XGBoost

In [9]:
params = {
        'min_child_weight': [3, 5, 7],
        'gamma': [0.8, 1, 1.2],
        'subsample': [0.3, 0.5],
        'colsample_bytree': [0.3, 0.8, 1],
        'max_depth': [2, 4, 6],
        'learning_rate': [0.01, 0.1, 0.2],
        'n_estimators': [1000, 2000, 3000],
        'scale_pos_weights' : [0.55],
        'reg_lambda' : [1],
        'reg_alpha' : [0]
        }

folds = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True)

In [10]:
model = XGBClassifier(objective='binary:logistic')
grid = GridSearchCV(estimator=model, param_grid=params, scoring='accuracy', n_jobs=8, cv=skf.split(CBC,labels), verbose=3 )
grid.fit(CBC, labels)

Fitting 5 folds for each of 1458 candidates, totalling 7290 fits
Parameters: { "scale_pos_weights" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [11]:
print('\n Best parameters:')
print(grid.best_params_)

print('\n Best score:')
print(grid.best_score_ )


 Best parameters:
{'colsample_bytree': 0.8, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 4, 'min_child_weight': 3, 'n_estimators': 3000, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weights': 0.55, 'subsample': 0.5}

 Best score:
0.9411350701402805


In [12]:
X_train, X_test, y_train, y_test = train_test_split(CBC, labels, test_size = 0.3, stratify=labels)

In [13]:
model = XGBClassifier(n_jobs=0, objective='binary:logistic',
                 colsample_bytree=grid.best_params_["colsample_bytree"],
                 gamma=grid.best_params_["gamma"],
                 learning_rate=grid.best_params_["learning_rate"],
                 max_depth=grid.best_params_["max_depth"],
                 min_child_weight=grid.best_params_["min_child_weight"],
                 n_estimators=grid.best_params_["n_estimators"],
                 subsample=grid.best_params_["subsample"],
                 scale_pos_weights=grid.best_params_["scale_pos_weights"],
                 reg_lambda=grid.best_params_["reg_lambda"],
                 reg_alpha=grid.best_params_["reg_alpha"]
                 )
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Parameters: { "scale_pos_weights" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Accuracy: 93.47%


In [14]:
con_mat = tf.math.confusion_matrix(labels=y_test, predictions=y_pred).numpy()
con_mat_norm = np.around(con_mat.astype('float') / con_mat.sum(axis=1)[:, np.newaxis], decimals=2)
con_mat_df = pd.DataFrame(con_mat_norm, index = ['Healthy', 'Sick'], columns = ['Healthy', 'Sick'])

print('Model Accuracy: ', accuracy_score(y_test, y_pred))
report = classification_report(y_test, y_pred)
print(report)

Model Accuracy:  0.9346666666666666
              precision    recall  f1-score   support

           0       0.93      0.88      0.90       259
           1       0.94      0.96      0.95       491

    accuracy                           0.93       750
   macro avg       0.93      0.92      0.93       750
weighted avg       0.93      0.93      0.93       750

