In [None]:
import pandas as pd

import numpy as np

from xgboost import XGBClassifier


from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc


import matplotlib.pyplot as plt
import seaborn as sns

import gc

# Reading the Input File

- **Caution**: before running
  - Change the file name, figure, and model file information
  - Change the `nClasses` value to match the number of classes in the data set

In [None]:
# CSV file that is loaded into the data frame.
inputFile = "pbmc_10k_v3-GC-Protein-Consolidated-Abridged-Dummy.csv"

# Caption shown in the titles of the confusion-matrix and ROC figures.
label = "pbmc-GC-Protein-CompleteClasses-CellType"

In [None]:
# Number of classes for each data set this notebook has been configured for.
# The value selected below must match the number of trailing label columns
# in the input file.
_N_CLASSES_BY_DATASET = {
    "GSE115469 (complete classes)": 20,
    "GSE115469 (merged classes)": 11,
    "PBMC": 13,
    "GSE136103 (cell type)": 12,
}

# Data set used for this run.
nClasses = _N_CLASSES_BY_DATASET["PBMC"]

In [None]:
# Read a single row first to learn the column names without loading the file.
headers = [*pd.read_csv(inputFile, nrows=1)]
print(headers[0])

if headers[0].startswith("cell"):
    # The first column is a textual identifier column: parse it with a string
    # converter and request int8 only for the remaining columns. Passing a
    # scalar dtype together with a converter for the same column makes pandas
    # emit a ParserWarning ("Both a converter and dtype were specified ...").
    df = pd.read_csv(
        inputFile,
        dtype={column: np.int8 for column in headers[1:]},
        converters={headers[0]: str},
    )
else:
    # No identifier column: every column is read as int8.
    df = pd.read_csv(inputFile, dtype=np.int8)

In [None]:
# Report the table dimensions as (rows, columns), then preview the first rows.
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.head()

# Pre-Processing
- Drops the *cell barcode* column (if present)
- Extracts the features (x) and labels (y)
- Splits the data set into train and test sets
- Converts the dummy (one-hot) variables of the labels (y) into integer class labels

In [None]:
# Remove the leading identifier column when the file has one (its name starts
# with "cell"); only numeric columns remain afterwards.
has_id_column = headers[0].startswith("cell")
if has_id_column:
    df.drop(columns=[headers[0]], inplace=True)

# From here on `headers` refers to the columns that are left in df.
headers = df.columns

In [None]:
# The last nClasses columns of df are used as the (dummy encoded) label columns.
# Guard against a mis-configured nClasses: df.columns[-0:] would silently select
# *every* column, and a value >= the column count would leave no feature column.
if not 0 < nClasses < len(df.columns):
    raise ValueError(
        "nClasses must be between 1 and {} (number of columns - 1), got {}".format(
            len(df.columns) - 1, nClasses
        )
    )

label_cols = df.columns[-nClasses:]
label_cols

In [None]:
%%time
# Feature columns are all columns that are not label columns.
x_columns = df.columns.drop(label_cols)

# Feature matrix as a NumPy array.
x = df.loc[:, x_columns].to_numpy()
# Dummy encoded label matrix as a NumPy array.
y = df.loc[:, label_cols].values

In [None]:
%%time
# Hold out 20 % of the samples for testing; the split is stratified on the
# label matrix and seeded so that it is reproducible.
x_train, x_test, y_train_dummies, y_test_dummies = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Collapse each dummy encoded row into the index of its largest entry.
y_train_label = y_train_dummies.argmax(axis=1)
y_test_label = y_test_dummies.argmax(axis=1)

In [None]:
# The feature and label arrays were extracted from df above, so the data frame
# is dropped here. NOTE: re-running this cell alone raises NameError because
# df no longer exists; re-run the loading cells first.
del df
# Trigger a garbage collection pass; the cell displays the value returned by
# gc.collect().
gc.collect()

# Model Development

- Initializes the model with fixed hyperparameter values
- Fits (trains) the model

In [None]:
%%time

# Configure the gradient-boosted tree classifier and train it on the training
# split (fit() returns the fitted estimator itself).
model = XGBClassifier(
    n_estimators=25,
    objective='multi:softmax',
    nthread=4,
    max_depth=25,
    num_class=nClasses,
    verbosity=1,
    tree_method="hist",
).fit(x_train, y_train_label)

# Predict the class index of every test sample and report the accuracy.
y_pred_label = model.predict(x_test)
acc = accuracy_score(y_test_label, y_pred_label)
print(acc)

# Model Evaluation

- Calculates the following metrics
  - TPs, TNs, FPs, FNs
  - Accuracy
  - Specificity
  - Recall (Sensitivity)
  - Precision
  - F1 Score

In [None]:
# One-hot encode the predicted labels with a fixed width of nClasses columns.
# pd.get_dummies() creates a column only for labels that actually occur in the
# predictions, so a class that is never predicted would leave y_pred_dummies
# narrower than (and column-misaligned with) y_test_dummies.
pred_ids = np.asarray(y_pred_label).astype(np.intp)
y_pred_dummies = np.zeros((pred_ids.size, nClasses), dtype=np.int8)
y_pred_dummies[np.arange(pred_ids.size), pred_ids] = 1

# One 2x2 confusion matrix per class, computed from the one-hot matrices.
mcm = metrics.multilabel_confusion_matrix(y_test_dummies, y_pred_dummies)
# Fraction of test samples whose complete one-hot row is predicted correctly.
acc = metrics.accuracy_score(y_test_dummies, y_pred_dummies)

In [None]:
# Split the per-class 2x2 matrices into their four cells. Each result holds one
# value per class; sklearn lays every matrix out as [[TN, FP], [FN, TP]].
tn, fp = mcm[:, 0, 0], mcm[:, 0, 1]
fn, tp = mcm[:, 1, 0], mcm[:, 1, 1]

In [None]:
## Computes accuracy, specificity, precision, sensitivity (recall) and F1 score

# Specificity per class, TN / (TN + FP), then the unweighted mean over classes.
spec = tn / (tn + fp)
specificity = sum(spec) / len(spec)

# Overall accuracy plus macro-averaged precision, recall and F1, all computed
# from the integer class labels.
acc = metrics.accuracy_score(y_test_label, y_pred_label)
macro = dict(average='macro')
prec = metrics.precision_score(y_test_label, y_pred_label, **macro)
rec = metrics.recall_score(y_test_label, y_pred_label, **macro)
f1 = metrics.f1_score(y_test_label, y_pred_label, **macro)

print(f"Accuracy score: {acc}")
print(f"Prec score: {prec}")
print(f"Recall score: {rec}")
print(f"F1 score: {f1}")
print(f"Specifity score: {specificity}")

# Confusion Matrix
  - Creates the confusion matrix
  - Draws the confusion matrix
  - Saves the confusion matrix as csv, png, and pdf files

In [None]:
# Builds the confusion matrix.
# Passing the labels explicitly keeps the matrix nClasses x nClasses, and so
# aligned with class_names, even if a class occurs in neither y_test_label nor
# y_pred_label.
matrix = confusion_matrix(y_test_label, y_pred_label, labels=np.arange(nClasses))


# Build the plot
plt.figure(figsize=(20,7))
sns.set(font_scale=1.4)
sns.heatmap(matrix, annot=True, annot_kws={'size':14},
            cmap=plt.cm.Greens, linewidths=0.2, fmt=".0f")


# Add labels to the plot
class_names = label_cols
tick_marks = np.arange(len(class_names))
# The heatmap draws cell i over the interval [i, i + 1], so the centre of each
# cell is at i + 0.5. Both axes use the centres; placing the x ticks at the
# integer positions would put every column label on a cell edge.
tick_marks2 = tick_marks + 0.5
plt.xticks(tick_marks2, class_names, rotation=25)
plt.yticks(tick_marks2, class_names, rotation=0)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')

plt.title("%s  (Accuracy : %.2f %%)" % (label, acc*100))

plt.draw()
plt.show()

In [None]:
# Plot all ROC curves (one-vs-rest, one curve per class)

# A ROC curve needs a continuous score per sample and class. Hard 0/1
# predictions offer only a single threshold, which collapses every curve to one
# operating point, so the curves are built from class probabilities instead.
# The probabilities are obtained as the softmax of the model's raw margins,
# which avoids relying on predict_proba() for a model that was configured with
# the 'multi:softmax' objective.
# NOTE(review): assumes predict(..., output_margin=True) returns an array of
# shape (n_samples, nClasses) - confirm with the installed xgboost version.
margins = np.asarray(model.predict(x_test, output_margin=True), dtype=np.float64)
margins = margins - margins.max(axis=1, keepdims=True)  # numerical stability
y_score = np.exp(margins)
y_score /= y_score.sum(axis=1, keepdims=True)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()

plt.figure(figsize=(20,10))

for i in range(nClasses):
    # Column i of y_test_dummies marks the samples of class i (one-vs-rest).
    fpr[i], tpr[i], _ = roc_curve(y_test_dummies[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot of a ROC curve for a specific class
for i in range(nClasses):
    plt.plot(fpr[i], tpr[i], label='ROC Curve for %s (AUC = %0.2f)' % (label_cols[i], roc_auc[i]))

# Diagonal reference line (true positive rate equal to false positive rate).
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve :' + label)
plt.legend(loc="lower right")

plt.draw()
plt.show()