In [None]:
import pandas as pd

import numpy as np

from xgboost import XGBClassifier


from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc


import matplotlib.pyplot as plt
import seaborn as sns

# Reading the Input File

- **Caution**: before running
  - Update the input file name and the figure/model output file names
  - Update the `nClasses` value

In [None]:
# CSV file loaded by the reading cell; it must contain a 'condition' column,
# which is used as the class label.
inputFile = 'GS103-GC-Protein-Cons-Cond.csv'

# Run name shown in the titles of the confusion-matrix and ROC figures.
label = 'GS103-GC-Protein-Condition'

# Path prefix for saved figures; the confusion-matrix cell appends "_CM.png"
# and "_CM.pdf" to it. Defaults to the run name so the notebook runs from a
# fresh kernel; point it at another directory/prefix if needed.
graphFile = label

In [None]:
#GSE136103 Condition
# Presumably the number of distinct label classes for this data set; it is not
# referenced by the other cells visible here -- TODO confirm where it is used.
nClasses = 2

In [None]:
%%time
#df = pd.read_csv(inputFile)
headers = [*pd.read_csv(inputFile, nrows=1)]

df = pd.read_csv(inputFile, dtype= np.int8, converters={'condition':str} )


# Pre-Processing
- Drops the *cell barcode* column (if present)
- Extracts the features (x) and labels (y)
- Splits the data set into train and test sets
- Converts dummy variables of labels (y) into normal labels (only for multi-class)

In [None]:
# Only data columns should remain: when the first column name begins with
# "cell", remove that column from the frame in place.
if headers[0].startswith("cell"):
    df.drop(columns=headers[0], inplace=True)

# Re-sync the header list with the frame's current columns.
headers = df.columns

In [None]:
# Feature columns: every column except the 'condition' label.
x_columns = df.columns.drop('condition')

# Feature matrix as a NumPy array.
x = df.loc[:, x_columns].to_numpy()

# Label vector as a NumPy array.
y = df.loc[:, 'condition'].to_numpy()

In [None]:
# Hold out 20% of the rows as the test set. The split is stratified on y and
# uses a fixed seed so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Model Development

- Initializes the model with fixed hyperparameter values
- Fits (trains) the model

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels only, then apply
# that same mapping to the test labels. Fitting the encoder a second time on
# the test labels would build an independent mapping that matches the training
# one only when both splits happen to contain exactly the same classes;
# `transform` instead raises if the test split holds a label unseen in training.
le = LabelEncoder()

y_train_le = le.fit_transform(y_train)
y_test_le = le.transform(y_test)

In [None]:
%%time

model = XGBClassifier(n_estimators = 400, objective='binary:logistic', learning_rate=0.1, 
                      nthread=1, subsample=0.75, max_depth=25, verbosity=3, tree_method="hist")

model.fit(x_train, y_train_le)

In [None]:
# Predict encoded classes for the test rows, then map them back to the
# original label values so they can be compared with y_test.
y_pred_le = model.predict(x_test)
y_pred = le.inverse_transform(y_pred_le)

# Fraction of test rows whose predicted label equals the true label.
acc = accuracy_score(y_test, y_pred)
print(acc)


# Model Evaluation

- Calculates the following metrics
  - TPs, TNs, FPs, FNs
  - Accuracy
  - Specificity
  - Recall (Sensitivity)
  - Precision
  - F1 Score

In [None]:
# Confusion matrix of true vs. predicted test labels
# (rows: true class, columns: predicted class).
cf = confusion_matrix(y_true=y_test, y_pred=y_pred)

cf

In [None]:

# Unpack the four cells of the 2x2 confusion matrix row by row:
# first row -> (tn, fp), second row -> (fn, tp).
(tn, fp), (fn, tp) = confusion_matrix(y_test, y_pred)

# One value per line, in the order TP, TN, FP, FN.
print(tp, tn, fp, fn, sep="\n")

In [None]:
## Accuracy, specificity, precision, recall and F1 score on the test split

# Overall fraction of correct predictions.
acc = accuracy_score(y_test, y_pred)

# Specificity (true-negative rate) from the binary confusion-matrix counts.
specificity = tn / (fp + tn)

# Precision, recall and F1, each macro-averaged over the classes.
# NOTE(review): with average='macro', `rec` is the mean of the per-class
# recalls, not the positive-class sensitivity -- confirm this is intended.
prec, rec, f1 = (
    score(y_test, y_pred, average='macro')
    for score in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)


# Confusion Matrix
  - Creates the confusion matrix
  - Draws the confusion matrix
  - Saves the confusion matrix figure as png and pdf files

In [None]:
# Builds the confusion matrix.
# The class labels are computed first and passed as `labels=` so that the
# row/column order of the matrix is guaranteed to match the tick labels below.
class_names = np.unique(y_test)
matrix = confusion_matrix(y_test, y_pred, labels=class_names)

# Build the plot
plt.figure(figsize=(20,7))
sns.set(font_scale=1.4)
sns.heatmap(matrix, annot=True, annot_kws={'size':14},
            cmap=plt.cm.Greens, linewidths=0.2, fmt=".0f")


# Add labels to the plot.
# Heatmap cell i spans [i, i + 1] on both axes, so a label belongs at i + 0.5;
# placing the x ticks at i would put the class names on the cell edges.
tick_marks = np.arange(len(class_names))
tick_marks2 = tick_marks + 0.5
plt.xticks(tick_marks2, class_names, rotation=0)
plt.yticks(tick_marks2, class_names, rotation=0)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title("%s  (Accuracy : %.2f %%)" % (label, acc*100))

# Save before plt.show(), which may clear the current figure.
# `graphFile` is the output path prefix and must be defined before this cell runs.
plt.savefig(graphFile+"_CM.png", dpi =900, bbox_inches = 'tight')
plt.savefig(graphFile+"_CM.pdf", dpi =900, bbox_inches = 'tight')
plt.show()

In [None]:
%%time
y_prob = model.predict_proba(x_test)

fpr, tpr, _ = roc_curve(y_test_le, y_prob[:,1])
roc_auc = auc(fpr, tpr)

In [None]:
# Compute the ROC inputs in this cell so it can be re-run by itself.
y_prob = model.predict_proba(x_test)

fpr, tpr, thresh = roc_curve(y_test_le, y_prob[:,1])
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(20,10))

# Both lines carry a label: plt.legend() only lists labelled artists, and
# without any it warns and draws nothing. The curve's label reports the AUC.
plt.plot(fpr, tpr, label='ROC curve (AUC = %0.3f)' % roc_auc)

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], 'k--', label='Chance')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve :' + label)
plt.legend(loc="lower right")

plt.show()