# Oversampling and neural net


### Import Libraries and set constant random state 

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
%matplotlib inline
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import confusion_matrix , classification_report
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')
tf.random.set_seed(1234)
import os
import random
from keras.layers import LeakyReLU

In [2]:
# Single source of truth for every random-number generator seeded in this notebook.
SEED = 1234

# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so assigning it here
# does not change hashing in the already-running kernel; it is only inherited by
# processes started from this kernel afterwards.
os.environ['PYTHONHASHSEED'] = str(SEED)
tf.random.set_seed(SEED)   # TensorFlow global seed
np.random.seed(SEED)       # NumPy legacy global RNG
random.seed(SEED)          # Python stdlib RNG

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout
from keras import optimizers
from sklearn.metrics import confusion_matrix,accuracy_score, roc_curve, auc
from mlxtend.plotting import plot_decision_regions
%matplotlib inline


### Read datasets

In [4]:
# Load the feature table; the first CSV column is used as the row index.
# The path is relative, so the notebook must be run from the directory that
# contains the `Sets/` folder.
PCA_df = pd.read_csv(r"Sets/pca_3.csv",index_col=[0])
# Show a short preview instead of rendering the whole frame as cell output.
PCA_df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Sets/pca_3.csv'

In [None]:
# Load the table that holds the target columns (one is selected via `q` below).
# Unlike PCA_df, this frame keeps the default integer row index.
df = pd.read_csv(r"Sets/drug_onehot_latest.csv")
# Show a short preview instead of rendering the whole frame as cell output.
df.head()

### Run the Network

In [None]:
# Name of the column in `df` that is used as the binary target for the rest of
# the notebook (it is appended to the PCA features and then modelled).
q = 'Atorvastatin calcium tablet'

In [None]:
print("Running for :",q)
# Append the target column to the feature table. A column-wise concat aligns rows
# by index label (outer join), not by position.
trainer = pd.concat([PCA_df, df[q]], axis = 1)
# PCA_df takes its index from the first CSV column while df uses the default index.
# If the two do not line up, the outer join silently adds NaN-padded rows, so fail
# loudly here instead of training on a corrupted table.
assert len(trainer) == len(PCA_df) == len(df), (
    "PCA_df and df are not row-aligned: concat produced %d rows from %d and %d"
    % (len(trainer), len(PCA_df), len(df))
)

In [None]:
# trainer

In [None]:
# Hold out the test set BEFORE over-sampling. Resampling first and splitting
# afterwards places copies of the same minority rows on both sides of the split,
# which leaks training data into the test set and inflates every metric computed
# on it.
train_df, test_df = train_test_split(
    trainer, test_size=0.2, random_state=15, stratify=trainer[q])

# Class sizes inside the training partition only.
class_counts = train_df[q].value_counts()
count_class_0, count_class_1 = class_counts[0], class_counts[1]

# Divide the training rows by class
df_class_0 = train_df[train_df[q] == 0]
df_class_1 = train_df[train_df[q] == 1]
# Resample the smaller class (with replacement) up to the size of the larger one.
# A fixed random_state keeps the draw reproducible regardless of which cells ran
# before this one.
if count_class_0>count_class_1:
    df_class_1_over = df_class_1.sample(count_class_0, replace=True, random_state=15)
    df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)
else:
    df_class_0_over = df_class_0.sample(count_class_1, replace=True, random_state=15)
    df_test_over = pd.concat([df_class_0_over, df_class_1], axis=0)
# Interleave the two classes instead of leaving them stacked one after the other.
df_test_over = df_test_over.sample(frac=1, random_state=15)

print('Random over-sampling:')
print(df_test_over[q].value_counts())

# `df_test_over`, `X` and `y` keep their original names for compatibility, but now
# hold the balanced TRAINING rows only; the test rows are never resampled.
X = df_test_over.drop(q,axis='columns')
y = df_test_over[q]
X_train, y_train = X, y
X_test = test_df.drop(q,axis='columns')
y_test = test_df[q]

# Not referenced by the cells visible in this notebook; kept in case other cells
# rely on them.
loss = keras.losses.BinaryCrossentropy()
weights = -1


In [None]:
# from keras.callbacks import History 
# history = History()

In [None]:
# from keras.callbacks import EarlyStopping
# early_stopping = EarlyStopping(monitor='val_loss', patience=4, mode='auto')

> The network configuration below works well for Atorvastatin.

In [None]:
# DO NOT Modify
# Fully connected binary classifier: 128 -> 64 -> 32 -> 1 (sigmoid). The hidden
# layers use ReLU with L2 kernel regularisation; dropout (0.2) follows the first two.
reg_model = Sequential()
# The input width is read from the training data rather than hard-coded (it was 46),
# so the cell keeps working if the number of feature columns changes.
reg_model.add(Dense(128, input_dim=X_train.shape[1], activation='relu', kernel_regularizer='l2'))
reg_model.add(Dropout(0.2))
reg_model.add(Dense(64, activation='relu',kernel_regularizer='l2'))
reg_model.add(Dropout(0.2))
reg_model.add(Dense(32, activation='relu',  kernel_regularizer='l2'))
reg_model.add(Dense(1, activation='sigmoid'))
reg_model.compile(loss='binary_crossentropy', 
                optimizer='adam', 
                metrics=['accuracy'])


# Train silently for 100 epochs; the held-out split is passed as validation data so
# that `his.history` also records `val_loss` for the plot further down.
his = reg_model.fit(X_train, y_train, 
                            validation_data=(X_test, y_test), 
                            epochs=100, verbose=0)

# Prints the values returned by evaluate(): the loss followed by the compiled metric.
print(reg_model.evaluate(X_test, y_test))

# Turn the sigmoid outputs into hard 0/1 labels by rounding.
y_preds = reg_model.predict(X_test)
y_preds = np.round(y_preds)

print("Classification Report: \n", classification_report(y_test, y_preds))

### Plot characteristic curves

In [None]:
# Per-epoch loss recorded by fit(): the training loss and the loss on the
# validation data that was passed to fit().
for history_key, legend_label in (('loss', 'train'), ('val_loss', 'test')):
    plt.plot(his.history[history_key], label=legend_label)
plt.legend()
plt.show()


In [None]:
# Predict on the test set: keep column 0 of the model output as a 1-D array of
# predicted probabilities.
predictions_NN_prob = reg_model.predict(X_test)[:, 0]

# Hard labels: 1 where the probability is strictly above 0.5, otherwise 0.
predictions_NN_01 = np.where(predictions_NN_prob > 0.5, 1, 0)

In [None]:
# Share of test rows whose thresholded prediction matches the true label.
acc_NN = accuracy_score(y_true=y_test, y_pred=predictions_NN_01)
print('Overall accuracy of Neural Network model:', acc_NN)

In [None]:
# ROC curve from the predicted probabilities (not the thresholded labels).
# `recall` is the true-positive rate returned by roc_curve.
false_positive_rate, recall, thresholds = roc_curve(y_test, predictions_NN_prob)
roc_auc = auc(false_positive_rate, recall)
plt.figure()
# Take the drug name from `q` so the title stays correct when the target changes.
plt.title('Receiver Operating Characteristic of %s (ROC)' % q)
plt.plot(false_positive_rate, recall, 'b', label = 'AUC = %0.3f' %roc_auc)
plt.legend(loc='lower right')
# Diagonal reference line (drawn after the legend, and unlabelled, so it is not listed).
plt.plot([0,1], [0,1], 'r--')
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.0])
plt.ylabel('Recall')
plt.xlabel('Fall-out (1-Specificity)')
plt.show()

In [None]:
# Confusion matrix of the thresholded test predictions.
cm = confusion_matrix(y_test, predictions_NN_01)
# The target holds the class values 0 and 1; name the ticks after those values
# rather than after an unrelated task ('No Default' / 'Default').
labels = ['Class 0', 'Class 1']
plt.figure(figsize=(8,6))
# vmin sets the lower end of the colour scale; the trailing ';' hides the Axes repr.
sns.heatmap(cm,xticklabels=labels, yticklabels=labels, annot=True, fmt='d', cmap="Blues", vmin = 0.2);
# Take the drug name from `q` so the title stays correct when the target changes.
plt.title('Confusion Matrix of %s' % q)
plt.ylabel('True Class')
plt.xlabel('Predicted Class')
plt.show()