In [70]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import copy,pickle,glob

from sklearn.preprocessing import scale
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc,f1_score,precision_score,recall_score,precision_recall_curve

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier

from sklearn.decomposition import TruncatedSVD
from genetic_selection import GeneticSelectionCV

import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_split(samples, labels, test_size):
#     return train_test_split(samples, labels,stratify=labels, test_size=test_size,random_state=45)
    return train_test_split(samples, labels,test_size=test_size,random_state=40)

def draw_confusion_matrix(y_actual, y_predicted):
    cm=confusion_matrix(y_actual, y_predicted)
    labels = ['POsivite', 'Negative']
    no_of_labels = 2
    print("\nConfusion matrix : ")
    x = PrettyTable()
    print("                   Predicted labels →")
    x.field_names = ["Actual labels ↓"] + [str(labels[i]) for i in range(no_of_labels)]
    for i in range(no_of_labels):
        ls = [(str(labels[i]))] + list(cm[i])
        x.add_row(ls)
    print(x) 

def draw_roc(y_actual, scores, title=''):
    fpr, tpr, thresholds = roc_curve(y_actual, scores, pos_label=1)
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr,tpr, label = 'AUC: '+str(round(roc_auc, 4)))
    plt.plot([0,1],[0,1],'k--')
    plt.xlabel('1-Specificity = FPR')
    plt.ylabel("Sensitivity = TPR = Recall")
    plt.title("AUC-ROC Curve: " + title)
    plt.legend(loc = 'lower right')
    plt.savefig(title+'-ROC')
    plt.show()
    
def metric(y_test, y_pred,scores, title=''):
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity = tn /(tn + fp)
    sensitivity = tp/(tp + fn)
    precision = tp/(tp + fp)
    f1 = (precision * sensitivity * 2) / (precision + sensitivity)
    acc = accuracy_score(y_test, y_pred)
    npv = tn/(tn + fn)
    fpr = fp/(fp + tn)
    rmc = 1.0 - acc
    x = PrettyTable()
    metr_list = []
    x.field_names = ["Evaluation Metric", "Score"]
    x.add_row(["Accuracy", round(acc, 4)])
    metr_list.append(round(acc, 4))
    x.add_row(["Specificity", round(specificity, 4)])
    metr_list.append(round(specificity, 4))
    x.add_row(["Sensitivity", round(sensitivity, 4)])
    metr_list.append(round(sensitivity, 4))
    x.add_row(["Precision", round(precision, 4)])
    metr_list.append(round(precision, 4))
    x.add_row(["NPV", round(npv, 4)])
    metr_list.append(round(npv, 4))
    x.add_row(["FPR", round(fpr, 4)])
    metr_list.append(round(fpr, 4))
    x.add_row(["RMC", round(rmc, 4)])
    metr_list.append(round(rmc, 4))
    x.add_row(["F1 score", round(f1, 4)])
    metr_list.append(round(f1, 4))
    print(x)
    draw_confusion_matrix(y_test, y_pred)
    draw_roc(y_test,scores, title)
    print('--------------------------------------------------------------\n\n')
    return metr_list

def rc(clfs, dt, lbel, names):
    data = copy.deepcopy(dt)
    label = copy.deepcopy(lbel)
    nfpr = []
    ntpr = []
    pfpr = []
    ptpr = []
    label[label == 1] = 2
    label[label == 0] = 1
    neg_lbl = copy.deepcopy(label)
    pos_lbl = copy.deepcopy(label)
    neg_lbl[neg_lbl == 1] = 1 
    neg_lbl[neg_lbl == 2] = 0
    pos_lbl[pos_lbl == 1] = 0
    pos_lbl[pos_lbl == 2] = 1
    for i in range(len(clfs)):
        prob = clfs[i].predict_proba(data)
        negatives = prob[:,:1]
        positives = prob[:,1:]
        fpr, tpr, thresholds = roc_curve(neg_lbl, negatives, pos_label=None)
        nfpr.append(fpr)
        ntpr.append(tpr)
        fpr, tpr, thresholds = roc_curve(pos_lbl, positives, pos_label=None)
        pfpr.append(fpr)
        ptpr.append(tpr)
    
    for i in range(len(names)):
        plt.plot(nfpr[i], ntpr[i], label = names[i])
    plt.plot([0,1],[0,1],'k--')
    plt.legend(loc = 'lower right')
    plt.xlabel("True Positive Rate")
    plt.ylabel("False Positive Rate")
    plt.title('Negative Class All Models ROC curves')
    plt.savefig('Negative_Class-ROC')
    plt.show()
    for i in range(len(names)):
        plt.plot(pfpr[i], ptpr[i], label = names[i])
    plt.plot([0,1],[0,1],'k--')
    plt.legend(loc = 'lower right')
    plt.xlabel("True Positive Rate")
    plt.ylabel("False Positive Rate")
    plt.title('Positive Class All Models ROC curves')
    plt.savefig('Positive_Class-ROC')
    plt.show()

In [None]:
def feature_selection(train_X, test_X):
    pass

### Loading Data

In [3]:
complete_X_train = pd.read_csv('outputs/A/MT18117_q1_complete_X_train.csv')
complete_X_test = pd.read_csv('outputs/A/MT18117_q1_complete_X_test.csv')
complete_Y_train = pd.read_csv('outputs/A/MT18117_q1_complete_Y_train.csv',names=['Group'])
complete_Y_test = pd.read_csv('outputs/A/MT18117_q1_complete_Y_test.csv',names=['Group'])

In [4]:
complete_X_train.shape, complete_X_test.shape, complete_Y_train.shape, complete_Y_test.shape

((237, 29459), (103, 29459), (237, 1), (103, 1))

In [5]:
X_train = complete_X_train.values
X_test = complete_X_test.values
y_train = complete_Y_train.values
y_test = complete_Y_test.values

In [6]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((237, 29459), (103, 29459), (237, 1), (103, 1))

### Normalizing

In [7]:
X_train_scale = scale(X_train)
X_test_scale = scale(X_test)

### Genetic Algo with Logistic

In [10]:
estimator = LogisticRegression(solver="liblinear")

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [11]:
selector = GeneticSelectionCV(estimator,
                                  cv=5,
                                  verbose=1,
                                  scoring="accuracy",
                                  max_features=5,
                                  n_population=50,
                                  crossover_proba=0.5,
                                  mutation_proba=0.2,
                                  n_generations=40,
                                  crossover_independent_proba=0.5,
                                  mutation_independent_proba=0.05,
                                  tournament_size=3,
                                  n_gen_no_change=10,
                                  caching=True,
                                  n_jobs=-1)

In [13]:
selector = selector.fit(X_train_scale, y_train)

Selecting features with genetic algorithm.
gen	nevals	avg                  	std                      	min              	max              
0  	50    	[-10000.    14707.06]	[ 0.         94.33120587]	[-10000.  14488.]	[-10000.  14944.]
1  	33    	[-10000.    14616.12]	[ 0.         80.31130431]	[-10000.  14459.]	[-10000.  14804.]
2  	38    	[-10000.    14537.46]	[ 0.         56.93301678]	[-10000.  14421.]	[-10000.  14679.]
3  	29    	[-10000.    14492.22]	[ 0.         63.73108818]	[-10000.  14276.]	[-10000.  14655.]
4  	27    	[-10000.  14451.]    	[ 0.         53.72559911]	[-10000.  14276.]	[-10000.  14586.]
5  	25    	[-10000.    14407.86]	[ 0.         57.55519438]	[-10000.  14276.]	[-10000.  14570.]
6  	28    	[-10000.    14369.48]	[ 0.         51.74716997]	[-10000.  14244.]	[-10000.  14514.]
7  	35    	[-10000.    14347.92]	[ 0.         54.82438873]	[-10000.  14208.]	[-10000.  14516.]
8  	33    	[-10000.    14314.76]	[ 0.        50.0130223]  	[-10000.  14202.]	[-10000.  14461.]
9  	29 

In [83]:
y_pred = selector.predict(X_test_scale)

In [84]:
accuracy_score(y_test, y_pred)

f1_score(y_test, y_pred, pos_label='sPTD')

recall_score(y_test, y_pred, pos_label='sPTD')

precision_score(y_test, y_pred, pos_label='sPTD')

0.8155339805825242

In [37]:
print(len(selector.support_))
selector.n_features_

29459


13595

In [35]:
indices = []
for i in range(len(selector.support_)):
    if(selector.support_[i]==True):
        indices.append(i)

In [89]:
# import joblib
# joblib.dump(indices, 'Features.sav')

# features = joblib.load('Features.sav')

In [54]:
# X_new_complete_train = complete_X_train.iloc[:,indices]
# X_new_complete_test = complete_X_test.iloc[:,indices]

# X_new_train = X_new_complete_train.values
# X_new_test = X_new_complete_test.values

# X_new_train_scale = scale(X_new_train)
# X_new_test_scale = scale(X_new_test)

estimator = estimator.fit(X_new_complete_train, y_train)
pred = estimator.predict(X_new_complete_test)

accuracy_score(y_test, pred)
# precision_score(y_test, pred,pos_label='sPTD')
# recall_score(y_test, pred, pos_label='sPTD')
# f1_score(y_test, svm_pred,pos_label='sPTD')

### Logistic Regression

In [None]:
tuned_parameters = [{'C': [10**-4, 10**-2, 10**0, 10**2, 10**4]}]
logistic = GridSearchCV(LogisticRegression(), tuned_parameters, scoring = 'f1', cv=5,n_jobs=-1)
# print(cross_val_score(logistic, X_train_scale, y_train, cv=10))
logistic.fit(X_train, y_train)
y_pred = logistic.predict(X_test)
scores=logistic.predict_proba(X_test)[:,1]

print('Reults for Logistics regression -')
metr_list = metric(y_test, y_pred, scores, 'Logistics regression')
common_metr_table.append(metr_list)
metr_table_field_names.append('Log regr')

### SVM

In [None]:
Cs = [0.001, 0.01, 0.1, 1, 10,100,1000]
gammas = [0.0001,0.001, 0.01, 0.1, 1]
param_grid = {'C': Cs, 'gamma' : gammas}
rbfsvm = GridSearchCV(svm.SVC(kernel='rbf',probability=True), param_grid, cv=5)
rbfsvm.fit(X_train_scale, y_train)
y_pred = rbfsvm.predict(X_test_scale)
scores=rbfsvm.predict_proba(X_test_scale)[:,1]
print('Reults for SVM classifier -')
metr_list = metric(y_test, y_pred, scores, 'SVM')
common_metr_table.append(metr_list)
metr_table_field_names.append('SVM')

### Decision Trees

In [None]:
depth = np.linspace(1, 32, 32, endpoint=True)
min_split = np.linspace(0.1, 1.0, 10, endpoint=True)
min_samples_lef = np.linspace(0.1, 0.5, 5, endpoint=True)
max_feat = list(range(1,X_train.shape[1]))
param_grid = {'max_depth': depth, 'min_samples_split' : min_split,'min_samples_leaf' : min_samples_lef,'max_features' : max_feat}
dtree = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5,n_jobs=-1)
dtree.fit(X_train, y_train)
y_pred = dtree.predict(X_test)
scores=dtree.predict_proba(X_test)[:,1]
print('Reults for Decision Tree classifier -')
metr_list = metric(y_test, y_pred, scores, 'Decision Tree')
common_metr_table.append(metr_list)
metr_table_field_names.append('Dec Tree')

### MLP

In [None]:
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(3,2), random_state=1)
# print(cross_val_score(mlp, X_train, y_train, cv=10))
mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)
scores=mlp.predict_proba(X_test)[:,1]
print('Reults for MultiLayer Perceptron classifier -')
metr_list = metric(y_test, y_pred,scores, 'MultiLayer Perceptron')
common_metr_table.append(metr_list)
metr_table_field_names.append('ANN')

### Random Forest

In [40]:
# depth = np.linspace(1, 32, 32, endpoint=True)
# min_split = np.linspace(0.1, 1.0, 10, endpoint=True)
# min_samples_lef = np.linspace(0.1, 0.5, 5, endpoint=True)
# est = [20,21,22,23,24,25,26]
# param_grid = {'n_estimators' : est,'max_depth': depth, 'min_samples_split' : min_split,'min_samples_leaf' : min_samples_lef}

# rf = GridSearchCV(RandomForestClassifier(), param_grid, cv=2,n_jobs=-1)
# rf.fit(X_train_scale, y_train)

# y_pred = rf.predict(X_train_scale)
# accuracy_score(y_train, y_pred)