In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import decomposition
from sklearn.decomposition import FastICA
from sklearn.feature_selection import SelectPercentile as sp
from matplotlib import pyplot
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

In [2]:
# Load the CNS dataset from a path relative to the notebook's working directory.
# NOTE(review): the label column used below is called "1.22", which looks like a
# pandas-deduplicated header ("1", "1.1", ...). Presumably the CSV has no header row
# and the first sample is being consumed as column names -- confirm, and pass
# header=None (selecting the label by position) if that is the case.
data = pd.read_csv("../input/cnsdataset/CNS.csv")
data

In [3]:
# Set the "1.22" column aside; it is passed to the classifiers as the target further down.
# NOTE(review): despite its name, `last_row` holds a column (one value per sample), not a row.
last_row = data["1.22"]
last_row

In [4]:
# Remove the target column so that `data` contains features only.
# Rebinding `data` (instead of dropping in place) together with errors="ignore"
# keeps this cell safe to re-run: a second execution is a no-op rather than a KeyError.
data = data.drop(columns="1.22", errors="ignore")
data

In [5]:
# Total number of missing cells across the whole feature table.
data.isna().sum().sum()

In [6]:
# Standardise every feature column with StandardScaler.
# NOTE(review): the scaler is fitted on all samples, before any train/validation
# split, so validation samples influence the scaling statistics (data leakage).
# Consider fitting the scaler on the training fold only, e.g. via a Pipeline.
scaler = StandardScaler()
# Keep the original column labels and index so features stay identifiable after
# scaling (wrapping the bare array would replace them with 0..n-1).
data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns, index=data.index)
data

In [7]:
def RFC(data, target_row):
    """Fit a random forest on an 80/20 hold-out split and report validation metrics.

    Side effects: shows the ROC curve with matplotlib and prints the ROC AUC,
    macro F1 / precision / recall and the confusion matrix.

    Parameters
    ----------
    data : feature table passed to ``train_test_split``.
    target_row : labels aligned with ``data``. The ROC computation assumes a
        binary target (``roc_curve`` only supports binary labels).

    Returns
    -------
    Accuracy of the hard predictions on the validation split.
    """
    train_x, val_x, train_y, val_y = train_test_split(data, target_row, test_size=0.2, random_state=42)
    model_rfc = RandomForestClassifier(n_estimators=500, random_state=42, max_depth=10)
    model_rfc.fit(train_x, train_y)

    # Hard labels drive the threshold-dependent metrics ...
    preds = model_rfc.predict(val_x)
    # ... while ROC analysis needs a continuous score. Feeding hard labels to
    # roc_curve collapses the curve to a single operating point, so use the
    # predicted probability of the positive class (second column, classes_[1]).
    pos_scores = model_rfc.predict_proba(val_x)[:, 1]

    score = accuracy_score(val_y, preds)
    roc_auc = roc_auc_score(val_y, pos_scores)
    fpr, tpr, _ = roc_curve(val_y, pos_scores)

    pyplot.title("Random forest classifier")
    pyplot.plot(fpr, tpr, linestyle='--')
    pyplot.xlabel('False Positive Rate')
    pyplot.ylabel('True Positive Rate')
    pyplot.show()

    print("ROC score " + str(roc_auc))
    print("F1 score " + str(f1_score(val_y, preds, average="macro")))
    print("Precision score " + str(precision_score(val_y, preds, average="macro")))
    print("Recall score " + str(recall_score(val_y, preds, average="macro")))
    print(confusion_matrix(val_y, preds))
    return score

In [8]:
def DTC(data, target_row):
    """Fit a decision tree on a 70/30 hold-out split and report validation metrics.

    Side effects: shows the ROC curve with matplotlib and prints the ROC AUC,
    macro F1 / precision / recall and the confusion matrix.

    Parameters
    ----------
    data : feature table passed to ``train_test_split``.
    target_row : labels aligned with ``data``. The ROC computation assumes a
        binary target (``roc_curve`` only supports binary labels).

    Returns
    -------
    Accuracy of the hard predictions on the validation split.
    """
    # NOTE(review): this uses a 30% hold-out whereas RFC and KNN use 20%, so the
    # reported scores are not computed on the same validation samples -- confirm
    # whether that is intentional.
    train_x, val_x, train_y, val_y = train_test_split(data, target_row, test_size=0.3, random_state=42)
    model_dtc = DecisionTreeClassifier(max_depth=100, random_state=42)
    model_dtc.fit(train_x, train_y)

    # Hard labels drive the threshold-dependent metrics ...
    preds = model_dtc.predict(val_x)
    # ... while ROC analysis needs a continuous score. Feeding hard labels to
    # roc_curve collapses the curve to a single operating point, so use the
    # predicted probability of the positive class (second column, classes_[1]).
    pos_scores = model_dtc.predict_proba(val_x)[:, 1]

    score = accuracy_score(val_y, preds)
    roc_auc = roc_auc_score(val_y, pos_scores)
    fpr, tpr, _ = roc_curve(val_y, pos_scores)

    pyplot.title("Decision tree classifier")
    pyplot.plot(fpr, tpr, linestyle='--')
    pyplot.xlabel('False Positive Rate')
    pyplot.ylabel('True Positive Rate')
    pyplot.show()

    print("ROC score " + str(roc_auc))
    print("F1 score " + str(f1_score(val_y, preds, average="macro")))
    print("Precision score " + str(precision_score(val_y, preds, average="macro")))
    print("Recall score " + str(recall_score(val_y, preds, average="macro")))
    print(confusion_matrix(val_y, preds))
    return score

In [9]:
def KNN(data, target_row):
    """Fit a 30-nearest-neighbour classifier on an 80/20 hold-out split and report validation metrics.

    Side effects: shows the ROC curve with matplotlib and prints the ROC AUC,
    macro F1 / precision / recall and the confusion matrix.

    Parameters
    ----------
    data : feature table passed to ``train_test_split``.
    target_row : labels aligned with ``data``. The ROC computation assumes a
        binary target (``roc_curve`` only supports binary labels).

    Returns
    -------
    Accuracy of the hard predictions on the validation split.
    """
    train_x, val_x, train_y, val_y = train_test_split(data, target_row, test_size=0.2, random_state=42)
    model_knn = KNeighborsClassifier(n_neighbors=30, n_jobs=-1)
    model_knn.fit(train_x, train_y)

    # Hard labels drive the threshold-dependent metrics ...
    preds = model_knn.predict(val_x)
    # ... while ROC analysis needs a continuous score. Feeding hard labels to
    # roc_curve collapses the curve to a single operating point, so use the
    # predicted probability of the positive class (second column, classes_[1]).
    pos_scores = model_knn.predict_proba(val_x)[:, 1]

    score = accuracy_score(val_y, preds)
    roc_auc = roc_auc_score(val_y, pos_scores)
    fpr, tpr, _ = roc_curve(val_y, pos_scores)

    pyplot.title("K Nearest Neighbor")
    pyplot.plot(fpr, tpr, linestyle='--')
    pyplot.xlabel('False Positive Rate')
    pyplot.ylabel('True Positive Rate')
    pyplot.show()

    print("ROC score " + str(roc_auc))
    print("F1 score " + str(f1_score(val_y, preds, average="macro")))
    print("Precision score " + str(precision_score(val_y, preds, average="macro")))
    print("Recall score " + str(recall_score(val_y, preds, average="macro")))
    print(confusion_matrix(val_y, preds))
    return score

In [10]:
# Evaluate each classifier on `data`; every call plots its ROC curve, prints its
# metrics and returns the validation accuracy, which is printed here.
for evaluate in (RFC, KNN, DTC):
    print("Accuracy score is " + str(evaluate(data, last_row)))

In [11]:
# Project the features onto 30 principal components.
# NOTE(review): PCA is fitted on all samples, including those later held out
# for validation inside the model helpers.
pca = decomposition.PCA(n_components=30, random_state=42)
data_pca = pd.DataFrame(pca.fit_transform(data))
data_pca

In [12]:
# Evaluate each classifier on the PCA-reduced features and print its validation accuracy.
for evaluate in (RFC, KNN, DTC):
    print("Accuracy score is " + str(evaluate(data_pca, last_row)))

In [13]:
# Decompose the features into 40 components with FastICA (seeded for repeatability).
ica = FastICA(n_components=40, random_state=42)
data_ica = pd.DataFrame(ica.fit_transform(data))

In [14]:
# Evaluate each classifier on the ICA-reduced features and print its validation accuracy.
for evaluate in (RFC, KNN, DTC):
    print("Accuracy score is " + str(evaluate(data_ica, last_row)))

In [15]:
# Keep the best-scoring 50% of the features according to SelectPercentile.
# NOTE(review): the `mi` / `data_mi` names suggest mutual information, but the
# selector is created with its default score function -- confirm which scorer
# is intended and pass score_func explicitly if it is not the default.
# NOTE(review): the selection is fitted with the labels of all samples,
# including those later held out for validation (label leakage).
mi = sp(percentile=50)
data_mi = pd.DataFrame(mi.fit_transform(data, last_row))

In [16]:
# Evaluate each classifier on the percentile-selected features and print its validation accuracy.
for evaluate in (RFC, KNN, DTC):
    print("Accuracy score is " + str(evaluate(data_mi, last_row)))