# Preparation

In [13]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Read the CSV with its first line skipped and no header row, then convert the
# table to a plain NumPy array.
data = pd.read_csv(
    'virusshare.csv',
    sep=',',
    skiprows=1,
    header=None,
).to_numpy()

# Column 0 is used as the target; every remaining column is a feature.
Y = data[:, 0]
X = data[:, 1:]

def testModel(model, modelName, n_splits=5):
    """Cross-validate ``model`` on the module-level ``X``/``Y`` and print its scores.

    The rows of ``X`` are split with ``KFold(n_splits=n_splits)``. For every
    fold the same ``model`` object is fitted on the training rows and used to
    predict the held-out rows; accuracy and F1 are computed for those
    predictions. The per-fold scores and their averages are printed. Nothing
    is returned.

    ``X`` and ``Y`` are looked up as globals at call time, so reassigning
    either of them between calls changes the data being evaluated.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    modelName : str
        Label printed on the ``Method:`` line.
    n_splits : int, default 5
        Number of folds passed to ``KFold``.
    """
    kf = KFold(n_splits=n_splits)
    acc_scores = []
    f1_scores = []

    for train_index, test_index in kf.split(X):
        train_X, test_X = X[train_index], X[test_index]
        train_Y, test_Y = Y[train_index], Y[test_index]
        model.fit(train_X, train_Y)
        pred_values = model.predict(test_X)
        # scikit-learn metrics take (y_true, y_pred) in that order.
        acc_scores.append(accuracy_score(test_Y, pred_values))
        f1_scores.append(f1_score(test_Y, pred_values))

    # Average over the folds actually evaluated instead of a hard-coded 5, so
    # the means stay correct when n_splits is changed.
    avg_acc_score = sum(acc_scores) / len(acc_scores)
    avg_f1_score = sum(f1_scores) / len(f1_scores)

    print('Method: '+modelName)
    print('accuracy of each fold - {}'.format(acc_scores))
    print('Avg accuracy : {}'.format(avg_acc_score))
    print('f1 of each fold - {}'.format(f1_scores))
    print('Avg f1 : {}'.format(avg_f1_score))


# Logistic Regression (Ridge)

In [14]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with an L2 (ridge) penalty, fitted by the liblinear solver.
testModel(
    LogisticRegression(penalty="l2", solver="liblinear"),
    "Ridge logistic regression",
)

Method: Ridge logistic regression
accuracy of each fold - [0.67475, 0.695, 0.68475, 0.68, 0.685]
Avg accuracy : 0.6839000000000001
f1 of each fold - [0.6882338844955668, 0.7174617878647522, 0.6873295313662288, 0.6842624568327578, 0.6987087517934002]
Avg f1 : 0.6951992824705412


# Logistic Regression (LASSO)

In [15]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with an L1 (LASSO) penalty, fitted by the liblinear solver.
testModel(
    LogisticRegression(penalty="l1", solver="liblinear"),
    "LASSO logistic regression",
)

Method: LASSO logistic regression
accuracy of each fold - [0.6755, 0.69525, 0.6835, 0.6805, 0.684]
Avg accuracy : 0.6837500000000001
f1 of each fold - [0.6891762452107278, 0.717497103128621, 0.6872529644268774, 0.685066535239034, 0.6971729755630092]
Avg f1 : 0.6952331647136539


# Multilayer Perceptron

In [17]:
from sklearn.neural_network import MLPClassifier
# Multilayer perceptron with hidden_layer_sizes=(10, 2), trained with SGD using
# an adaptive learning rate; random_state is fixed for repeatability.
testModel(
    MLPClassifier(
        hidden_layer_sizes=(10, 2),
        solver='sgd',
        learning_rate="adaptive",
        learning_rate_init=0.1,
        alpha=1e-5,
        max_iter=500,
        random_state=1,
    ),
    "multilayer perceptron",
)

Method: multilayer perceptron
accuracy of each fold - [0.79775, 0.7895, 0.79875, 0.80075, 0.80125]
Avg accuracy : 0.7976
f1 of each fold - [0.7956554685526648, 0.7900249376558603, 0.7943805874840357, 0.8023803620133894, 0.8010012515644556]
Avg f1 : 0.7966885214540811


# SVM

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM with an L1 penalty and squared hinge loss, solved in the primal
# (dual=False).
testModel(
    LinearSVC(
        penalty="l1",
        loss="squared_hinge",
        dual=False,
        tol=0.001,
        max_iter=10000,
    ),
    "SVM",
)

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Single decision tree with a fixed random_state.
testModel(
    DecisionTreeClassifier(random_state=0),
    "decision tree",
)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest whose trees are capped at max_depth=300, fixed random_state.
testModel(
    RandomForestClassifier(random_state=0, max_depth=300),
    "random forest",
)

# Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with its default settings.
testModel(
    GaussianNB(),
    "Gaussian Naive Bayes",
)

# k Nearest Neighbors (cosine distance)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# 5-nearest-neighbours classifier using the cosine metric.
testModel(
    KNeighborsClassifier(metric="cosine", n_neighbors=5),
    "kNN (cosine)",
)

# k Nearest Neighbors (Euclidean distance)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# 5-nearest-neighbours classifier using the Euclidean metric.
testModel(
    KNeighborsClassifier(metric="euclidean", n_neighbors=5),
    "kNN (Euclidean)",
)

# Ridge Logistic Regression + AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
# AdaBoost (n_estimators=30) over a ridge-penalised logistic regression.
ridge = LogisticRegression(penalty="l2", solver="liblinear")
testModel(
    AdaBoostClassifier(n_estimators=30, estimator=ridge),
    "ridge + AdaBoost",
)

# Random Forest + AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
# AdaBoost (n_estimators=10) over a depth-limited random forest.
rf = RandomForestClassifier(random_state=0, max_depth=10)
testModel(
    AdaBoostClassifier(n_estimators=10, estimator=rf),
    "random forest + AdaBoost",
)

# FTRL

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
# The FTRL experiment below is disabled by wrapping it in a string literal.
# The datatable import lives inside that disabled block as well: left outside,
# it makes this cell raise ModuleNotFoundError on kernels where datatable is
# not installed.
"""
from datatable import Frame

data = pd.read_csv('virusshare.csv', sep=',',skiprows=1, header=None).to_numpy()

X = data[:,1:]
Y = data[:,0]

def testFTRL(model,modelName):
    kf = KFold(n_splits=5)
    acc_scores = []
    f1_scores = []

    for train_index, test_index in kf.split(X):
        train_X, test_X = Frame(X[train_index]), Frame(X[test_index])
        train_Y, test_Y = Frame(Y[train_index]), Frame(Y[test_index])
        model.fit(train_X,train_Y)
        pred_values = model.predict(test_X).to_numpy()
        pred_values = np.rint(pred_values)
        acc = accuracy_score(pred_values, Y[test_index])
        f1 = f1_score(pred_values , Y[test_index])
        acc_scores.append(acc)
        f1_scores.append(f1)

    avg_acc_score = sum(acc_scores)/5
    avg_f1_score = sum(f1_scores)/5

    print('Method: '+modelName)
    print('accuracy of each fold - {}'.format(acc_scores))
    print('Avg accuracy : {}'.format(avg_acc_score))
    print('f1 of each fold - {}'.format(f1_scores))
    print('Avg f1 : {}'.format(avg_f1_score))
from datatable.models import Ftrl
testFTRL(Ftrl(),"FTRL")
"""

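Because the `datatable` package cannot be imported in this Jupyter environment, the FTRL code above is kept disabled inside a string literal, and its results are listed below as plain text rather than as cell output.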

Method: FTRL

accuracy of each fold - [0.8065, 0.80125, 0.7935, 0.796, 0.80175]

Avg accuracy : 0.7998000000000001

f1 of each fold - [0.7913746630727764, 0.785540868626922, 0.7750544662309369, 0.7807630306286942, 0.8050159822965332]

Avg f1 : 0.7875498021711725

## Dimensionality Reduction (PCA) + Models

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier

# Replace the module-level feature matrix with the PCA-reduced features.
# testModel reads the global X, so every evaluation below runs on these
# features; Y is not reassigned here.
X = pd.read_csv('X_pca0.7.csv',sep=',',skiprows=1, header=None).to_numpy()

# Base estimators wrapped by the two AdaBoost ensembles.
ridge = LogisticRegression(penalty="l2", solver="liblinear")
rf = RandomForestClassifier(random_state=0, max_depth=10)

# (estimator, printed label) pairs, evaluated in this order.
pca_experiments = [
    (LogisticRegression(penalty="l2", solver="liblinear"), "Ridge logistic regression"),
    (LogisticRegression(penalty="l1", solver="liblinear"), "LASSO logistic regression"),
    (
        MLPClassifier(
            hidden_layer_sizes=(10, 2),
            solver='sgd',
            learning_rate="adaptive",
            learning_rate_init=0.1,
            alpha=1e-5,
            max_iter=500,
            random_state=1,
        ),
        "multilayer perceptron",
    ),
    (
        LinearSVC(penalty="l1", loss="squared_hinge", dual=False, tol=0.001, max_iter=10000),
        "SVM",
    ),
    (DecisionTreeClassifier(random_state=0), "decision tree"),
    (RandomForestClassifier(random_state=0, max_depth=300), "random forest"),
    (GaussianNB(), "Gaussian Naive Bayes"),
    (KNeighborsClassifier(metric="cosine", n_neighbors=5), "kNN (cosine)"),
    (KNeighborsClassifier(metric="euclidean", n_neighbors=5), "kNN (Euclidean)"),
    (AdaBoostClassifier(n_estimators=30, estimator=ridge), "ridge + AdaBoost"),
    (AdaBoostClassifier(n_estimators=10, estimator=rf), "random forest + AdaBoost"),
]

for estimator, label in pca_experiments:
    testModel(estimator, label)

Method: Ridge logistic regression
accuracy of each fold - [0.67475, 0.695, 0.68475, 0.68, 0.685]
Avg accuracy : 0.6839000000000001
f1 of each fold - [0.6882338844955668, 0.7174617878647522, 0.6873295313662288, 0.6842624568327578, 0.6987087517934002]
Avg f1 : 0.6951992824705412
Method: LASSO logistic regression
accuracy of each fold - [0.6755, 0.69525, 0.6835, 0.6805, 0.684]
Avg accuracy : 0.6837500000000001
f1 of each fold - [0.6891762452107278, 0.717497103128621, 0.6872529644268774, 0.685066535239034, 0.6971729755630092]
Avg f1 : 0.6952331647136539
