# Preparation

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Read the CSV while skipping its first line and treating no row as a header,
# then convert the resulting frame to a plain NumPy array.
data = pd.read_csv(
    'virusshare.csv',
    sep=',',
    header=None,
    skiprows=1,
).to_numpy()

# Column 0 is used as the target vector; all remaining columns are features.
Y, X = data[:, 0], data[:, 1:]

def testModel(model, modelName, n_splits=5):
    """Cross-validate ``model`` on the module-level ``X``/``Y`` and print scores.

    The rows of ``X`` are split with ``KFold(n_splits=n_splits)``. For every
    fold the model is fitted on the training rows, used to predict the
    held-out rows, and scored with ``accuracy_score`` and ``f1_score`` (both
    called with their default options).

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. The same object
        is refitted once per fold.
    modelName : str
        Label printed in the first line of the report.
    n_splits : int, default 5
        Number of folds handed to ``KFold``. The default reproduces the
        previous hard-coded behaviour.

    Returns
    -------
    None
        The per-fold and average scores are printed, not returned.
    """
    kf = KFold(n_splits=n_splits)
    acc_scores = []
    f1_scores = []

    for train_index, test_index in kf.split(X):
        train_X, test_X = X[train_index], X[test_index]
        train_Y, test_Y = Y[train_index], Y[test_index]
        model.fit(train_X, train_Y)
        pred_values = model.predict(test_X)
        # scikit-learn metrics are documented as metric(y_true, y_pred);
        # pass the ground truth first.
        acc_scores.append(accuracy_score(test_Y, pred_values))
        f1_scores.append(f1_score(test_Y, pred_values))

    # Divide by the number of folds actually scored so the average stays
    # correct when n_splits is not 5.
    avg_acc_score = sum(acc_scores) / len(acc_scores)
    avg_f1_score = sum(f1_scores) / len(f1_scores)

    print('Method: '+modelName)
    print('accuracy of each fold - {}'.format(acc_scores))
    print('Avg accuracy : {}'.format(avg_acc_score))
    print('f1 of each fold - {}'.format(f1_scores))
    print('Avg f1 : {}'.format(avg_f1_score))


# Logistic Regression (Ridge)

In [29]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with an L2 penalty, fitted by the liblinear solver.
testModel(
    LogisticRegression(penalty="l2", solver="liblinear"),
    "Ridge logistic regression",
)

Method: Ridge logistic regression
accuracy of each fold - [0.73825, 0.73475, 0.7515, 0.74925, 0.7655]
Avg accuracy : 0.7478499999999999
f1 of each fold - [0.7389678384442783, 0.7562600505398576, 0.7607125662012517, 0.7631641086186541, 0.7759197324414715]
Avg f1 : 0.7590048592491028


# Logistic Regression (LASSO)

In [30]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with an L1 penalty, fitted by the liblinear solver.
testModel(
    LogisticRegression(penalty="l1", solver="liblinear"),
    "LASSO logistic regression",
)

Method: LASSO logistic regression
accuracy of each fold - [0.73825, 0.7345, 0.75225, 0.74625, 0.7655]
Avg accuracy : 0.74735
f1 of each fold - [0.7388376153654279, 0.7561983471074379, 0.7621790256779459, 0.7597633136094674, 0.7762404580152671]
Avg f1 : 0.7586437519551092


# Multilayer Perceptron

In [8]:
from sklearn.neural_network import MLPClassifier
# MLP with hidden_layer_sizes=(10, 2), trained by SGD with an adaptive
# learning rate starting at 0.1; random_state is fixed for repeatability.
testModel(
    MLPClassifier(
        hidden_layer_sizes=(10, 2),
        solver='sgd',
        learning_rate="adaptive",
        learning_rate_init=0.1,
        alpha=1e-5,
        max_iter=500,
        random_state=1,
    ),
    "multilayer perceptron",
)

Method: multilayer perceptron
accuracy of each fold - [0.8145, 0.811, 0.8125, 0.81, 0.82425]
Avg accuracy : 0.8144500000000001
f1 of each fold - [0.8070722828913156, 0.8001057641459545, 0.7988197424892705, 0.8028022833419823, 0.814070351758794]
Avg f1 : 0.8045740849254633


# Linear SVM (L1 penalty)

In [20]:
from sklearn.svm import LinearSVC

# Linear SVM with an L1 penalty and squared hinge loss, solved in the primal
# (dual=False) with a raised iteration cap and a tolerance of 0.001.
testModel(
    LinearSVC(
        penalty="l1",
        loss="squared_hinge",
        dual=False,
        tol=0.001,
        max_iter=10000,
    ),
    "SVM",
)

Method: SVM
accuracy of each fold - [0.72825, 0.72975, 0.749, 0.742, 0.76425]
Avg accuracy : 0.74265
f1 of each fold - [0.7300720139061336, 0.7537024379129643, 0.7571359458151911, 0.755334281650071, 0.7746714456391874]
Avg f1 : 0.7541832249847096


# Decision Tree

In [31]:
from sklearn.tree import DecisionTreeClassifier
# Single decision tree with a fixed random_state.
testModel(
    DecisionTreeClassifier(random_state=0),
    "decision tree",
)

Method: decision tree
accuracy of each fold - [0.81975, 0.8125, 0.82275, 0.81975, 0.82325]
Avg accuracy : 0.8196
f1 of each fold - [0.8180671208680294, 0.8095479939055358, 0.8200050774308201, 0.8189806678383128, 0.823028785982478]
Avg f1 : 0.8179259292050352


# Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with max_depth=300 and a fixed random_state.
testModel(
    RandomForestClassifier(random_state=0, max_depth=300),
    "random forest",
)

Method: random forest
accuracy of each fold - [0.8435, 0.8415, 0.84675, 0.85375, 0.84625]
Avg accuracy : 0.8463499999999999
f1 of each fold - [0.8418393127842345, 0.8374358974358974, 0.8439011968423733, 0.8523100227215349, 0.8455162019593067]
Avg f1 : 0.8442005263486694


# Gaussian Naive Bayes

In [19]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default settings.
testModel(
    GaussianNB(),
    "Gaussian Naive Bayes",
)

Method: Gaussian Naive Bayes
accuracy of each fold - [0.58675, 0.57825, 0.58425, 0.61075, 0.58125]
Avg accuracy : 0.58825
f1 of each fold - [0.32777551850345665, 0.31283095723014254, 0.33824114604058897, 0.42609657206044965, 0.34132913881242627]
Avg f1 : 0.34925466652941284


# k Nearest Neighbors (cosine distance)

In [22]:
from sklearn.neighbors import KNeighborsClassifier
# 5-nearest-neighbours classifier using the cosine metric.
testModel(
    KNeighborsClassifier(metric="cosine", n_neighbors=5),
    "kNN (cosine)",
)

Method: kNN (cosine)
accuracy of each fold - [0.829, 0.8265, 0.827, 0.83425, 0.8315]
Avg accuracy : 0.82965
f1 of each fold - [0.8298507462686568, 0.8231396534148827, 0.8231987736331119, 0.8336260978670011, 0.8325049701789264]
Avg f1 : 0.8284640482725159


# k Nearest Neighbors (Euclidean distance)

In [24]:
from sklearn.neighbors import KNeighborsClassifier
# 5-nearest-neighbours classifier using the Euclidean metric.
testModel(
    KNeighborsClassifier(metric="euclidean", n_neighbors=5),
    "kNN (Euclidean)",
)

Method: kNN (Euclidean)
accuracy of each fold - [0.8305, 0.8225, 0.823, 0.827, 0.827]
Avg accuracy : 0.826
f1 of each fold - [0.8274809160305342, 0.820344129554656, 0.8212121212121212, 0.8287976249381495, 0.8288822947576656]
Avg f1 : 0.8253434172986254


# Ridge Logistic Regression + AdaBoost

In [36]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
# Base learner: L2-penalised logistic regression fitted by liblinear.
ridge = LogisticRegression(penalty="l2", solver="liblinear")

# AdaBoost ensemble of 30 estimators built on that base learner.
testModel(
    AdaBoostClassifier(n_estimators=30, estimator=ridge),
    "ridge + AdaBoost",
)

Method: ridge + AdaBoost
accuracy of each fold - [0.7365, 0.72725, 0.7345, 0.73925, 0.7345]
Avg accuracy : 0.7344
f1 of each fold - [0.7484486873508353, 0.7489067894131184, 0.7465393794749402, 0.7569331158238174, 0.7372587827808016]
Avg f1 : 0.7476173509687026


# Random Forest + AdaBoost

In [4]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
# Base learner: random forest with max_depth=10 and a fixed random_state.
rf = RandomForestClassifier(random_state=0, max_depth=10)

# AdaBoost ensemble of 10 estimators built on that base learner.
testModel(
    AdaBoostClassifier(n_estimators=10, estimator=rf),
    "random forest + AdaBoost",
)

Method: random forest + AdaBoost
accuracy of each fold - [0.8545, 0.8495, 0.85325, 0.85, 0.859]
Avg accuracy : 0.8532500000000001
f1 of each fold - [0.8500772797527048, 0.845005149330587, 0.8484379034340304, 0.8474059003051881, 0.8578629032258064]
Avg f1 : 0.8497578272096632


# FTRL (Follow-the-Regularized-Leader)

In [3]:
! pip install datatable
from datatable.models import Ftrl
testModel(Ftrl(),"FTRL")

  error: subprocess-exited-with-error
  
  Preparing metadata (pyproject.toml) did not run successfully.
  exit code: 1
  
  [29 lines of output]
  Traceback (most recent call last):
    File "D:\anaconda3\lib\site-packages\pip\_vendor\pep517\in_process\_in_process.py", line 144, in prepare_metadata_for_build_wheel
      hook = backend.prepare_metadata_for_build_wheel
  AttributeError: module 'ext' has no attribute 'prepare_metadata_for_build_wheel'
  
  During handling of the above exception, another exception occurred:
  
  Traceback (most recent call last):
    File "D:\anaconda3\lib\site-packages\pip\_vendor\pep517\in_process\_in_process.py", line 351, in <module>
      main()
    File "D:\anaconda3\lib\site-packages\pip\_vendor\pep517\in_process\_in_process.py", line 333, in main
      json_out['return_val'] = hook(**hook_input['kwargs'])
    File "D:\anaconda3\lib\site-packages\pip\_vendor\pep517\in_process\_in_process.py", line 148, in prepare_metadata_for_build_wheel
      whl_

Collecting datatable
  Using cached datatable-1.0.0.tar.gz (1.1 MB)
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'error'


ModuleNotFoundError: No module named 'datatable'