In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

data = pd.read_excel("Clean genomic dataset.xlsx")

<h2 id="load_dataset">Virus Zoonotic Potential Model (AdaBoost)</h2>

In [25]:
data['Virus family'] = LabelEncoder().fit_transform(data['Virus family'])
X = data[['Virus family','vSegmentedTF (True:1)', 'Vector-borne or not (True:1)', 'enveloped/non-enveloped (True:1)', 
          'DNA(0)/RNA(1)', 'Replication in the cytoplasm (True:1)','Average genome length (Nucleotides)']].values
Y = data['Zoonotic or not (True:1)'].values

In [26]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=1)

In [27]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

In [28]:
M1 = AdaBoostClassifier(n_estimators=20,algorithm='SAMME').fit(x_train,y_train)
M1

AdaBoostClassifier(algorithm='SAMME', base_estimator=None, learning_rate=1.0,
                   n_estimators=20, random_state=None)

In [29]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics import f1_score


In [30]:
yhat=M1.predict(x_test)

print("Test set ROC_AUC is: ", roc_auc_score(y_test, yhat))
print("Test set Accuracy: ", accuracy_score(y_test, yhat))
print("The jaccard score is", jaccard_similarity_score(y_test, yhat))
print('The F1 score is', f1_score(y_test, yhat, average='weighted'))

Test set ROC_AUC is:  0.7730978260869565
Test set Accuracy:  0.8
The jaccard score is 0.8
The F1 score is 0.7919104369808596




<h2 id="load_dataset">Virus Class Model (KMeans)</h2>

In [31]:
from sklearn import preprocessing
X= preprocessing.StandardScaler().fit(X).transform(X)
X[0:5]

array([[ 1.42843054, -0.57381904,  1.55809915, -0.64180768,  0.71100272,
         0.7051615 , -0.41533527],
       [ 0.88096595, -0.57381904, -0.64180768,  1.55809915, -1.40646439,
         0.7051615 , -0.57899763],
       [ 1.29156439, -0.57381904, -0.64180768, -0.64180768,  0.71100272,
        -1.41811485, -0.44185147],
       [ 1.15469824,  1.74270968,  1.55809915,  1.55809915,  0.71100272,
         0.7051615 , -0.33534937],
       [-1.44575855, -0.57381904,  1.55809915, -0.64180768, -1.40646439,
         0.7051615 ,  2.30404522]])

In [32]:
from sklearn.cluster import KMeans 

In [33]:
M2 = KMeans(init = "k-means++",n_clusters = 2 , n_init = 12).fit(X)
M2


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=2, n_init=12, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

<h1 align=center><font size="4"> Now, these two Supervised and Unsupervised models' weights (M1 in which 1 is for zoonotic and M2 in which 1 is for high risk respectively) are to deployed onto our web portal </font></h1>