In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter("ignore")

In [3]:
# Load the mouse viral-study measurements and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="mouse_viral_study.csv")
df.head(n=5)

Unnamed: 0,Med_1_mL,Med_2_mL,Virus Present
0,6.508231,8.582531,0
1,4.126116,3.073459,1
2,6.42787,6.369758,0
3,3.672953,4.905215,1
4,1.580321,2.440562,1


In [4]:
# Separate the label column from the remaining columns, which serve as features.
y = df["Virus Present"]
X = df.drop(columns=["Virus Present"])

# ***Libraries***

In [5]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_absolute_error

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# ***Train_Test_Split***

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# ***LogisticRegression***

In [8]:
# Baseline: logistic regression with scikit-learn's default settings.
model = LogisticRegression().fit(X_train, y_train)
pred_test = model.predict(X_test)

# Accuracy on the data the model was fitted on vs. the held-out rows.
print("Training Accuracy :", model.score(X_train, y_train))
print("Testing Accuracy :", model.score(X_test, y_test))
# Mean accuracy over 5 folds of the training split (the model is re-fitted per fold).
print("CV Score :", np.mean(cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)))
# For 0/1 labels the MAE equals the misclassification rate on the test split.
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=pred_test))

Training Accuracy : 1.0
Testing Accuracy : 1.0
CV Score : 1.0
MAE : 0.0


In [9]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=pred_test)

array([[35,  0],
       [ 0, 45]], dtype=int64)

In [10]:
# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_true=y_test, y_pred=pred_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        35
           1       1.00      1.00      1.00        45

    accuracy                           1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80



# ***SVM - SVC***

In [11]:
# Linear-kernel support vector classifier with a small C (strong regularisation).
model = SVC(kernel="linear", C=0.001).fit(X_train, y_train)
pred_test = model.predict(X_test)

# Accuracy on the data the model was fitted on vs. the held-out rows.
print("Training Accuracy :", model.score(X_train, y_train))
print("Testing Accuracy :", model.score(X_test, y_test))
# Mean accuracy over 5 folds of the training split (the model is re-fitted per fold).
print("CV Score :", np.mean(cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)))
# For 0/1 labels the MAE equals the misclassification rate on the test split.
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=pred_test))

Training Accuracy : 1.0
Testing Accuracy : 1.0
CV Score : 1.0
MAE : 0.0


In [12]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=pred_test)

array([[35,  0],
       [ 0, 45]], dtype=int64)

In [13]:
# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_true=y_test, y_pred=pred_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        35
           1       1.00      1.00      1.00        45

    accuracy                           1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80



# ***KNeighborsClassifier***

In [14]:
# 1-nearest-neighbour classifier. With k=1 each training point is normally its
# own nearest neighbour, so a perfect training accuracy is expected by construction.
model = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
pred_test = model.predict(X_test)

# Accuracy on the data the model was fitted on vs. the held-out rows.
print("Training Accuracy :", model.score(X_train, y_train))
print("Testing Accuracy :", model.score(X_test, y_test))
# Mean accuracy over 5 folds of the training split (the model is re-fitted per fold).
print("CV Score :", np.mean(cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)))
# For 0/1 labels the MAE equals the misclassification rate on the test split.
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=pred_test))

Training Accuracy : 1.0
Testing Accuracy : 1.0
CV Score : 1.0
MAE : 0.0


In [15]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=pred_test)

array([[35,  0],
       [ 0, 45]], dtype=int64)

In [16]:
# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_true=y_test, y_pred=pred_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        35
           1       1.00      1.00      1.00        45

    accuracy                           1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80



# ***DecisionTreeClassifier***

In [25]:
# Shallow decision tree (at most two levels of splits), Gini impurity, fixed seed.
model = DecisionTreeClassifier(
    criterion="gini",
    max_depth=2,
    random_state=0,
).fit(X_train, y_train)
pred_test = model.predict(X_test)

# Accuracy on the data the model was fitted on vs. the held-out rows.
print("Training Accuracy :", model.score(X_train, y_train))
print("Testing Accuracy :", model.score(X_test, y_test))
# Mean accuracy over 5 folds of the training split (the model is re-fitted per fold).
print("CV Score :", np.mean(cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)))
# For 0/1 labels the MAE equals the misclassification rate on the test split.
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=pred_test))

Training Accuracy : 1.0
Testing Accuracy : 0.9875
CV Score : 0.996875
MAE : 0.0125


In [26]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=pred_test)

array([[34,  1],
       [ 0, 45]], dtype=int64)

In [27]:
# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_true=y_test, y_pred=pred_test))

              precision    recall  f1-score   support

           0       1.00      0.97      0.99        35
           1       0.98      1.00      0.99        45

    accuracy                           0.99        80
   macro avg       0.99      0.99      0.99        80
weighted avg       0.99      0.99      0.99        80



# ***RandomForestClassifier***

In [28]:
# Small random forest of seven trees; the seed fixes the bootstrap/feature sampling.
model = RandomForestClassifier(
    n_estimators=7,
    random_state=0,
).fit(X_train, y_train)
pred_test = model.predict(X_test)

# Accuracy on the data the model was fitted on vs. the held-out rows.
print("Training Accuracy :", model.score(X_train, y_train))
print("Testing Accuracy :", model.score(X_test, y_test))
# Mean accuracy over 5 folds of the training split (the model is re-fitted per fold).
print("CV Score :", np.mean(cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)))
# For 0/1 labels the MAE equals the misclassification rate on the test split.
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=pred_test))

Training Accuracy : 1.0
Testing Accuracy : 1.0
CV Score : 0.99375
MAE : 0.0


In [29]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=pred_test)

array([[35,  0],
       [ 0, 45]], dtype=int64)

In [30]:
# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_true=y_test, y_pred=pred_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        35
           1       1.00      1.00      1.00        45

    accuracy                           1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80



# ***AdaBoostClassifier***

In [31]:
# AdaBoost ensemble limited to two boosting rounds, fixed seed.
model = AdaBoostClassifier(
    n_estimators=2,
    random_state=0,
).fit(X_train, y_train)
pred_test = model.predict(X_test)

# Accuracy on the data the model was fitted on vs. the held-out rows.
print("Training Accuracy :", model.score(X_train, y_train))
print("Testing Accuracy :", model.score(X_test, y_test))
# Mean accuracy over 5 folds of the training split (the model is re-fitted per fold).
print("CV Score :", np.mean(cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)))
# For 0/1 labels the MAE equals the misclassification rate on the test split.
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=pred_test))

Training Accuracy : 1.0
Testing Accuracy : 1.0
CV Score : 0.996875
MAE : 0.0


In [32]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=pred_test)

array([[35,  0],
       [ 0, 45]], dtype=int64)

In [33]:
# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_true=y_test, y_pred=pred_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        35
           1       1.00      1.00      1.00        45

    accuracy                           1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80



# ***GradientBoostingClassifier***

In [38]:
# Gradient boosting restricted to a single boosting stage (learning rate 0.1), fixed seed.
model = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=1,
    random_state=0,
).fit(X_train, y_train)
pred_test = model.predict(X_test)

# Accuracy on the data the model was fitted on vs. the held-out rows.
print("Training Accuracy :", model.score(X_train, y_train))
print("Testing Accuracy :", model.score(X_test, y_test))
# Mean accuracy over 5 folds of the training split (the model is re-fitted per fold).
print("CV Score :", np.mean(cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)))
# For 0/1 labels the MAE equals the misclassification rate on the test split.
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=pred_test))

Training Accuracy : 1.0
Testing Accuracy : 0.9875
CV Score : 0.996875
MAE : 0.0125


In [35]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=pred_test)

array([[34,  1],
       [ 0, 45]], dtype=int64)

In [36]:
# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_true=y_test, y_pred=pred_test))

              precision    recall  f1-score   support

           0       1.00      0.97      0.99        35
           1       0.98      1.00      0.99        45

    accuracy                           0.99        80
   macro avg       0.99      0.99      0.99        80
weighted avg       0.99      0.99      0.99        80



# ***XGBClassifier***

In [39]:
# XGBoost classifier: 17 boosting rounds, learning rate 0.1, gamma 0, fixed seed.
model = XGBClassifier(
    n_estimators=17,
    learning_rate=0.1,
    gamma=0,
    random_state=0,
).fit(X_train, y_train)
pred_test = model.predict(X_test)

# Accuracy on the data the model was fitted on vs. the held-out rows.
print("Training Accuracy :", model.score(X_train, y_train))
print("Testing Accuracy :", model.score(X_test, y_test))
# Mean accuracy over 5 folds of the training split (the model is re-fitted per fold).
print("CV Score :", np.mean(cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)))
# For 0/1 labels the MAE equals the misclassification rate on the test split.
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=pred_test))

Training Accuracy : 1.0
Testing Accuracy : 1.0
CV Score : 0.996875
MAE : 0.0


In [40]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=pred_test)

array([[35,  0],
       [ 0, 45]], dtype=int64)

In [41]:
# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_true=y_test, y_pred=pred_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        35
           1       1.00      1.00      1.00        45

    accuracy                           1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80



# ***FINAL MODEL***
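- Logistic Regression, SVM, KNeighborsClassifier, RandomForestClassifier, AdaBoostClassifier and XGBClassifier all give 0.0 error on the test set, with 100% train and test accuracy.
- Only Logistic Regression, SVM and KNeighborsClassifier also reach a 100% cross-validation score; RandomForestClassifier (0.99375), AdaBoostClassifier (0.996875) and XGBClassifier (0.996875) fall slightly short. Logistic Regression is used as the final model below.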

In [42]:
# Final model: re-fit the default logistic regression so that `model` holds it
# (earlier cells reassigned `model` to other estimators).
model = LogisticRegression().fit(X_train, y_train)
pred_test = model.predict(X_test)

# Accuracy on the data the model was fitted on vs. the held-out rows.
print("Training Accuracy :", model.score(X_train, y_train))
print("Testing Accuracy :", model.score(X_test, y_test))
# Mean accuracy over 5 folds of the training split (the model is re-fitted per fold).
print("CV Score :", np.mean(cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)))
# For 0/1 labels the MAE equals the misclassification rate on the test split.
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=pred_test))

Training Accuracy : 1.0
Testing Accuracy : 1.0
CV Score : 1.0
MAE : 0.0


In [48]:
# Re-display the first five rows as a reference for the sample values used in the prediction below.
df.head(n=5)

Unnamed: 0,Med_1_mL,Med_2_mL,Virus Present
0,6.508231,8.582531,0
1,4.126116,3.073459,1
2,6.42787,6.369758,0
3,3.672953,4.905215,1
4,1.580321,2.440562,1


In [45]:
import pickle

# Persist the fitted model to disk so it can be reloaded without retraining.
with open("mouse_logistic_model.pkl", mode="wb") as model_file:
    pickle.dump(model, model_file)

In [46]:
# Read the pickled model back in. Unpickling can execute arbitrary code, so this
# should only ever be pointed at a file this notebook wrote itself.
with open("mouse_logistic_model.pkl", mode="rb") as model_file:
    loaded = pickle.load(model_file)

In [50]:
# One-row frame with the two medicine doses; the values match row 1 of the
# preview above, whose recorded label is 1.
final = pd.DataFrame(
    {
        "Med_1_mL": [4.126116],
        "Med_2_mL": [3.073459],
    },
    index=[0],
)

# predict() returns one label per row; take the label of the single row.
output = loaded.predict(final)[0]
print(
    "It seems virus is present."
    if output == 1
    else "Virus isn't present. You are safe !"
)

It seems virus is present.
