In [2]:
import numpy as np
import pandas as pd

# Install a blanket "ignore" filter so that no warning of any category is
# shown for the rest of the session.
# NOTE(review): this presumably also hides library notices that matter for the
# cells below (e.g. failed fits reported during grid search) — consider
# narrowing the filter to specific categories.
import warnings
warnings.simplefilter("ignore")

In [3]:
# Read the study CSV from the working directory, then preview its first rows
# as the cell output.
df = pd.read_csv(filepath_or_buffer="mouse_viral_study.csv")
df.head(n=5)

Unnamed: 0,Med_1_mL,Med_2_mL,Virus Present
0,6.508231,8.582531,0
1,4.126116,3.073459,1
2,6.42787,6.369758,0
3,3.672953,4.905215,1
4,1.580321,2.440562,1


In [4]:
# Separate the "Virus Present" label column (y) from the remaining
# columns, which are used as model inputs (X).
y = df["Virus Present"]
X = df.drop(columns=["Virus Present"])

# Train Test Split

In [5]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_absolute_error

***Finding Best Random State***

In [6]:
# Refit a default LogisticRegression on 100 differently seeded 80/20 splits
# and record, per seed: train accuracy, test accuracy, mean 5-fold CV accuracy
# (computed on the training part only) and the test-set MAE.
# NOTE(review): choosing a split seed by looking at test-set scores leaks test
# information into model selection; treat this table as a check of how stable
# the scores are across splits rather than as a way to pick a "best" seed.
train = []
test = []
cv = []
mae = []

for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = i)

    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Only test-set predictions are needed (for the MAE). The accuracies come
    # directly from model.score, so no train-set prediction is computed.
    pred_test = model.predict(X_test)

    train_accuracy = model.score(X_train, y_train)
    test_accuracy = model.score(X_test, y_test)

    mae_score = mean_absolute_error(y_test, pred_test)
    # cross_val_score fits clones of the estimator, so the fitted `model`
    # above is left untouched by this call.
    cv_score = cross_val_score(model, X_train, y_train, cv = 5).mean()

    train.append(train_accuracy)
    test.append(test_accuracy)
    cv.append(cv_score)
    mae.append(mae_score)

# One row per random_state (the row index is the seed that was used).
ff = pd.DataFrame({
    "Train Model": train,
    "Test Model": test,
    "CV": cv,
    "MAE": mae
})

In [7]:
ff

Unnamed: 0,Train Model,Test Model,CV,MAE
0,1.0,1.0,1.0,0.0
1,1.0,1.0,1.0,0.0
2,1.0,1.0,1.0,0.0
3,1.0,1.0,1.0,0.0
4,1.0,1.0,1.0,0.0
...,...,...,...,...
95,1.0,1.0,1.0,0.0
96,1.0,1.0,1.0,0.0
97,1.0,1.0,1.0,0.0
98,1.0,1.0,1.0,0.0


In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

# ***GridSearchCV - SVC***

In [9]:
# Grid-search an SVC over the regularisation strength C and the kernel type,
# using 5-fold cross-validated accuracy on the training split.
model = SVC()

# C must be strictly positive for SVC: a 0 entry would make every fit for that
# candidate fail, so the sweep uses 1 as its middle value (0.001 ... 100).
svc_param_grid = {"C": [0.001, 0.1, 1, 10, 100], "kernel": ["linear", "poly", "sigmoid", "rbf"]}

svc_grid = GridSearchCV(model, svc_param_grid, cv = 5, scoring = "accuracy")

svc_grid.fit(X_train, y_train)

# Last expression of the cell: displays the best parameter combination.
svc_grid.best_params_

{'C': 0.001, 'kernel': 'linear'}

# ***KNeighborsClassifier***

In [10]:
# Tune the neighbour count of a k-NN classifier (k = 1..19) with 5-fold
# cross-validated accuracy on the training split.
model = KNeighborsClassifier()
knn_param_grid = {"n_neighbors": [*range(1, 20)]}

knn_grid = GridSearchCV(
    estimator=model,
    param_grid=knn_param_grid,
    scoring="accuracy",
    cv=5,
)
knn_grid.fit(X_train, y_train)

# Last expression of the cell: displays the best parameter combination.
knn_grid.best_params_

{'n_neighbors': 1}

# ***DecisionTreeClassifier***

In [11]:
# Grid-search a decision tree over depth (1..19) and split criterion with
# 5-fold cross-validated accuracy on the training split.
# The estimator is seeded so that tie-breaking between equally good splits is
# repeatable and the selected parameters do not change between runs.
model = DecisionTreeClassifier(random_state = 0)

decision_param_grid = {"max_depth": list(range(1, 20)), "criterion": ["gini", "entropy"]}

decision_grid = GridSearchCV(model, decision_param_grid, cv = 5, scoring = "accuracy")

decision_grid.fit(X_train, y_train)

# Last expression of the cell: displays the best parameter combination.
decision_grid.best_params_

{'criterion': 'gini', 'max_depth': 2}

# ***RandomForestClassifier***

In [12]:
# Grid-search the number of trees (1..19) of a random forest with 5-fold
# cross-validated accuracy on the training split.
# A random forest bootstraps rows for every tree, so without a fixed seed the
# "best" n_estimators changes from run to run; seed it for reproducibility.
model = RandomForestClassifier(random_state = 0)

random_param_grid = {"n_estimators": list(range(1, 20))}

random_grid = GridSearchCV(model, random_param_grid, cv = 5, scoring = "accuracy")

random_grid.fit(X_train, y_train)

# Last expression of the cell: displays the best parameter combination.
random_grid.best_params_

{'n_estimators': 7}

# ***AdaBoostClassifier***

In [13]:
# Grid-search the number of boosting stages (1..19) of AdaBoost with 5-fold
# cross-validated accuracy on the training split.
# The ensemble is seeded so that repeated runs select the same parameters.
model = AdaBoostClassifier(random_state = 0)

ada_param_grid = {"n_estimators": list(range(1, 20))}

ada_grid = GridSearchCV(model, ada_param_grid, cv = 5, scoring = "accuracy")

ada_grid.fit(X_train, y_train)

# Last expression of the cell: displays the best parameter combination.
ada_grid.best_params_

{'n_estimators': 2}

# ***GradientBoostingClassifier***

In [14]:
# Grid-search gradient boosting over the number of stages (1..19) and the
# learning rate with 5-fold cross-validated accuracy on the training split.
# The estimator is seeded so that repeated runs select the same parameters.
model = GradientBoostingClassifier(random_state = 0)

# The learning-rate sweep starts at 0.1: with a rate of 0 no boosting stage
# changes the prediction (and some scikit-learn releases reject the value),
# so such a candidate would only waste fits.
gradient_param_grid = {"n_estimators": list(range(1, 20)), "learning_rate" : [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]}

gradient_grid = GridSearchCV(model, gradient_param_grid, cv = 5, scoring = "accuracy")

gradient_grid.fit(X_train, y_train)

# Last expression of the cell: displays the best parameter combination.
gradient_grid.best_params_

{'learning_rate': 0.1, 'n_estimators': 1}

# ***XGBClassifier***

In [15]:
# Grid-search XGBoost over the number of boosting rounds (1..19), the learning
# rate and gamma with 5-fold cross-validated accuracy on the training split.
model = XGBClassifier()

# The learning-rate sweep starts at 0.1: with a rate of 0 the boosting rounds
# contribute nothing to the prediction, so that candidate would only add
# wasted fits. gamma = 0 is kept (it is XGBoost's default value).
xgb_param_grid = {"n_estimators": list(range(1, 20)), "learning_rate" : [0.1, 0.2, 0.3, 0.4, 0.5],
                 "gamma": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}

xgb_grid = GridSearchCV(model, xgb_param_grid, cv = 5, scoring = "accuracy")

xgb_grid.fit(X_train, y_train)

# Last expression of the cell: displays the best parameter combination.
xgb_grid.best_params_

{'gamma': 0, 'learning_rate': 0.1, 'n_estimators': 17}