In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.simplefilter("ignore")

In [2]:
# Read the analysed gene-expression table and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="gene_expression_analyzed.csv")
df.head(5)

Unnamed: 0,Gene One,Gene Two,Cancer Present
0,4.3,3.9,1
1,2.5,6.3,0
2,5.7,3.9,1
3,6.1,6.2,0
4,7.4,3.4,1


In [3]:
# Features are every column except the label; the label is "Cancer Present".
X = df.drop(columns=["Cancer Present"])
y = df.loc[:, "Cancer Present"]

# Train Test Split

In [5]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_absolute_error

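***Finding Best Random State***

Fit a default logistic regression on 100 different 80/20 train/test splits (`random_state` 0–99) and record the train accuracy, test accuracy, mean 5-fold CV accuracy and MAE for each split.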

In [10]:
# One entry per split seed: train accuracy, test accuracy, mean 5-fold CV
# accuracy (computed on the training part only) and MAE on the test part.
train, test, cv, mae = [], [], [], []

for i in range(100):
    # Same 80/20 split every time; only the shuffling seed changes.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=i
    )

    model = LogisticRegression()
    model.fit(X_train, y_train)

    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)

    train_accuracy = model.score(X_train, y_train)
    train.append(train_accuracy)

    test_accuracy = model.score(X_test, y_test)
    test.append(test_accuracy)

    # cross_val_score refits clones of `model` on the training folds.
    cv_score = cross_val_score(model, X_train, y_train, cv=5).mean()
    cv.append(cv_score)

    # NOTE(review): in the displayed table MAE equals 1 - test accuracy,
    # consistent with a 0/1 label — confirm the label is strictly binary.
    mae_score = mean_absolute_error(y_test, pred_test)
    mae.append(mae_score)

# Row index of the summary frame is the random_state used for that split.
ff = pd.DataFrame(
    {"Train Model": train, "Test Model": test, "CV": cv, "MAE": mae}
)

In [18]:
# Per-split metrics; the row index is the random_state used for that split.
ff

Unnamed: 0,Train Model,Test Model,CV,MAE
0,0.841520,0.862150,0.839766,0.137850
1,0.843860,0.857477,0.840351,0.142523
2,0.850877,0.817757,0.849708,0.182243
3,0.849708,0.824766,0.847953,0.175234
4,0.839181,0.864486,0.840936,0.135514
...,...,...,...,...
95,0.846199,0.843458,0.845614,0.156542
96,0.851462,0.827103,0.849123,0.172897
97,0.843860,0.848131,0.843275,0.151869
98,0.844444,0.857477,0.844444,0.142523


In [19]:
# Fixed 80/20 hold-out split (seed 0) shared by the grid searches in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# ***GridSearchCV - SVC***

In [20]:
# Tune SVC's regularisation strength and kernel with 5-fold CV on the
# training split, selecting by accuracy.
model = SVC()

# C must be strictly positive. The previous grid contained 0, whose fits
# fail and are scored NaN by GridSearchCV (the warning is hidden because
# warnings are globally ignored). 1 is presumably the intended value in the
# 0.001 ... 100 progression.
svc_param_grid = {
    "C": [0.001, 0.1, 1, 10, 100],
    "kernel": ["linear", "poly", "sigmoid", "rbf"],
}

svc_grid = GridSearchCV(model, svc_param_grid, cv=5, scoring="accuracy")

svc_grid.fit(X_train, y_train)

svc_grid.best_params_

{'C': 10, 'kernel': 'rbf'}

# ***KNeighborsClassifier***

In [21]:
# Search k = 1..19 for k-nearest-neighbours with 5-fold CV, scored by accuracy.
model = KNeighborsClassifier()

knn_param_grid = dict(n_neighbors=[*range(1, 20)])

knn_grid = GridSearchCV(
    estimator=model,
    param_grid=knn_param_grid,
    cv=5,
    scoring="accuracy",
)

# fit() returns the search object itself, so the cell still displays best_params_.
knn_grid.fit(X_train, y_train).best_params_

{'n_neighbors': 17}

# ***DecisionTreeClassifier***

In [22]:
# Search tree depth 1..19 and both split criteria with 5-fold CV, scored by accuracy.
model = DecisionTreeClassifier()

decision_param_grid = dict(
    max_depth=[*range(1, 20)],
    criterion=["gini", "entropy"],
)

decision_grid = GridSearchCV(
    estimator=model,
    param_grid=decision_param_grid,
    cv=5,
    scoring="accuracy",
)

# fit() returns the search object itself, so the cell still displays best_params_.
decision_grid.fit(X_train, y_train).best_params_

{'criterion': 'entropy', 'max_depth': 6}

# ***RandomForestClassifier***

In [23]:
# Tune the number of trees (1..19) of a random forest with 5-fold CV,
# selecting by accuracy.
# The forest is seeded so that the search result is reproducible: without a
# random_state every run draws different bootstrap samples, and with this few
# trees the "best" n_estimators can change from run to run. Seed 0 matches the
# seed used for the train/test split.
model = RandomForestClassifier(random_state=0)

random_param_grid = {"n_estimators": list(range(1, 20))}

random_grid = GridSearchCV(model, random_param_grid, cv=5, scoring="accuracy")

random_grid.fit(X_train, y_train)

random_grid.best_params_

{'n_estimators': 9}

# ***AdaBoostClassifier***

In [24]:
# Search 1..19 boosting rounds for AdaBoost with 5-fold CV, scored by accuracy.
model = AdaBoostClassifier()

ada_param_grid = dict(n_estimators=[*range(1, 20)])

ada_grid = GridSearchCV(
    estimator=model,
    param_grid=ada_param_grid,
    cv=5,
    scoring="accuracy",
)

# fit() returns the search object itself, so the cell still displays best_params_.
ada_grid.fit(X_train, y_train).best_params_

{'n_estimators': 16}

# ***GradientBoostingClassifier***

In [26]:
# Tune the number of boosting stages (1..19) and the learning rate of a
# gradient-boosting classifier with 5-fold CV, selecting by accuracy.
model = GradientBoostingClassifier()

# learning_rate=0 was removed from the grid: it scales every stage's
# contribution to nothing (or is rejected outright, depending on the
# scikit-learn version), so those 19 candidates x 5 folds were wasted work
# that could never be selected.
gradient_param_grid = {
    "n_estimators": list(range(1, 20)),
    "learning_rate": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
}

gradient_grid = GridSearchCV(model, gradient_param_grid, cv=5, scoring="accuracy")

gradient_grid.fit(X_train, y_train)

gradient_grid.best_params_

{'learning_rate': 0.2, 'n_estimators': 14}

# ***XGBClassifier***

In [27]:
# Tune the number of boosting rounds (1..19), learning rate and gamma of an
# XGBoost classifier with 5-fold CV, selecting by accuracy.
model = XGBClassifier()

# learning_rate=0 was removed from the grid: with a zero step size no boosting
# round changes the prediction, so those candidates (19 x 6 gamma values x
# 5 folds) were wasted work that could never be a meaningful winner.
xgb_param_grid = {
    "n_estimators": list(range(1, 20)),
    "learning_rate": [0.1, 0.2, 0.3, 0.4, 0.5],
    "gamma": [0, 0.1, 0.2, 0.3, 0.4, 0.5],
}

xgb_grid = GridSearchCV(model, xgb_param_grid, cv=5, scoring="accuracy")

xgb_grid.fit(X_train, y_train)

xgb_grid.best_params_

{'gamma': 0.3, 'learning_rate': 0.5, 'n_estimators': 9}