In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter("ignore")

In [3]:
# Load the gene-expression table from the CSV next to the notebook (relative path).
df = pd.read_csv(filepath_or_buffer="gene_expression_analyzed.csv")

# Preview the first five rows to check how the columns were parsed.
df.head(n=5)

Unnamed: 0,Gene One,Gene Two,Cancer Present
0,4.3,3.9,1
1,2.5,6.3,0
2,5.7,3.9,1
3,6.1,6.2,0
4,7.4,3.4,1


In [4]:
# Features: every column except the target and the CSV's saved row index.
# "Unnamed: 0" appears to be just the row counter written out with the file
# (0, 1, 2, ... in the preview). Keeping it would fit the models on three
# columns, while the single-sample prediction at the end of the notebook
# supplies only "Gene One" and "Gene Two", so predict() would fail on a
# feature mismatch after a fresh Restart & Run All.
# errors="ignore" keeps this cell working if the index column is absent
# (e.g. the CSV is later read with index_col=0).
X = df.drop(columns=["Unnamed: 0", "Cancer Present"], errors="ignore")

# Target: the 0/1 "Cancer Present" label.
y = df["Cancer Present"]

# ***Libraries***

In [5]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_absolute_error

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# ***Train_Test_Split***

In [46]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# ***LogisticRegression***

In [47]:
# Logistic regression with the estimator's default settings.
model = LogisticRegression()
model.fit(X_train, y_train)

# Test-set predictions; the confusion-matrix / report cells read `pred_test`.
pred_test = model.predict(X_test)

# Score on both splits, mean of the 5 cross-validation scores on the training
# data, and MAE between true and predicted test labels.
print("Training Accuracy :", model.score(X=X_train, y=y_train))
print("Testing Accuracy :", model.score(X=X_test, y=y_test))
print(
    "CV Score :",
    cross_val_score(estimator=model, X=X_train, y=y_train, cv=5).mean(),
)
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=pred_test))

Training Accuracy : 0.8415204678362573
Testing Accuracy : 0.8621495327102804
CV Score : 0.839766081871345
MAE : 0.1378504672897196


In [9]:
# Confusion matrix for the current `pred_test` (logistic regression when the
# notebook is run top to bottom).
confusion_matrix(y_true=y_test, y_pred=pred_test)

array([[189,  21],
       [ 46, 172]], dtype=int64)

In [10]:
# Per-class precision / recall / F1 for the current `pred_test`.
print(classification_report(y_true=y_test, y_pred=pred_test))

              precision    recall  f1-score   support

           0       0.80      0.90      0.85       210
           1       0.89      0.79      0.84       218

    accuracy                           0.84       428
   macro avg       0.85      0.84      0.84       428
weighted avg       0.85      0.84      0.84       428



# ***SVM - SVC***

In [11]:
# Support-vector classifier: RBF kernel, regularisation parameter C = 10.
model = SVC(kernel="rbf", C=10)
model.fit(X_train, y_train)

# Test-set predictions; the confusion-matrix / report cells read `pred_test`.
pred_test = model.predict(X_test)

# Score on both splits, mean of the 5 cross-validation scores on the training
# data, and MAE between true and predicted test labels.
print("Training Accuracy :", model.score(X=X_train, y=y_train))
print("Testing Accuracy :", model.score(X=X_test, y=y_test))
print(
    "CV Score :",
    cross_val_score(estimator=model, X=X_train, y=y_train, cv=5).mean(),
)
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=pred_test))

Training Accuracy : 0.9198830409356725
Testing Accuracy : 0.9158878504672897
CV Score : 0.9192982456140351
MAE : 0.08411214953271028


In [12]:
# Confusion matrix for the current `pred_test` (the SVC predictions when the
# notebook is run top to bottom).
confusion_matrix(y_true=y_test, y_pred=pred_test)

array([[201,   9],
       [ 27, 191]], dtype=int64)

In [13]:
# Per-class precision / recall / F1 for the current `pred_test`.
print(classification_report(y_true=y_test, y_pred=pred_test))

              precision    recall  f1-score   support

           0       0.88      0.96      0.92       210
           1       0.95      0.88      0.91       218

    accuracy                           0.92       428
   macro avg       0.92      0.92      0.92       428
weighted avg       0.92      0.92      0.92       428



# ***KNeighborsClassifier***

In [14]:
# k-nearest-neighbours classifier voting over 17 neighbours.
model = KNeighborsClassifier(n_neighbors=17)
model.fit(X_train, y_train)

# Test-set predictions; the confusion-matrix / report cells read `pred_test`.
pred_test = model.predict(X_test)

# Score on both splits, mean of the 5 cross-validation scores on the training
# data, and MAE between true and predicted test labels.
print("Training Accuracy :", model.score(X=X_train, y=y_train))
print("Testing Accuracy :", model.score(X=X_test, y=y_test))
print(
    "CV Score :",
    cross_val_score(estimator=model, X=X_train, y=y_train, cv=5).mean(),
)
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=pred_test))

Training Accuracy : 0.9239766081871345
Testing Accuracy : 0.9088785046728972
CV Score : 0.9169590643274853
MAE : 0.0911214953271028


In [15]:
# Confusion matrix for the current `pred_test` (the k-NN predictions when the
# notebook is run top to bottom).
confusion_matrix(y_true=y_test, y_pred=pred_test)

array([[199,  11],
       [ 28, 190]], dtype=int64)

In [16]:
# Per-class precision / recall / F1 for the current `pred_test`.
print(classification_report(y_true=y_test, y_pred=pred_test))

              precision    recall  f1-score   support

           0       0.88      0.95      0.91       210
           1       0.95      0.87      0.91       218

    accuracy                           0.91       428
   macro avg       0.91      0.91      0.91       428
weighted avg       0.91      0.91      0.91       428



# ***DecisionTreeClassifier***

In [17]:
# Single decision tree: entropy split criterion, depth capped at 6, fixed seed.
model = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=6,
    random_state=1,
)
model.fit(X_train, y_train)

# Test-set predictions; the confusion-matrix / report cells read `pred_test`.
pred_test = model.predict(X_test)

# Score on both splits, mean of the 5 cross-validation scores on the training
# data, and MAE between true and predicted test labels.
print("Training Accuracy :", model.score(X=X_train, y=y_train))
print("Testing Accuracy :", model.score(X=X_test, y=y_test))
print(
    "CV Score :",
    cross_val_score(estimator=model, X=X_train, y=y_train, cv=5).mean(),
)
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=pred_test))

Training Accuracy : 0.9128654970760234
Testing Accuracy : 0.8714953271028038
CV Score : 0.9134502923976608
MAE : 0.12850467289719625


In [18]:
# Confusion matrix for the current `pred_test` (the decision-tree predictions
# when the notebook is run top to bottom).
confusion_matrix(y_true=y_test, y_pred=pred_test)

array([[195,  15],
       [ 40, 178]], dtype=int64)

In [19]:
# Per-class precision / recall / F1 for the current `pred_test`.
print(classification_report(y_true=y_test, y_pred=pred_test))

              precision    recall  f1-score   support

           0       0.83      0.93      0.88       210
           1       0.92      0.82      0.87       218

    accuracy                           0.87       428
   macro avg       0.88      0.87      0.87       428
weighted avg       0.88      0.87      0.87       428



# ***RandomForestClassifier***

In [20]:
# Random forest of 9 trees with a fixed seed.
model = RandomForestClassifier(random_state=0, n_estimators=9)
model.fit(X_train, y_train)

# Test-set predictions; the confusion-matrix / report cells read `pred_test`.
pred_test = model.predict(X_test)

# Score on both splits, mean of the 5 cross-validation scores on the training
# data, and MAE between true and predicted test labels.
print("Training Accuracy :", model.score(X=X_train, y=y_train))
print("Testing Accuracy :", model.score(X=X_test, y=y_test))
print(
    "CV Score :",
    cross_val_score(estimator=model, X=X_train, y=y_train, cv=5).mean(),
)
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=pred_test))

Training Accuracy : 0.9643274853801169
Testing Accuracy : 0.8574766355140186
CV Score : 0.8859649122807017
MAE : 0.1425233644859813


In [21]:
# Confusion matrix for the current `pred_test` (the random-forest predictions
# when the notebook is run top to bottom).
confusion_matrix(y_true=y_test, y_pred=pred_test)

array([[190,  20],
       [ 41, 177]], dtype=int64)

In [22]:
# Per-class precision / recall / F1 for the current `pred_test`.
print(classification_report(y_true=y_test, y_pred=pred_test))

              precision    recall  f1-score   support

           0       0.82      0.90      0.86       210
           1       0.90      0.81      0.85       218

    accuracy                           0.86       428
   macro avg       0.86      0.86      0.86       428
weighted avg       0.86      0.86      0.86       428



# ***AdaBoostClassifier***

In [23]:
# AdaBoost ensemble with n_estimators = 16 and a fixed seed.
model = AdaBoostClassifier(random_state=0, n_estimators=16)
model.fit(X_train, y_train)

# Test-set predictions; the confusion-matrix / report cells read `pred_test`.
pred_test = model.predict(X_test)

# Score on both splits, mean of the 5 cross-validation scores on the training
# data, and MAE between true and predicted test labels.
print("Training Accuracy :", model.score(X=X_train, y=y_train))
print("Testing Accuracy :", model.score(X=X_test, y=y_test))
print(
    "CV Score :",
    cross_val_score(estimator=model, X=X_train, y=y_train, cv=5).mean(),
)
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=pred_test))

Training Accuracy : 0.9111111111111111
Testing Accuracy : 0.8855140186915887
CV Score : 0.9005847953216375
MAE : 0.11448598130841121


In [24]:
# Confusion matrix for the current `pred_test` (the AdaBoost predictions when
# the notebook is run top to bottom).
confusion_matrix(y_true=y_test, y_pred=pred_test)

array([[200,  10],
       [ 39, 179]], dtype=int64)

In [25]:
# Per-class precision / recall / F1 for the current `pred_test`.
print(classification_report(y_true=y_test, y_pred=pred_test))

              precision    recall  f1-score   support

           0       0.84      0.95      0.89       210
           1       0.95      0.82      0.88       218

    accuracy                           0.89       428
   macro avg       0.89      0.89      0.89       428
weighted avg       0.89      0.89      0.89       428



# ***GradientBoostingClassifier***

In [26]:
# Gradient boosting: 14 estimators, learning rate 0.2, fixed seed.
model = GradientBoostingClassifier(
    learning_rate=0.2,
    n_estimators=14,
    random_state=0,
)
model.fit(X_train, y_train)

# Test-set predictions; the confusion-matrix / report cells read `pred_test`.
pred_test = model.predict(X_test)

# Score on both splits, mean of the 5 cross-validation scores on the training
# data, and MAE between true and predicted test labels.
print("Training Accuracy :", model.score(X=X_train, y=y_train))
print("Testing Accuracy :", model.score(X=X_test, y=y_test))
print(
    "CV Score :",
    cross_val_score(estimator=model, X=X_train, y=y_train, cv=5).mean(),
)
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=pred_test))

Training Accuracy : 0.9228070175438596
Testing Accuracy : 0.9182242990654206
CV Score : 0.9152046783625731
MAE : 0.08177570093457943


In [27]:
# Confusion matrix for the current `pred_test` (the gradient-boosting
# predictions when the notebook is run top to bottom).
confusion_matrix(y_true=y_test, y_pred=pred_test)

array([[199,  11],
       [ 24, 194]], dtype=int64)

In [28]:
# Per-class precision / recall / F1 for the current `pred_test`.
print(classification_report(y_true=y_test, y_pred=pred_test))

              precision    recall  f1-score   support

           0       0.89      0.95      0.92       210
           1       0.95      0.89      0.92       218

    accuracy                           0.92       428
   macro avg       0.92      0.92      0.92       428
weighted avg       0.92      0.92      0.92       428



# ***XGBClassifier***

In [29]:
# XGBoost classifier: 9 estimators, learning rate 0.3, gamma 0.5, fixed seed.
model = XGBClassifier(
    gamma=0.5,
    learning_rate=0.3,
    n_estimators=9,
    random_state=0,
)
model.fit(X_train, y_train)

# Test-set predictions; the confusion-matrix / report cells read `pred_test`.
pred_test = model.predict(X_test)

# Score on both splits, mean of the 5 cross-validation scores on the training
# data, and MAE between true and predicted test labels.
print("Training Accuracy :", model.score(X=X_train, y=y_train))
print("Testing Accuracy :", model.score(X=X_test, y=y_test))
print(
    "CV Score :",
    cross_val_score(estimator=model, X=X_train, y=y_train, cv=5).mean(),
)
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=pred_test))

Training Accuracy : 0.9292397660818713
Testing Accuracy : 0.897196261682243
CV Score : 0.9140350877192983
MAE : 0.102803738317757


In [30]:
# Confusion matrix for the current `pred_test` (the XGBoost predictions when
# the notebook is run top to bottom).
confusion_matrix(y_true=y_test, y_pred=pred_test)

array([[202,   8],
       [ 36, 182]], dtype=int64)

In [31]:
# Per-class precision / recall / F1 for the current `pred_test`.
print(classification_report(y_true=y_test, y_pred=pred_test))

              precision    recall  f1-score   support

           0       0.85      0.96      0.90       210
           1       0.96      0.83      0.89       218

    accuracy                           0.90       428
   macro avg       0.90      0.90      0.90       428
weighted avg       0.90      0.90      0.90       428



# ***FINAL MODEL***

In [48]:
# Final model: the same gradient-boosting configuration evaluated in the
# GradientBoostingClassifier section (14 estimators, learning rate 0.2, seed 0).
# The pickle cell below persists whatever `model` holds after this cell.
model = GradientBoostingClassifier(
    learning_rate=0.2,
    n_estimators=14,
    random_state=0,
)
model.fit(X_train, y_train)

# Test-set predictions, used for the MAE line below.
pred_test = model.predict(X_test)

# Score on both splits, mean of the 5 cross-validation scores on the training
# data, and MAE between true and predicted test labels.
print("Training Accuracy :", model.score(X=X_train, y=y_train))
print("Testing Accuracy :", model.score(X=X_test, y=y_test))
print(
    "CV Score :",
    cross_val_score(estimator=model, X=X_train, y=y_train, cv=5).mean(),
)
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=pred_test))

Training Accuracy : 0.92046783625731
Testing Accuracy : 0.9322429906542056
CV Score : 0.9140350877192981
MAE : 0.06775700934579439


In [37]:
# Show one row as a reminder of the column names before building a manual sample.
df.head(n=1)

Unnamed: 0,Gene One,Gene Two,Cancer Present
0,4.3,3.9,1


In [34]:
import pickle

# Serialise the fitted `model` to disk so it can be reloaded without retraining.
with open("gene_gradient_model.pkl", mode="wb") as model_file:
    pickle.dump(model, model_file)

In [35]:
# Read the pickled model back into `loaded`.
# NOTE: unpickling executes code from the file, so only load pickles you trust.
with open("gene_gradient_model.pkl", mode="rb") as model_file:
    loaded = pickle.load(model_file)

In [45]:
# Build a one-row frame for a hand-entered sample. The column names must match
# the feature columns the reloaded model was fitted on.
final = pd.DataFrame([{"Gene One": 5.3, "Gene Two": 8.9}])

# predict() returns one label per row; take the label of the single row.
output = loaded.predict(final)[0]

if output == 1:
    # Plain string literal: the original f-string had no placeholders.
    print("It seems cancer is present.")
else:
    print("Cancer isn't present. You are safe !")

Cancer isn't present. You are safe !
