In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn import tree
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
# Normalizing data
df= pd.read_csv('Processed_data.csv')
scaler= StandardScaler()
scale_pararms=[float(df['AGE'].mean()), float(df['AGE'].std())]
df['AGE']= scaler.fit_transform(df[['AGE']])
df

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,0,0.771850,1,2,1,1,2,1,2,2,2,2,2,2,1
1,0,1.381829,2,1,1,2,2,2,1,1,1,2,2,2,1
2,1,-0.448107,1,1,2,1,2,1,2,1,2,2,1,2,0
3,0,0.039876,2,2,1,1,1,1,1,2,1,1,2,2,0
4,1,0.039876,1,2,1,1,1,1,2,1,2,2,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,1,-0.814095,1,1,2,2,2,1,1,2,2,2,2,1,1
305,0,0.893846,2,1,1,1,2,2,2,2,2,2,1,2,1
306,0,-0.570103,2,1,1,1,1,2,2,2,2,1,1,2,1
307,0,0.527859,2,1,1,1,2,2,1,2,2,2,1,2,1


In [5]:
# Dealing with data imbalance
X_arr= df.drop(columns='LUNG_CANCER')
Y_arr= df['LUNG_CANCER']
X_oversample, Y_oversample= RandomOverSampler().fit_resample(X_arr, Y_arr)
X_oversample.shape, Y_oversample.shape

((540, 14), (540,))

In [14]:
# Training models Logistic reg, Decision Trees and Random Forest
X_train, X_test, Y_train, Y_test= train_test_split(X_oversample, Y_oversample, test_size=0.2)

Model1= LogisticRegression()
Model2= DecisionTreeClassifier()
Model3= RandomForestClassifier()

par_M1= {'max_iter':[100,500,1000,2000],'C':[0.1,0.5,1,5,10,50,100]}
par_M2= {'criterion':['gini','entropy','log_loss'], 'max_depth':[1,3,5,10,20],'ccp_alpha':[0,0.1,0.5,1,5,10,20]}
par_M3= {'criterion':['gini','entropy','log_loss'], 'max_depth':[1,3,5,10,20],'ccp_alpha':[0,0.1,0.5,1,5,10,20],'n_estimators':[10,50,100,200]}

cv_M1= GridSearchCV(Model1,param_grid=par_M1, cv=5, scoring='accuracy')
cv_M2= GridSearchCV(Model2, param_grid=par_M2,cv=5, scoring='accuracy')
cv_M3= GridSearchCV(Model3, param_grid=par_M3,cv=5, scoring='accuracy')



In [10]:
# Logistic Reg
cv_M1.fit(X_train, Y_train)
pred_M1=cv_M1.predict(X_test)
print("accuracy score",accuracy_score(Y_test, pred_M1))
print(classification_report(Y_test, pred_M1))
print("best_cv score", cv_M1.best_score_)

accuracy score 0.8796296296296297
              precision    recall  f1-score   support

           0       0.88      0.92      0.90        61
           1       0.89      0.83      0.86        47

    accuracy                           0.88       108
   macro avg       0.88      0.87      0.88       108
weighted avg       0.88      0.88      0.88       108

best_cv score 0.9027800053461641


In [11]:
# Decision Trees
cv_M2.fit(X_train, Y_train)
pred_M2=cv_M2.predict(X_test)
print("accuracy score",accuracy_score(Y_test, pred_M2))
print(classification_report(Y_test, pred_M2))
print("best_cv score", cv_M2.best_score_)


accuracy score 0.9537037037037037
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        61
           1       1.00      0.89      0.94        47

    accuracy                           0.95       108
   macro avg       0.96      0.95      0.95       108
weighted avg       0.96      0.95      0.95       108

best_cv score 0.9582999198075381


In [15]:
# Random Forest
cv_M3.fit(X_train, Y_train)
pred_M3=cv_M3.predict(X_test)
print("accuracy score",accuracy_score(Y_test, pred_M3))
print(classification_report(Y_test, pred_M3))
print("best_cv score", cv_M3.best_score_)


accuracy score 0.9722222222222222
              precision    recall  f1-score   support

           0       0.95      1.00      0.97        52
           1       1.00      0.95      0.97        56

    accuracy                           0.97       108
   macro avg       0.97      0.97      0.97       108
weighted avg       0.97      0.97      0.97       108

best_cv score 0.9675220529270249


In [None]:
import pickle
file=open("Final_model.pickle",'w') 
file.close()
file= open("Final_model.pickle",'wb')
# using the Random Forest algorithm
pickle.dump(cv_M3,file)