In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split 
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pickle
import matplotlib.pyplot as plt
from xgboost import XGBRegressor

In [2]:
dataset = pd.read_csv("Pre_Lungcancer.csv", index_col=None)
df1 = dataset
df1 = pd.get_dummies(df1, drop_first=True)

indep_X = df1[['GENOMIC SEX', 'AGE', 'BLOOD_TYPE', 'NUMBER_OF_SIBLINGS',
       'PARENT_ALIVE', 'SMOKING_STATUS', 'DAILY_CIGARETTES', 'YELLOW_SKIN',
       'ANXIETY', 'PEER_PRESSURE', 'COPD_DIAGNOSES', 'FATIGUE', 'ALLERGY',
       'WHEEZING', 'ALCOHOL_CONSUMPTION', 'COUGHING', 'SHORTNESS_OF_BREATH',
       'SWALLOWING_DIFFICULTY', 'CHEST_PAIN']]
dep_Y = df1['LUNG_CANCER']

In [3]:
dataset

Unnamed: 0.1,Unnamed: 0,GENOMIC SEX,AGE,BLOOD_TYPE,NUMBER_OF_SIBLINGS,PARENT_ALIVE,SMOKING_STATUS,DAILY_CIGARETTES,YELLOW_SKIN,ANXIETY,...,COPD_DIAGNOSES,FATIGUE,ALLERGY,WHEEZING,ALCOHOL_CONSUMPTION,COUGHING,SHORTNESS_OF_BREATH,SWALLOWING_DIFFICULTY,CHEST_PAIN,LUNG_CANCER
0,0,1.0,69,0.0,5.0,3,1,17.0,2,2,...,1.0,2.0,1,2.0,2,2,2.0,2,2,2.0
1,1,1.0,74,1.0,1.0,2,2,29.0,1,1,...,2.0,2.0,2,1.0,1,1,2.0,2,2,2.0
2,2,0.0,59,2.0,0.0,2,1,17.0,1,1,...,1.0,2.0,1,2.0,1,2,2.0,1,2,1.0
3,3,1.0,63,3.0,3.0,2,2,20.0,2,2,...,1.0,1.0,1,1.0,2,1,1.0,2,2,1.0
4,4,0.0,63,0.0,4.0,1,1,17.0,2,1,...,1.0,1.0,1,2.0,1,2,2.0,1,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1115,1115,0.0,57,0.0,3.0,3,1,17.0,1,1,...,2.0,2.0,1,2.0,2,2,1.0,1,1,1.0
1116,1116,0.0,51,0.0,2.0,2,2,21.0,1,2,...,1.0,1.0,1,1.0,1,1,2.0,1,2,1.0
1117,1117,0.0,65,0.0,1.0,1,1,17.0,2,1,...,1.0,2.0,1,2.0,1,2,1.0,1,1,1.0
1118,1118,0.0,57,0.0,2.0,1,2,21.0,1,2,...,2.0,2.0,1,2.0,1,2,1.0,1,1,1.0


In [4]:
dataset.columns

Index(['Unnamed: 0', 'GENOMIC SEX', 'AGE', 'BLOOD_TYPE', 'NUMBER_OF_SIBLINGS',
       'PARENT_ALIVE', 'SMOKING_STATUS', 'DAILY_CIGARETTES', 'YELLOW_SKIN',
       'ANXIETY', 'PEER_PRESSURE', 'COPD_DIAGNOSES', 'FATIGUE', 'ALLERGY',
       'WHEEZING', 'ALCOHOL_CONSUMPTION', 'COUGHING', 'SHORTNESS_OF_BREATH',
       'SWALLOWING_DIFFICULTY', 'CHEST_PAIN', 'LUNG_CANCER'],
      dtype='object')

In [5]:
# Drop all unnamed columns
df1.drop([col for col in df1.columns if 'Unnamed: 0' in col], axis=1, inplace=True)
df1

Unnamed: 0,GENOMIC SEX,AGE,BLOOD_TYPE,NUMBER_OF_SIBLINGS,PARENT_ALIVE,SMOKING_STATUS,DAILY_CIGARETTES,YELLOW_SKIN,ANXIETY,PEER_PRESSURE,COPD_DIAGNOSES,FATIGUE,ALLERGY,WHEEZING,ALCOHOL_CONSUMPTION,COUGHING,SHORTNESS_OF_BREATH,SWALLOWING_DIFFICULTY,CHEST_PAIN,LUNG_CANCER
0,1.0,69,0.0,5.0,3,1,17.0,2,2,1,1.0,2.0,1,2.0,2,2,2.0,2,2,2.0
1,1.0,74,1.0,1.0,2,2,29.0,1,1,1,2.0,2.0,2,1.0,1,1,2.0,2,2,2.0
2,0.0,59,2.0,0.0,2,1,17.0,1,1,2,1.0,2.0,1,2.0,1,2,2.0,1,2,1.0
3,1.0,63,3.0,3.0,2,2,20.0,2,2,1,1.0,1.0,1,1.0,2,1,1.0,2,2,1.0
4,0.0,63,0.0,4.0,1,1,17.0,2,1,1,1.0,1.0,1,2.0,1,2,2.0,1,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1115,0.0,57,0.0,3.0,3,1,17.0,1,1,1,2.0,2.0,1,2.0,2,2,1.0,1,1,1.0
1116,0.0,51,0.0,2.0,2,2,21.0,1,2,1,1.0,1.0,1,1.0,1,1,2.0,1,2,1.0
1117,0.0,65,0.0,1.0,1,1,17.0,2,1,1,1.0,2.0,1,2.0,1,2,1.0,1,1,1.0
1118,0.0,57,0.0,2.0,1,2,21.0,1,2,1,2.0,2.0,1,2.0,1,2,1.0,1,1,1.0


In [6]:
# Display the initial number of rows
print(f"Initial number of rows: {dataset.shape[0]}")

# Remove duplicate rows
dataset_cleaned = dataset.drop_duplicates()

Initial number of rows: 1120


In [7]:
# Display the number of rows after removing duplicates
print(f"Number of rows after removing duplicates: {dataset_cleaned.shape[0]}")

# Optionally, reset the index if needed
dataset_cleaned.reset_index(drop=True, inplace=True)

Number of rows after removing duplicates: 1120


In [8]:
def split_scalar(indep_X,dep_Y):
    X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size = 0.25, random_state = 0)
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)    
    return X_train, X_test, y_train, y_test

In [9]:
def r2_prediction(regressor,X_test,y_test):
    y_pred = regressor.predict(X_test)
    from sklearn.metrics import r2_score
    r2=r2_score(y_test,y_pred)
    return r2

In [10]:
def Linear(X_train,y_train,X_test):       
    from sklearn.linear_model import LinearRegression
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    r2=r2_prediction(regressor,X_test,y_test)
    return  r2 

In [11]:
def Decision(X_train,y_train,X_test):
    from sklearn.tree import DecisionTreeRegressor
    regressor = DecisionTreeRegressor(random_state = 0)
    regressor.fit(X_train, y_train)
    r2=r2_prediction(regressor,X_test,y_test)
    return  r2 

In [12]:
def random(X_train,y_train,X_test):       
    from sklearn.ensemble import RandomForestRegressor
    regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
    regressor.fit(X_train, y_train)
    r2=r2_prediction(regressor,X_test,y_test)
    return  r2

In [13]:
def xgboost(X_train,y_train,X_test):       
    from xgboost import XGBRegressor
    regressor = XGBRegressor(n_jobs=5,learning_rate=0.1,max_depth=10,random_state=1)
    regressor.fit(X_train, y_train)
    r2=r2_prediction(regressor,X_test,y_test)
    return r2

In [14]:
def rfeFeature(indep_X, dep_Y, n):
    rfelist = []
    colnames_list = []  
    # List to store column names for each model
    r2_values = []  
    # List to store R2 values for each model

    from sklearn.linear_model import LinearRegression
    lin = LinearRegression()

    from sklearn.tree import DecisionTreeRegressor
    dec = DecisionTreeRegressor(random_state=0)

    from sklearn.ensemble import RandomForestRegressor
    rf = RandomForestRegressor(n_estimators=10, random_state=0)

    from xgboost import XGBRegressor
    xgb = XGBRegressor(n_jobs=5, learning_rate=0.1, max_depth=10, random_state=1)

    rfemodellist = [lin, dec, rf, xgb]

    for model in rfemodellist:
        log_rfe = RFE(estimator=model, n_features_to_select=n)
        log_fit = log_rfe.fit(indep_X, dep_Y)
        log_rfe_feature = log_fit.transform(indep_X)
        rfelist.append(log_rfe_feature)

        # Get the column names selected by RFE
        selected_columns = [col for col, selected in zip(indep_X.columns, log_rfe.support_) if selected]
        colnames_list.append(selected_columns)

        # Fit the model and calculate and store the R2 value
        X_train, X_test, y_train, y_test = split_scalar(pd.DataFrame(log_rfe_feature), dep_Y)
        model.fit(X_train, y_train)  # Fit the model
        r2 = r2_prediction(model, X_test, y_test)
        r2_values.append(r2)

    return rfelist, colnames_list, r2_values

# Call the function with your data
rfelist, colnames_list, r2_values = rfeFeature(indep_X, dep_Y, 5)

# Print the selected column names and R2 values for each model
for model_name, selected_columns, r2_value in zip(["Linear", "Decision", "Random", "XGBoost"], colnames_list, r2_values):
    print(f"Model: {model_name}")
    print("Selected Columns:", selected_columns)
    print(f"R2 Value: {r2_value}\n")

Model: Linear
Selected Columns: ['ANXIETY', 'FATIGUE', 'ALLERGY', 'ALCOHOL_CONSUMPTION', 'SWALLOWING_DIFFICULTY']
R2 Value: 0.10275924009396886

Model: Decision
Selected Columns: ['AGE', 'NUMBER_OF_SIBLINGS', 'PARENT_ALIVE', 'DAILY_CIGARETTES', 'SWALLOWING_DIFFICULTY']
R2 Value: -0.47690452867153943

Model: Random
Selected Columns: ['AGE', 'NUMBER_OF_SIBLINGS', 'PARENT_ALIVE', 'DAILY_CIGARETTES', 'PEER_PRESSURE']
R2 Value: 0.01297978047371462

Model: XGBoost
Selected Columns: ['AGE', 'COPD_DIAGNOSES', 'ALLERGY', 'SHORTNESS_OF_BREATH', 'SWALLOWING_DIFFICULTY']
R2 Value: 0.1861826865259968



# Model creation

# SVM - Linear(Kernel)

In [15]:
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score, roc_curve

# Load and preprocess the dataset
dataset = pd.read_csv("Pre_Lungcancer.csv")

indep_X = df1[['GENOMIC SEX', 'AGE', 'BLOOD_TYPE', 'NUMBER_OF_SIBLINGS',
       'PARENT_ALIVE', 'SMOKING_STATUS', 'DAILY_CIGARETTES', 'YELLOW_SKIN',
       'ANXIETY', 'PEER_PRESSURE', 'COPD_DIAGNOSES', 'FATIGUE', 'ALLERGY',
       'WHEEZING', 'ALCOHOL_CONSUMPTION', 'COUGHING', 'SHORTNESS_OF_BREATH',
       'SWALLOWING_DIFFICULTY', 'CHEST_PAIN']]
dep_Y = df1['LUNG_CANCER']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size=0.25, random_state=0)

# Set up the parameter grid for SVM classifier
param_grid = {'kernel': ['linear'], 'C': [0.1, 1, 10, 100]}

# Perform grid search with cross-validation
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3, n_jobs=-1)
grid.fit(X_train, y_train)

# Predict on test set
y_predict = grid.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_predict)
conf_matrix = confusion_matrix(y_test, y_predict)
clf_report = classification_report(y_test, y_predict)
#roc_auc = roc_auc_score(y_test, y_prob)  # Calculate AUROC

print("Best parameters found by GridSearchCV:", grid.best_params_)
print("Accuracy on test set:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", clf_report)
#print("AUROC score on test set:", roc_auc)

# Save the best model
filename = "SVM_Lungcancer_finalmodel.sav"
pickle.dump(grid, open(filename, 'wb'))

# Load the saved model
loaded_model = pickle.load(open(filename, 'rb'))

def get_reg_prediction(prediction):
    return "Yes" if prediction >= 1 else "No"

# Example input values
GENOMIC_SEX = int(input("Gender (0 for F, 1 for M): "))
AGE = float(input("Age: "))
BLOOD_TYPE = float(input("BLOOD_TYPE : "))
NUMBER_OF_SIBLINGS = int(input("FamilyHistory LC: "))
PARENT_ALIVE = int(input("PARENT_DeathwithReason: "))
SMOKING_STATUS = float(input("SMOKING_STATUS: "))
DAILY_CIGARETTES = float(input("DAILY_CIGARETTES: "))
YELLOW_SKIN = float(input("YELLOW_SKIN: "))
ANXIETY = float(input("ANXIETY: "))
PEER_PRESSURE = float(input("PEER_PRESSURE: "))
COPD_DIAGNOSES = float(input("COPD_DIAGNOSES: "))
FATIGUE = float(input("FATIGUE: "))
ALLERGY = float(input("ALLERGY: "))
WHEEZING = float(input("WHEEZING: "))
ALCOHOL_CONSUMPTION = float(input("ALCOHOL_CONSUMPTION: "))
COUGHING = float(input("COUGHING: "))
SHORTNESS_OF_BREATH = float(input("SHORTNESS_OF_BREATH: "))
SWALLOWING_DIFFICULTY = float(input("SWALLOWING_DIFFICULTY: "))
CHEST_PAIN = float(input("CHEST_PAIN: "))

# Make prediction
future_prediction = loaded_model.predict([[GENOMIC_SEX, AGE, BLOOD_TYPE, NUMBER_OF_SIBLINGS, PARENT_ALIVE, SMOKING_STATUS,
                                           DAILY_CIGARETTES, YELLOW_SKIN, ANXIETY, PEER_PRESSURE, COPD_DIAGNOSES, FATIGUE,
                                           ALLERGY, WHEEZING, ALCOHOL_CONSUMPTION, COUGHING, SHORTNESS_OF_BREATH,
                                           SWALLOWING_DIFFICULTY, CHEST_PAIN]])

# Print or use the numerical prediction directly
print("Predicted outcome related to Lung cancer:", future_prediction[0])

# Print the categorical prediction
# If you want to use the get_reg_prediction function
print("Predicted outcome related to Lung cancer (Yes/No):", get_reg_prediction(future_prediction[0]))

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best parameters found by GridSearchCV: {'C': 1, 'kernel': 'linear'}
Accuracy on test set: 0.8178571428571428
Confusion Matrix:
 [[  0   0   1]
 [  0  23  32]
 [  0  18 206]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         1
         1.0       0.56      0.42      0.48        55
         2.0       0.86      0.92      0.89       224

    accuracy                           0.82       280
   macro avg       0.47      0.45      0.46       280
weighted avg       0.80      0.82      0.81       280

Gender (0 for F, 1 for M): 1
Age: 55
BLOOD_TYPE : 3
FamilyHistory LC: 1
PARENT_DeathwithReason: 1
SMOKING_STATUS: 2
DAILY_CIGARETTES: 26
YELLOW_SKIN: 1
ANXIETY: 1
PEER_PRESSURE: 2
COPD_DIAGNOSES: 2
FATIGUE: 2
ALLERGY: 1
WHEEZING: 1
ALCOHOL_CONSUMPTION: 2
COUGHING: 2
SHORTNESS_OF_BREATH: 2
SWALLOWING_DIFFICULTY: 1
CHEST_PAIN: 2
Predicted outcome related to Lung cancer: 2.0
Predicted outcome related to Lung cancer (Yes/



# ANN 

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import pickle

indep_X = df1[['GENOMIC SEX', 'AGE', 'BLOOD_TYPE', 'NUMBER_OF_SIBLINGS',
       'PARENT_ALIVE', 'SMOKING_STATUS', 'DAILY_CIGARETTES', 'YELLOW_SKIN',
       'ANXIETY', 'PEER_PRESSURE', 'COPD_DIAGNOSES', 'FATIGUE', 'ALLERGY',
       'WHEEZING', 'ALCOHOL_CONSUMPTION', 'COUGHING', 'SHORTNESS_OF_BREATH',
       'SWALLOWING_DIFFICULTY', 'CHEST_PAIN']]
dep_Y = df1['LUNG_CANCER']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size=0.25, random_state=0)

# Scale the data
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Build the ANN model
model = Sequential()
model.add(Dense(units=32, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(units=16, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))  # Use sigmoid for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=10, validation_split=0.1)

# Evaluate the model
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob >= 1).astype(int)  # Convert probabilities to binary class labels

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
clf_report = classification_report(y_test, y_pred)

print("Accuracy on test set:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", clf_report)

# Save the model
model.save("ANN_Lungcancer_finalmodel.h5")

# Load the saved model
from tensorflow.keras.models import load_model
loaded_model = load_model("ANN_Lungcancer_finalmodel.h5")

# Function to make predictions
def get_reg_prediction(prediction):
    return "Yes" if prediction >= 1 else "No"

# Example input values
input_data = {
    "GENOMIC SEX": 1,
    "AGE": 55,
    "BLOOD_TYPE": 3,
    "NUMBER_OF_SIBLINGS": 1,
    "PARENT_ALIVE": 1,
    "SMOKING_STATUS": 2,
    "DAILY_CIGARETTES": 26,
    "YELLOW_SKIN": 1, 
    "ANXIETY": 1,
    "PEER_PRESSURE": 2,
    "COPD_DIAGNOSES": 2,
    "FATIGUE": 2,
    "ALLERGY": 1,
    "WHEEZING": 1,
    "ALCOHOL_CONSUMPTION": 2,
    "COUGHING": 2,
    "SHORTNESS_OF_BREATH": 2,
    "SWALLOWING_DIFFICULTY": 1,
    "CHEST_PAIN": 2}

input_df = pd.DataFrame([input_data])
input_scaled = sc.transform(input_df)

# Make prediction
future_prediction_prob = loaded_model.predict(input_scaled)
future_prediction = (future_prediction_prob >= 1).astype(int)

# Print or use the numerical prediction directly
print("Predicted probability of Lung cancer:", future_prediction_prob[0][0])

# Print the categorical prediction
print("Predicted outcome related to Lung cancer (Yes/No):", get_reg_prediction(future_prediction_prob[0][0]))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.2049 - loss: -0.6772 - val_accuracy: 0.0714 - val_loss: -3.7118
Epoch 2/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.2094 - loss: -5.0613 - val_accuracy: 0.0714 - val_loss: -15.8943
Epoch 3/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2181 - loss: -19.3890 - val_accuracy: 0.0714 - val_loss: -48.6477
Epoch 4/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2112 - loss: -53.2137 - val_accuracy: 0.0714 - val_loss: -116.1420
Epoch 5/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.1895 - loss: -128.6266 - val_accuracy: 0.0714 - val_loss: -235.2844
Epoch 6/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2071 - loss: -248.3290 - val_accuracy: 0.0714 - val_loss: -416.3846
Epoch 7/50
[1

[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2047 - loss: -134893.6094 - val_accuracy: 0.0714 - val_loss: -161297.9375
Epoch 50/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2196 - loss: -140463.6094 - val_accuracy: 0.0714 - val_loss: -169810.8594
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy on test set: 0.19642857142857142
Confusion Matrix:
 [[  0   1   0]
 [  0  55   0]
 [  0 224   0]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         1
         1.0       0.20      1.00      0.33        55
         2.0       0.00      0.00      0.00       224

    accuracy                           0.20       280
   macro avg       0.07      0.33      0.11       280
weighted avg       0.04      0.20      0.06       280





[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
Predicted probability of Lung cancer: 1.0
Predicted outcome related to Lung cancer (Yes/No): Yes


# Navie Bayes

In [17]:
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score


# Split the data
X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size=0.25, random_state=0)

# Initialize and fit the Naive Bayes model
model = GaussianNB()
model.fit(X_train, y_train)

# Predict on test set
y_predict = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_predict)
conf_matrix = confusion_matrix(y_test, y_predict)
clf_report = classification_report(y_test, y_predict)

print("Accuracy on test set:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", clf_report)

# Save the model
filename = "NB_Lungcancer_finalmodel.sav"
pickle.dump(model, open(filename, 'wb'))

# Load the saved model
loaded_model = pickle.load(open(filename, 'rb'))

def get_reg_prediction(prediction):
    return "Yes" if prediction >= 1 else "No"

# Example input values
GENOMIC_SEX = int(input("Gender (0 for F, 1 for M): "))
AGE = float(input("Age: "))
BLOOD_TYPE = float(input("BLOOD_TYPE : "))
NUMBER_OF_SIBLINGS = float(input("FamilyHistory LC: "))
PARENT_ALIVE = float(input("PARENT_DeathwithReason: "))
SMOKING_STATUS = float(input("SMOKING_STATUS: "))
DAILY_CIGARETTES = float(input("DAILY_CIGARETTES: "))
YELLOW_SKIN = float(input("YELLOW_SKIN: "))
ANXIETY = float(input("ANXIETY: "))
PEER_PRESSURE = float(input("PEER_PRESSURE: "))
COPD_DIAGNOSES = float(input("COPD_DIAGNOSES: "))
FATIGUE = float(input("FATIGUE: "))
ALLERGY = float(input("ALLERGY: "))
WHEEZING = float(input("WHEEZING: "))
ALCOHOL_CONSUMPTION = float(input("ALCOHOL_CONSUMPTION: "))
COUGHING = float(input("COUGHING: "))
SHORTNESS_OF_BREATH = float(input("SHORTNESS_OF_BREATH: "))
SWALLOWING_DIFFICULTY = float(input("SWALLOWING_DIFFICULTY: "))
CHEST_PAIN = float(input("CHEST_PAIN: "))

# Make prediction
future_prediction = loaded_model.predict([[GENOMIC_SEX, AGE, BLOOD_TYPE, NUMBER_OF_SIBLINGS, PARENT_ALIVE, SMOKING_STATUS,
                                           DAILY_CIGARETTES, YELLOW_SKIN, ANXIETY, PEER_PRESSURE, COPD_DIAGNOSES, FATIGUE,
                                           ALLERGY, WHEEZING, ALCOHOL_CONSUMPTION, COUGHING, SHORTNESS_OF_BREATH,
                                           SWALLOWING_DIFFICULTY, CHEST_PAIN]])

# Print or use the numerical prediction directly
print("Predicted outcome related to Lung cancer:", future_prediction[0])

# Print the categorical prediction
# If you want to use the get_reg_prediction function
print("Predicted outcome related to Lung cancer (Yes/No):", get_reg_prediction(future_prediction[0]))


Accuracy on test set: 0.5607142857142857
Confusion Matrix:
 [[  0   0   1]
 [ 12  27  16]
 [ 70  24 130]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         1
         1.0       0.53      0.49      0.51        55
         2.0       0.88      0.58      0.70       224

    accuracy                           0.56       280
   macro avg       0.47      0.36      0.40       280
weighted avg       0.81      0.56      0.66       280

Gender (0 for F, 1 for M): 1
Age: 55
BLOOD_TYPE : 3
FamilyHistory LC: 1
PARENT_DeathwithReason: 1
SMOKING_STATUS: 2
DAILY_CIGARETTES: 26
YELLOW_SKIN: 1
ANXIETY: 1
PEER_PRESSURE: 2
COPD_DIAGNOSES: 2
FATIGUE: 2
ALLERGY: 1
WHEEZING: 1
ALCOHOL_CONSUMPTION: 2
COUGHING: 2
SHORTNESS_OF_BREATH: 2
SWALLOWING_DIFFICULTY: 1
CHEST_PAIN: 2
Predicted outcome related to Lung cancer: 0.0
Predicted outcome related to Lung cancer (Yes/No): No




# KNN 

In [18]:
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Load and preprocess the dataset
dataset = pd.read_csv("Pre_Lungcancer.csv")
dataset.drop([col for col in dataset.columns if 'Unnamed: 0' in col], axis=1, inplace=True)
df1 = pd.get_dummies(dataset, drop_first=True)

indep_X = df1[['GENOMIC SEX', 'AGE', 'BLOOD_TYPE', 'NUMBER_OF_SIBLINGS',
       'PARENT_ALIVE', 'SMOKING_STATUS', 'DAILY_CIGARETTES', 'YELLOW_SKIN',
       'ANXIETY', 'PEER_PRESSURE', 'COPD_DIAGNOSES', 'FATIGUE', 'ALLERGY',
       'WHEEZING', 'ALCOHOL_CONSUMPTION', 'COUGHING', 'SHORTNESS_OF_BREATH',
       'SWALLOWING_DIFFICULTY', 'CHEST_PAIN']]
dep_Y = df1['LUNG_CANCER']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size=0.25, random_state=0)

# Initialize and fit the k-NN model
model = KNeighborsClassifier()
model.fit(X_train, y_train)

# Predict on test set
y_predict = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]  # Get probabilities for the positive class

# Evaluate the model
accuracy = accuracy_score(y_test, y_predict)
conf_matrix = confusion_matrix(y_test, y_predict)
clf_report = classification_report(y_test, y_predict)

# Print evaluation metrics
print("Accuracy on test set:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", clf_report)

# Save the model
filename = "KNN_Lungcancer_finalmodel.sav"
pickle.dump(model, open(filename, 'wb'))

# Load the saved model
loaded_model = pickle.load(open(filename, 'rb'))

def get_classification_prediction(prediction):
    return "Yes" if prediction == 1 else "No"

# Example input values
GENOMIC_SEX = int(input("Gender (0 for F, 1 for M): "))
AGE = float(input("Age: "))
BLOOD_TYPE = float(input("BLOOD_TYPE : "))
NUMBER_OF_SIBLINGS = float(input("FamilyHistory LC: "))
PARENT_ALIVE = float(input("PARENT_DeathwithReason: "))
SMOKING_STATUS = float(input("SMOKING_STATUS: "))
DAILY_CIGARETTES = float(input("DAILY_CIGARETTES: "))
YELLOW_SKIN = float(input("YELLOW_SKIN: "))
ANXIETY = float(input("ANXIETY: "))
PEER_PRESSURE = float(input("PEER_PRESSURE: "))
COPD_DIAGNOSES = float(input("COPD_DIAGNOSES: "))
FATIGUE = float(input("FATIGUE: "))
ALLERGY = float(input("ALLERGY: "))
WHEEZING = float(input("WHEEZING: "))
ALCOHOL_CONSUMPTION = float(input("ALCOHOL_CONSUMPTION: "))
COUGHING = float(input("COUGHING: "))
SHORTNESS_OF_BREATH = float(input("SHORTNESS_OF_BREATH: "))
SWALLOWING_DIFFICULTY = float(input("SWALLOWING_DIFFICULTY: "))
CHEST_PAIN = float(input("CHEST_PAIN: "))

# Make prediction
future_prediction = loaded_model.predict([[GENOMIC_SEX, AGE, BLOOD_TYPE, NUMBER_OF_SIBLINGS, PARENT_ALIVE, SMOKING_STATUS,
                                           DAILY_CIGARETTES, YELLOW_SKIN, ANXIETY, PEER_PRESSURE, COPD_DIAGNOSES, FATIGUE,
                                           ALLERGY, WHEEZING, ALCOHOL_CONSUMPTION, COUGHING, SHORTNESS_OF_BREATH,
                                           SWALLOWING_DIFFICULTY, CHEST_PAIN]])

# Print or use the numerical prediction directly
print("Predicted outcome related to Lung cancer:", future_prediction[0])

# Print the categorical prediction
# If you want to use the get_reg_prediction function
print("Predicted outcome related to Lung cancer (Yes/No):", get_reg_prediction(future_prediction[0]))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy on test set: 0.7928571428571428
Confusion Matrix:
 [[  0   0   1]
 [  0  11  44]
 [  0  13 211]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         1
         1.0       0.46      0.20      0.28        55
         2.0       0.82      0.94      0.88       224

    accuracy                           0.79       280
   macro avg       0.43      0.38      0.39       280
weighted avg       0.75      0.79      0.76       280

Gender (0 for F, 1 for M): 1
Age: 55
BLOOD_TYPE : 3
FamilyHistory LC: 1
PARENT_DeathwithReason: 1
SMOKING_STATUS: 2
DAILY_CIGARETTES: 26
YELLOW_SKIN: 1
ANXIETY: 1
PEER_PRESSURE: 2
COPD_DIAGNOSES: 2
FATIGUE: 2
ALLERGY: 1
WHEEZING: 1
ALCOHOL_CONSUMPTION: 2
COUGHING: 2
SHORTNESS_OF_BREATH: 2
SWALLOWING_DIFFICULTY: 1
CHEST_PAIN: 2
Predicted outcome related to Lung cancer: 2.0
Predicted outcome related to Lung cancer (Yes/No): Yes


