In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split 
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pickle
import matplotlib.pyplot as plt
from xgboost import XGBRegressor

In [2]:
# Load the pre-processed lung-cancer table; `dataset` keeps the frame exactly
# as read from disk.
dataset = pd.read_csv("Pre_Lungcancer.csv", index_col=None)

# Work on an encoded copy: get_dummies one-hot encodes any non-numeric column
# (dropping the first level of each) and returns a new frame.
df1 = pd.get_dummies(dataset, drop_first=True)

# Predictor columns, in the order the models are trained on.
feature_columns = [
    'GENOMIC SEX', 'AGE', 'BLOOD_TYPE', 'NUMBER_OF_SIBLINGS',
    'PARENT_ALIVE', 'SMOKING_STATUS', 'DAILY_CIGARETTES', 'YELLOW_SKIN',
    'ANXIETY', 'PEER_PRESSURE', 'COPD_DIAGNOSES', 'FATIGUE', 'ALLERGY',
    'WHEEZING', 'ALCOHOL_CONSUMPTION', 'COUGHING', 'SHORTNESS_OF_BREATH',
    'SWALLOWING_DIFFICULTY', 'CHEST_PAIN',
]
indep_X = df1[feature_columns]

# Target column.
dep_Y = df1['LUNG_CANCER']

In [3]:
# Display the frame as read from the CSV (before any column is dropped).
dataset

Unnamed: 0.1,Unnamed: 0,GENOMIC SEX,AGE,BLOOD_TYPE,NUMBER_OF_SIBLINGS,PARENT_ALIVE,SMOKING_STATUS,DAILY_CIGARETTES,YELLOW_SKIN,ANXIETY,...,COPD_DIAGNOSES,FATIGUE,ALLERGY,WHEEZING,ALCOHOL_CONSUMPTION,COUGHING,SHORTNESS_OF_BREATH,SWALLOWING_DIFFICULTY,CHEST_PAIN,LUNG_CANCER
0,0,1.0,69,0.0,5.0,3,1,17.0,2,2,...,1.0,2.0,1,2.0,2,2,2.0,2,2,2.0
1,1,1.0,74,1.0,1.0,2,2,29.0,1,1,...,2.0,2.0,2,1.0,1,1,2.0,2,2,2.0
2,2,0.0,59,2.0,0.0,2,1,17.0,1,1,...,1.0,2.0,1,2.0,1,2,2.0,1,2,1.0
3,3,1.0,63,3.0,3.0,2,2,20.0,2,2,...,1.0,1.0,1,1.0,2,1,1.0,2,2,1.0
4,4,0.0,63,0.0,4.0,1,1,17.0,2,1,...,1.0,1.0,1,2.0,1,2,2.0,1,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1115,1115,0.0,57,0.0,3.0,3,1,17.0,1,1,...,2.0,2.0,1,2.0,2,2,1.0,1,1,1.0
1116,1116,0.0,51,0.0,2.0,2,2,21.0,1,2,...,1.0,1.0,1,1.0,1,1,2.0,1,2,1.0
1117,1117,0.0,65,0.0,1.0,1,1,17.0,2,1,...,1.0,2.0,1,2.0,1,2,1.0,1,1,1.0
1118,1118,0.0,57,0.0,2.0,1,2,21.0,1,2,...,2.0,2.0,1,2.0,1,2,1.0,1,1,1.0


In [4]:
# List the column labels of the frame as read from the CSV.
dataset.columns

Index(['Unnamed: 0', 'GENOMIC SEX', 'AGE', 'BLOOD_TYPE', 'NUMBER_OF_SIBLINGS',
       'PARENT_ALIVE', 'SMOKING_STATUS', 'DAILY_CIGARETTES', 'YELLOW_SKIN',
       'ANXIETY', 'PEER_PRESSURE', 'COPD_DIAGNOSES', 'FATIGUE', 'ALLERGY',
       'WHEEZING', 'ALCOHOL_CONSUMPTION', 'COUGHING', 'SHORTNESS_OF_BREATH',
       'SWALLOWING_DIFFICULTY', 'CHEST_PAIN', 'LUNG_CANCER'],
      dtype='object')

In [5]:
# Remove every column whose label contains 'Unnamed: 0' (this also matches
# variants such as 'Unnamed: 0.1'). `df1` is modified in place.
unnamed_columns = [label for label in df1.columns if 'Unnamed: 0' in label]
df1.drop(columns=unnamed_columns, inplace=True)
df1

Unnamed: 0,GENOMIC SEX,AGE,BLOOD_TYPE,NUMBER_OF_SIBLINGS,PARENT_ALIVE,SMOKING_STATUS,DAILY_CIGARETTES,YELLOW_SKIN,ANXIETY,PEER_PRESSURE,COPD_DIAGNOSES,FATIGUE,ALLERGY,WHEEZING,ALCOHOL_CONSUMPTION,COUGHING,SHORTNESS_OF_BREATH,SWALLOWING_DIFFICULTY,CHEST_PAIN,LUNG_CANCER
0,1.0,69,0.0,5.0,3,1,17.0,2,2,1,1.0,2.0,1,2.0,2,2,2.0,2,2,2.0
1,1.0,74,1.0,1.0,2,2,29.0,1,1,1,2.0,2.0,2,1.0,1,1,2.0,2,2,2.0
2,0.0,59,2.0,0.0,2,1,17.0,1,1,2,1.0,2.0,1,2.0,1,2,2.0,1,2,1.0
3,1.0,63,3.0,3.0,2,2,20.0,2,2,1,1.0,1.0,1,1.0,2,1,1.0,2,2,1.0
4,0.0,63,0.0,4.0,1,1,17.0,2,1,1,1.0,1.0,1,2.0,1,2,2.0,1,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1115,0.0,57,0.0,3.0,3,1,17.0,1,1,1,2.0,2.0,1,2.0,2,2,1.0,1,1,1.0
1116,0.0,51,0.0,2.0,2,2,21.0,1,2,1,1.0,1.0,1,1.0,1,1,2.0,1,2,1.0
1117,0.0,65,0.0,1.0,1,1,17.0,2,1,1,1.0,2.0,1,2.0,1,2,1.0,1,1,1.0
1118,0.0,57,0.0,2.0,1,2,21.0,1,2,1,2.0,2.0,1,2.0,1,2,1.0,1,1,1.0


In [6]:
# Display the initial number of rows
print(f"Initial number of rows: {dataset.shape[0]}")

# Remove duplicate rows.
# `dataset` still carries the 'Unnamed: 0*' row-counter columns, which differ
# on every row; comparing whole rows would therefore never find a duplicate.
# Compare on the remaining columns only (all columns are kept in the result).
record_columns = [label for label in dataset.columns
                  if not str(label).startswith("Unnamed")]
# `or None` falls back to whole-row comparison if every column is 'Unnamed*'.
dataset_cleaned = dataset.drop_duplicates(subset=record_columns or None)

Initial number of rows: 1120


In [7]:
# Report how many rows survived duplicate removal.
remaining_rows = dataset_cleaned.shape[0]
print(f"Number of rows after removing duplicates: {remaining_rows}")

# Renumber the rows 0..n-1, discarding the old index.
dataset_cleaned = dataset_cleaned.reset_index(drop=True)

Number of rows after removing duplicates: 1120


In [8]:
def split_scalar(indep_X, dep_Y):
    """Split the data 75/25 and standardise the features.

    The split uses ``test_size=0.25`` and ``random_state=0``. The scaler is
    fitted on the training features only and then applied to both parts.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the two feature parts are
        the scaled outputs of ``StandardScaler`` and the two target parts are
        returned as produced by ``train_test_split``.
    """
    train_features, test_features, y_train, y_test = train_test_split(
        indep_X, dep_Y, test_size=0.25, random_state=0
    )
    scaler = StandardScaler().fit(train_features)
    return (
        scaler.transform(train_features),
        scaler.transform(test_features),
        y_train,
        y_test,
    )

In [9]:
def r2_prediction(regressor, X_test, y_test):
    """Return the R2 score of ``regressor`` on ``(X_test, y_test)``.

    ``regressor`` only needs a ``predict(X_test)`` method; its predictions are
    compared against ``y_test`` with ``sklearn.metrics.r2_score``.
    """
    from sklearn.metrics import r2_score

    predictions = regressor.predict(X_test)
    return r2_score(y_test, predictions)

In [10]:
def Linear(X_train, y_train, X_test, y_test=None):
    """Fit a ``LinearRegression`` and return its R2 on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and targets passed to ``fit``.
    X_test
        Features the fitted model is scored on.
    y_test : optional
        True targets for ``X_test``. When omitted, the notebook-level
        ``y_test`` is used, which is what the original three-argument form
        silently depended on.

    Returns
    -------
    The value returned by ``r2_prediction`` for the fitted model.

    Raises
    ------
    NameError
        If ``y_test`` is not passed and no notebook-level ``y_test`` exists.
    """
    from sklearn.linear_model import LinearRegression

    if y_test is None:
        # Legacy call path: fall back to the global the old body read.
        if "y_test" not in globals():
            raise NameError(
                "y_test was not passed and no notebook-level y_test is defined"
            )
        y_test = globals()["y_test"]

    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

In [11]:
def Decision(X_train, y_train, X_test, y_test=None):
    """Fit a ``DecisionTreeRegressor`` (``random_state=0``) and return its test R2.

    Parameters
    ----------
    X_train, y_train
        Training features and targets passed to ``fit``.
    X_test
        Features the fitted model is scored on.
    y_test : optional
        True targets for ``X_test``. When omitted, the notebook-level
        ``y_test`` is used, which is what the original three-argument form
        silently depended on.

    Returns
    -------
    The value returned by ``r2_prediction`` for the fitted model.

    Raises
    ------
    NameError
        If ``y_test`` is not passed and no notebook-level ``y_test`` exists.
    """
    from sklearn.tree import DecisionTreeRegressor

    if y_test is None:
        # Legacy call path: fall back to the global the old body read.
        if "y_test" not in globals():
            raise NameError(
                "y_test was not passed and no notebook-level y_test is defined"
            )
        y_test = globals()["y_test"]

    regressor = DecisionTreeRegressor(random_state=0)
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

In [12]:
def random(X_train, y_train, X_test, y_test=None):
    """Fit a 10-tree ``RandomForestRegressor`` (``random_state=0``) and return its test R2.

    Parameters
    ----------
    X_train, y_train
        Training features and targets passed to ``fit``.
    X_test
        Features the fitted model is scored on.
    y_test : optional
        True targets for ``X_test``. When omitted, the notebook-level
        ``y_test`` is used, which is what the original three-argument form
        silently depended on.

    Returns
    -------
    The value returned by ``r2_prediction`` for the fitted model.

    Raises
    ------
    NameError
        If ``y_test`` is not passed and no notebook-level ``y_test`` exists.
    """
    from sklearn.ensemble import RandomForestRegressor

    if y_test is None:
        # Legacy call path: fall back to the global the old body read.
        if "y_test" not in globals():
            raise NameError(
                "y_test was not passed and no notebook-level y_test is defined"
            )
        y_test = globals()["y_test"]

    regressor = RandomForestRegressor(n_estimators=10, random_state=0)
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

In [13]:
def xgboost(X_train, y_train, X_test, y_test=None):
    """Fit an ``XGBRegressor`` and return its R2 on the test split.

    The regressor is built with ``n_jobs=5``, ``learning_rate=0.1``,
    ``max_depth=10`` and ``random_state=1``.

    Parameters
    ----------
    X_train, y_train
        Training features and targets passed to ``fit``.
    X_test
        Features the fitted model is scored on.
    y_test : optional
        True targets for ``X_test``. When omitted, the notebook-level
        ``y_test`` is used, which is what the original three-argument form
        silently depended on.

    Returns
    -------
    The value returned by ``r2_prediction`` for the fitted model.

    Raises
    ------
    NameError
        If ``y_test`` is not passed and no notebook-level ``y_test`` exists.
    """
    # Imported locally: this function's own name is `xgboost`, so the class is
    # pulled in explicitly rather than through a module attribute.
    from xgboost import XGBRegressor

    if y_test is None:
        # Legacy call path: fall back to the global the old body read.
        if "y_test" not in globals():
            raise NameError(
                "y_test was not passed and no notebook-level y_test is defined"
            )
        y_test = globals()["y_test"]

    regressor = XGBRegressor(n_jobs=5, learning_rate=0.1, max_depth=10, random_state=1)
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

In [14]:
def rfeFeature(indep_X, dep_Y, n):
    """Select ``n`` features with RFE for four regressors and score each one.

    For each estimator, in the order linear regression, decision tree, random
    forest, XGBoost:

    1. an ``RFE`` selector wrapping the estimator is fitted on
       ``(indep_X, dep_Y)`` and used to reduce ``indep_X`` to ``n`` columns;
    2. the reduced matrix is split and scaled by ``split_scalar``;
    3. the estimator is fitted on the training part and its R2 on the test
       part is computed with ``r2_prediction``.

    ``indep_X`` must expose ``.columns`` (the selected labels are read from it).

    Returns
    -------
    tuple of three lists, aligned with the estimator order
        the reduced feature matrices, the selected column labels, and the
        R2 values.

    NOTE(review): the selector is fitted on the full data before the
    train/test split, so test rows take part in choosing the features.
    """
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import LinearRegression
    from sklearn.tree import DecisionTreeRegressor
    from xgboost import XGBRegressor

    estimators = [
        LinearRegression(),
        DecisionTreeRegressor(random_state=0),
        RandomForestRegressor(n_estimators=10, random_state=0),
        XGBRegressor(n_jobs=5, learning_rate=0.1, max_depth=10, random_state=1),
    ]

    rfelist, colnames_list, r2_values = [], [], []

    for estimator in estimators:
        # Feature selection on the complete feature table.
        selector = RFE(estimator=estimator, n_features_to_select=n)
        selector.fit(indep_X, dep_Y)
        reduced_features = selector.transform(indep_X)
        rfelist.append(reduced_features)

        # `support_` is the boolean keep-mask over the input columns.
        colnames_list.append(list(indep_X.columns[selector.support_]))

        # Hold-out evaluation of the same estimator on the reduced features.
        X_train, X_test, y_train, y_test = split_scalar(
            pd.DataFrame(reduced_features), dep_Y
        )
        estimator.fit(X_train, y_train)
        r2_values.append(r2_prediction(estimator, X_test, y_test))

    return rfelist, colnames_list, r2_values

# Run RFE keeping five features per estimator.
rfelist, colnames_list, r2_values = rfeFeature(indep_X, dep_Y, 5)

# Report, per estimator, which columns were kept and the resulting R2.
model_labels = ["Linear", "Decision", "Random", "XGBoost"]
report_rows = zip(model_labels, colnames_list, r2_values)
for model_name, selected_columns, r2_value in report_rows:
    print(f"Model: {model_name}")
    print("Selected Columns:", selected_columns)
    print(f"R2 Value: {r2_value}\n")

Model: Linear
Selected Columns: ['ANXIETY', 'FATIGUE', 'ALLERGY', 'ALCOHOL_CONSUMPTION', 'SWALLOWING_DIFFICULTY']
R2 Value: 0.10275924009396886

Model: Decision
Selected Columns: ['AGE', 'NUMBER_OF_SIBLINGS', 'PARENT_ALIVE', 'DAILY_CIGARETTES', 'SWALLOWING_DIFFICULTY']
R2 Value: -0.47690452867153943

Model: Random
Selected Columns: ['AGE', 'NUMBER_OF_SIBLINGS', 'PARENT_ALIVE', 'DAILY_CIGARETTES', 'PEER_PRESSURE']
R2 Value: 0.01297978047371462

Model: XGBoost
Selected Columns: ['AGE', 'COPD_DIAGNOSES', 'ALLERGY', 'SHORTNESS_OF_BREATH', 'SWALLOWING_DIFFICULTY']
R2 Value: 0.1861826865259968



# Model creation

In [15]:
import pickle
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

# Features and target come from the data-loading cell.
selected_features, target_variable = indep_X, dep_Y

# 75/25 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    indep_X, dep_Y, test_size=0.25, random_state=0
)

# Single-candidate grid: linear-kernel SVR with C=1.
param_grid = {'kernel': ['linear'], 'C': [1]}

# refit=True retrains the best candidate on the whole training split, so
# `grid` can be used directly for prediction afterwards.
grid = GridSearchCV(
    estimator=SVR(),
    param_grid=param_grid,
    refit=True,
    verbose=3,
    n_jobs=1,
)
grid.fit(X_train, y_train)

# Hold-out evaluation with regression metrics.
# NOTE(review): the target is scored as a continuous value here; the displayed
# rows suggest it only takes the values 1 and 2 - confirm that regression
# metrics are the intended evaluation.
y_predict = grid.predict(X_test)
mse = mean_squared_error(y_test, y_predict)
r2 = r2_score(y_test, y_predict)

summary_lines = (
    ("Best parameters found by GridSearchCV:", grid.best_params_),
    ("Mean Squared Error (MSE) on test set:", mse),
    ("R-squared (R2) on test set:", r2),
)
for caption, shown_value in summary_lines:
    print(caption, shown_value)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END ................C=1, kernel=linear;, score=0.053 total time=   2.4s
[CV 2/5] END ................C=1, kernel=linear;, score=0.172 total time=   3.1s
[CV 3/5] END ................C=1, kernel=linear;, score=0.070 total time=   3.6s
[CV 4/5] END ...............C=1, kernel=linear;, score=-0.027 total time=   3.3s
[CV 5/5] END ................C=1, kernel=linear;, score=0.111 total time=   1.8s
Best parameters found by GridSearchCV: {'C': 1, 'kernel': 'linear'}
Mean Squared Error (MSE) on test set: 0.15692949678889448
R-squared (R2) on test set: 0.07292046204134373


Results:
Best parameters found by GridSearchCV: {'C': 1, 'kernel': 'linear'}
Mean Squared Error (MSE) on test set: 0.15692949678889448
R-squared (R2) on test set: 0.07292046204134373

In [16]:
# Summarise the tuned hyper-parameters and the hold-out metrics in one table.
# The values are read from the fitted search object and from the metrics
# computed in the model-fitting cell rather than retyped by hand, so the
# table cannot go stale when the notebook is re-run.
results = {
    "Parameter": ["C", "Kernel", "Mean Squared Error (MSE)", "R-squared (R2)"],
    "Value": [
        grid.best_params_["C"],
        grid.best_params_["kernel"],
        float(mse),  # plain floats keep the mixed-type column's rendering unchanged
        float(r2),
    ],
}

# Create a DataFrame
results_df = pd.DataFrame(results)

# Print the DataFrame
print(results_df)

                  Parameter     Value
0                         C         1
1                    Kernel    linear
2  Mean Squared Error (MSE)  0.156929
3            R-squared (R2)   0.07292


In [17]:
# Save the best model
# The fitted GridSearchCV object is pickled as a whole. The file is opened in
# a `with` block so the handle is flushed and closed even if pickling fails.
filename = "SVR_Lungcancer_finalmodel.sav"
with open(filename, 'wb') as model_file:
    pickle.dump(grid, model_file)

In [18]:
# Load the saved model
# SECURITY: unpickling executes code embedded in the file - only load model
# files that were produced by this notebook or another trusted source.
# The `with` block closes the file handle once the object is loaded.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [20]:
def get_reg_prediction(prediction, threshold=1.5):
    """Turn a numeric regression output into a "Yes"/"No" label.

    Parameters
    ----------
    prediction
        Numeric model output.
    threshold : optional
        Cut-off; values greater than or equal to it map to "Yes".
        Defaults to 1.5 (presumably the midpoint of the 1/2 target coding
        seen in the displayed rows - confirm against the data dictionary).

    Returns
    -------
    str
        "Yes" when ``prediction >= threshold``, otherwise "No".
    """
    return "Yes" if prediction >= threshold else "No"

# Collect one record interactively (replace with your actual input mechanism).
# Each prompt names the feature it fills. The prompts for NUMBER_OF_SIBLINGS
# and PARENT_ALIVE previously asked for different quantities than the columns
# the model was trained on, which invited wrong inputs.
GENOMIC_SEX = int(input("Gender (0 for F, 1 for M): "))
AGE = float(input("Age: "))
BLOOD_TYPE = float(input("BLOOD_TYPE : "))
NUMBER_OF_SIBLINGS = float(input("NUMBER_OF_SIBLINGS: "))
PARENT_ALIVE = float(input("PARENT_ALIVE: "))
SMOKING_STATUS = float(input("SMOKING_STATUS: "))
DAILY_CIGARETTES = float(input("DAILY_CIGARETTES: "))
YELLOW_SKIN = float(input("YELLOW_SKIN: "))
ANXIETY = float(input("ANXIETY: "))
PEER_PRESSURE = float(input("PEER_PRESSURE: "))
COPD_DIAGNOSES = float(input("COPD_DIAGNOSES: "))
FATIGUE = float(input("FATIGUE: "))
ALLERGY = float(input("ALLERGY: "))
WHEEZING = float(input("WHEEZING: "))
ALCOHOL_CONSUMPTION = float(input("ALCOHOL_CONSUMPTION: "))
COUGHING = float(input("COUGHING:"))
SHORTNESS_OF_BREATH = float(input("SHORTNESS_OF_BREATH:"))
SWALLOWING_DIFFICULTY = float(input("SWALLOWING_DIFFICULTY: "))
CHEST_PAIN = float(input("CHEST_PAIN: "))

# The model was fitted on a DataFrame, so predict on a one-row DataFrame with
# the same column labels in the same order. A bare nested list carries no
# labels, which makes scikit-learn warn about missing feature names and makes
# the call depend purely on positional order.
future_input = pd.DataFrame([{
    'GENOMIC SEX': GENOMIC_SEX,
    'AGE': AGE,
    'BLOOD_TYPE': BLOOD_TYPE,
    'NUMBER_OF_SIBLINGS': NUMBER_OF_SIBLINGS,
    'PARENT_ALIVE': PARENT_ALIVE,
    'SMOKING_STATUS': SMOKING_STATUS,
    'DAILY_CIGARETTES': DAILY_CIGARETTES,
    'YELLOW_SKIN': YELLOW_SKIN,
    'ANXIETY': ANXIETY,
    'PEER_PRESSURE': PEER_PRESSURE,
    'COPD_DIAGNOSES': COPD_DIAGNOSES,
    'FATIGUE': FATIGUE,
    'ALLERGY': ALLERGY,
    'WHEEZING': WHEEZING,
    'ALCOHOL_CONSUMPTION': ALCOHOL_CONSUMPTION,
    'COUGHING': COUGHING,
    'SHORTNESS_OF_BREATH': SHORTNESS_OF_BREATH,
    'SWALLOWING_DIFFICULTY': SWALLOWING_DIFFICULTY,
    'CHEST_PAIN': CHEST_PAIN,
}])

# One-element array holding the regression output for the entered record.
future_prediction = loaded_model.predict(future_input)


Gender (0 for F, 1 for M): 1
Age: 55
BLOOD_TYPE : 3
FamilyHistory LC: 1
PARENT_DeathwithReason: 1
SMOKING_STATUS: 2
DAILY_CIGARETTES: 26
YELLOW_SKIN: 1
ANXIETY: 1
PEER_PRESSURE: 2
COPD_DIAGNOSES: 2
FATIGUE: 2
ALLERGY: 1
WHEEZING: 1
ALCOHOL_CONSUMPTION: 2
COUGHING:2
SHORTNESS_OF_BREATH:2
SWALLOWING_DIFFICULTY: 1
CHEST_PAIN: 2




In [21]:
# Show the unthresholded regression output for the entered record.
raw_score = future_prediction[0]
print("Predicted outcome related to Lung cancer:", raw_score)

Predicted outcome related to Lung cancer: 1.9229351132297632


In [22]:
# Map the regression output to a Yes/No label with get_reg_prediction and show it.
predicted_label = get_reg_prediction(future_prediction[0])
print("Predicted outcome related to Lung cancer (Yes/No):", predicted_label)

Predicted outcome related to Lung cancer (Yes/No): Yes
