In [2]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
import joblib


In [3]:
# Load the raw dataset from a CSV that sits next to the notebook
# (relative path, so the kernel's working directory matters).
DATA_PATH = "data.csv"
df = pd.read_csv(DATA_PATH)

In [5]:
# Show the column labels exactly as read from the CSV.
# NOTE: per the rendered output, two labels carry a trailing space
# ('FATIGUE ' and 'ALLERGY '), so exact-name lookups must include it.
df.columns

Index(['GENDER', 'AGE', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY',
       'PEER_PRESSURE', 'CHRONIC DISEASE', 'FATIGUE ', 'ALLERGY ', 'WHEEZING',
       'ALCOHOL CONSUMING', 'COUGHING', 'SHORTNESS OF BREATH',
       'SWALLOWING DIFFICULTY', 'CHEST PAIN', 'LUNG_CANCER'],
      dtype='object')

In [5]:
# Preview the freshly loaded frame before any encoding is applied
# (the rendering elides the middle rows).
df

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,F,56,1,1,1,2,2,2,1,1,2,2,2,2,1,YES
305,M,70,2,1,1,1,1,2,2,2,2,2,2,1,2,YES
306,M,58,2,1,1,1,1,1,2,2,2,2,1,1,2,YES
307,M,67,2,1,2,1,1,2,2,1,2,2,2,1,2,YES


In [6]:
# Human-readable codebook for the dataset's fields.
# NOTE(review): the keys here are descriptive names, not the DataFrame's actual
# column labels (e.g. "Chronic_Disease" vs 'CHRONIC DISEASE'), and no visible
# cell reads this dict -- it appears to serve as documentation only.

# Every yes/no field uses the same coding: 2 means YES, 1 means NO.
_YES_NO_CODES = {"YES": 2, "NO": 1}

# Yes/no fields, listed in the order they appear in the schema.
_BINARY_FIELDS = (
    "Smoking",
    "Yellow_Fingers",
    "Anxiety",
    "Peer_Pressure",
    "Chronic_Disease",
    "Fatigue",
    "Allergy",
    "Wheezing",
    "Alcohol",
    "Coughing",
    "Shortness_of_Breath",
    "Swallowing_Difficulty",
    "Chest_Pain",
)

lung_cancer_schema = {
    "Gender": {"M": "Male", "F": "Female"},
    "Age": "Age of the patient (integer)",
    # Each field gets its own copy of the template so entries stay independent.
    **{field: dict(_YES_NO_CODES) for field in _BINARY_FIELDS},
    "Lung_Cancer": {"YES": "Positive", "NO": "Negative"},
}


In [7]:
# Encode the two string-valued columns as integers:
#   GENDER:      "M" -> 0, "F" -> 1
#   LUNG_CANCER: "NO" -> 0, "YES" -> 1
#
# Series.map turns every value missing from the lookup into NaN. A plain
# {"M": 0, "F": 1} lookup would therefore wipe the column if this cell were
# executed a second time (the values are already 0/1 by then). Each lookup
# below also maps the integer codes onto themselves, which makes the cell safe
# to re-run, and anything still unmapped raises instead of becoming NaN.
_LABEL_CODES = {
    "GENDER": {"M": 0, "F": 1},
    "LUNG_CANCER": {"YES": 1, "NO": 0},
}

for _column, _codes in _LABEL_CODES.items():
    _lookup = {**_codes, **{_code: _code for _code in _codes.values()}}
    _encoded = df[_column].map(_lookup)
    if _encoded.isna().any():
        _unexpected = df.loc[_encoded.isna(), _column].unique().tolist()
        raise ValueError(f"Unexpected values in {_column!r}: {_unexpected}")
    df[_column] = _encoded

# Display the encoded frame.
df

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,0,69,1,2,2,1,1,2,1,2,2,2,2,2,2,1
1,0,74,2,1,1,1,2,2,2,1,1,1,2,2,2,1
2,1,59,1,1,1,2,1,2,1,2,1,2,2,1,2,0
3,0,63,2,2,2,1,1,1,1,1,2,1,1,2,2,0
4,1,63,1,2,1,1,1,1,1,2,1,2,2,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,1,56,1,1,1,2,2,2,1,1,2,2,2,2,1,1
305,0,70,2,1,1,1,1,2,2,2,2,2,2,1,2,1
306,0,58,2,1,1,1,1,1,2,2,2,2,1,1,2,1
307,0,67,2,1,2,1,1,2,2,1,2,2,2,1,2,1


In [9]:
# Separate the label from the predictors.
TARGET_COLUMN = "LUNG_CANCER"

y = df[TARGET_COLUMN]               # target: 1 = YES, 0 = NO
X = df.drop(columns=TARGET_COLUMN)  # features: every remaining column

In [43]:
# Hold out 40% of the rows for evaluation. `stratify=y` keeps the class ratio
# the same in both splits, and `random_state` pins the shuffle so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
    stratify=y,
)

# Gradient-boosted tree classifier. The former `use_label_encoder=False`
# argument is dropped: the installed XGBoost reports it as unused
# ('Parameters: { "use_label_encoder" } are not used.'), so it only produced a
# warning on every fit. All hyperparameters are unchanged.
model = xgb.XGBClassifier(
    eval_metric="logloss",
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=10,
)

In [44]:
# Train on the training split only; the fitted estimator is the cell's output.
model.fit(X=X_train, y=y_train)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [45]:
# Score the held-out split: class predictions first, then the metrics.
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

# NOTE(review): the test split is imbalanced (support 16 vs 108 in the report),
# so the per-class rows say more about the minority class than overall accuracy.
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", report)

Accuracy: 91.94%
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.62      0.67        16
           1       0.95      0.96      0.95       108

    accuracy                           0.92       124
   macro avg       0.83      0.79      0.81       124
weighted avg       0.92      0.92      0.92       124



In [46]:
# Persist the fitted classifier to the working directory; joblib.dump returns
# the list of files it wrote, which becomes the cell's output.
# NOTE(review): this is a pickle-based file -- presumably it must be loaded with
# compatible library versions, and only from a trusted source.
MODEL_PATH = "lung_cancer_model.pkl"
joblib.dump(model, MODEL_PATH)


['lung_cancer_model.pkl']