# Step - 1: Imporing necessary libraries

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score

ModuleNotFoundError: No module named 'pandas'

# Step - 2 : Data Wrangling

In [2]:
data = pd.read_csv("../datasets/LungCancer_CSVDataset/survey lung cancer.csv")
print(data.shape)
print(data.head(5))

(309, 16)
  GENDER  AGE  SMOKING  YELLOW_FINGERS  ANXIETY  PEER_PRESSURE  \
0      M   69        1               2        2              1   
1      M   74        2               1        1              1   
2      F   59        1               1        1              2   
3      M   63        2               2        2              1   
4      F   63        1               2        1              1   

   CHRONIC DISEASE  FATIGUE   ALLERGY   WHEEZING  ALCOHOL CONSUMING  COUGHING  \
0                1         2         1         2                  2         2   
1                2         2         2         1                  1         1   
2                1         2         1         2                  1         2   
3                1         1         1         1                  2         1   
4                1         1         1         2                  1         2   

   SHORTNESS OF BREATH  SWALLOWING DIFFICULTY  CHEST PAIN LUNG_CANCER  
0                    2            

In [3]:
# Convert categorical columns
data["GENDER"] = data["GENDER"].map({"M": 1, "F": 0})
data["LUNG_CANCER"] = data["LUNG_CANCER"].map({"YES": 1, "NO": 0})

# Splitting dataset
X = data.drop("LUNG_CANCER", axis=1)
y = data["LUNG_CANCER"]

In [4]:
# Normalize age and other numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split train/test data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Convert to tensors for PyTorch
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

In [5]:
import joblib
joblib.dump(scaler, "../models/scaler.pkl")

['../models/scaler.pkl']

# Step - 3: Building the Model

In [6]:
# Define a simple PyTorch Model (MLP)
class CancerNN(nn.Module):
    def __init__(self):
        super(CancerNN, self).__init__()
        self.fc1 = nn.Linear(X_train.shape[1], 16)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(16, 8)
        self.fc3 = nn.Linear(8, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

In [7]:
# Initialize model, loss, and optimizer
model = CancerNN()
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Step - 5: Training the Model

In [8]:
# Training the Neural Network
epochs = 100
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}, Loss: {loss.item()}")

Epoch 10, Loss: 0.4737773835659027
Epoch 20, Loss: 0.2655992805957794
Epoch 30, Loss: 0.1889576017856598
Epoch 40, Loss: 0.15550854802131653
Epoch 50, Loss: 0.12664391100406647
Epoch 60, Loss: 0.10489054024219513
Epoch 70, Loss: 0.09095268696546555
Epoch 80, Loss: 0.080286405980587
Epoch 90, Loss: 0.07306680083274841
Epoch 100, Loss: 0.0676325261592865


# Step - 6: Evaluation of the Model

In [9]:
# Evaluate the model
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    predictions = (test_outputs > 0.5).float()
    accuracy = accuracy_score(y_test_tensor.numpy(), predictions.numpy())
    print(f"Neural Network Accuracy: {accuracy:.2f}")

Neural Network Accuracy: 0.95


# Step - 7: Building Secondary Learners

In [10]:
# Train Machine Learning Models
rf = RandomForestClassifier(n_estimators=50)
gb = GradientBoostingClassifier(n_estimators=50)
ada = AdaBoostClassifier(n_estimators=50)

In [11]:
rf.fit(X_train, y_train)
gb.fit(X_train, y_train)
ada.fit(X_train, y_train)

AdaBoostClassifier()

In [12]:
# Predictions
rf_pred = rf.predict(X_test)
gb_pred = gb.predict(X_test)
ada_pred = ada.predict(X_test)

In [13]:
# Average the predictions (Ensemble)
ensemble_pred = (rf_pred + gb_pred + ada_pred) / 3
ensemble_pred = np.round(ensemble_pred)  # Convert to binary

In [14]:
# Calculate accuracy
rf_acc = accuracy_score(y_test, rf_pred)
gb_acc = accuracy_score(y_test, gb_pred)
ada_acc = accuracy_score(y_test, ada_pred)
ensemble_acc = accuracy_score(y_test, ensemble_pred)

In [15]:
print(f"Random Forest Accuracy: {rf_acc:.2f}")
print(f"Gradient Boosting Accuracy: {gb_acc:.2f}")
print(f"AdaBoost Accuracy: {ada_acc:.2f}")
print(f"Ensemble Accuracy: {ensemble_acc:.2f}")

Random Forest Accuracy: 0.97
Gradient Boosting Accuracy: 0.97
AdaBoost Accuracy: 0.98
Ensemble Accuracy: 0.97


# Step - 8: Deployment and Analysis

In [17]:
torch.save(model.state_dict(), "../models/cancer_detector.pth")

In [18]:
import joblib

joblib.dump(rf, "../models/random_forest.pkl")
joblib.dump(gb, "../models/gradient_boosting.pkl")
joblib.dump(ada, "../models/adaboost.pkl")

['../models/adaboost.pkl']

In [19]:
rf = joblib.load("../models/random_forest.pkl")
gb = joblib.load("../models/gradient_boosting.pkl")
ada = joblib.load("../models/adaboost.pkl")

# Load the saved model
model = CancerNN()
model.load_state_dict(torch.load("../models/cancer_detector.pth"))
print(model.eval())  # Set to evaluation mode

print("All the models have been loaded")

CancerNN(
  (fc1): Linear(in_features=15, out_features=16, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=16, out_features=8, bias=True)
  (fc3): Linear(in_features=8, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)
All the models have been loaded


In [20]:
sample_data = pd.DataFrame({
    "GENDER": [1],  # M -> 1, F -> 0
    "AGE": [65],
    "SMOKING": [2],
    "YELLOW_FINGERS": [1],
    "ANXIETY": [1],
    "PEER_PRESSURE": [2],
    "CHRONIC_DISEASE": [1],
    "FATIGUE": [2],
    "ALLERGY": [1],
    "WHEEZING": [2],
    "ALCOHOL_CONSUMING": [2],
    "COUGHING": [1],
    "SHORTNESS_OF_BREATH": [2],
    "SWALLOWING_DIFFICULTY": [1],
    "CHEST_PAIN": [2]
})

# Normalize input using the saved scaler
sample_scaled = scaler.transform(sample_data)

# Convert to PyTorch tensor
sample_tensor = torch.tensor(sample_scaled, dtype=torch.float32)

# Predict using Neural Network
with torch.no_grad():
    nn_prediction = model(sample_tensor).item()
    nn_prediction = 1 if nn_prediction > 0.5 else 0  # Convert probability to class

# Predict using ML Models
rf_pred = rf.predict(sample_scaled)[0]
gb_pred = gb.predict(sample_scaled)[0]
ada_pred = ada.predict(sample_scaled)[0]

# Ensemble (average voting)
ensemble_pred = round((rf_pred + gb_pred + ada_pred) / 3)

print(f"Neural Network Prediction: {nn_prediction}")
print(f"Random Forest Prediction: {rf_pred}")
print(f"Gradient Boosting Prediction: {gb_pred}")
print(f"AdaBoost Prediction: {ada_pred}")
print(f"Ensemble Prediction (Final Decision): {ensemble_pred}")

Neural Network Prediction: 1
Random Forest Prediction: 1
Gradient Boosting Prediction: 1
AdaBoost Prediction: 1
Ensemble Prediction (Final Decision): 1


Feature names unseen at fit time:
- ALCOHOL_CONSUMING
- ALLERGY
- CHEST_PAIN
- CHRONIC_DISEASE
- FATIGUE
- ...
Feature names seen at fit time, yet now missing:
- ALCOHOL CONSUMING
- ALLERGY 
- CHEST PAIN
- CHRONIC DISEASE
- FATIGUE 
- ...

