In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Load dataset from CSV
df = pd.read_csv("/content/lung_cancer_examples.csv")

# Optional: Display first few rows
print(df.head())

# Drop non-numeric/non-feature columns
df = df.drop(columns=["Name", "Surname"])

# Separate features and target
X = df.drop("Result", axis=1)
y = df["Result"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train models
lr = LogisticRegression()
rf = RandomForestClassifier(random_state=42)
xgb = XGBClassifier(use_label_encoder=False, eval_metric="logloss")

lr.fit(X_train_scaled, y_train)
rf.fit(X_train_scaled, y_train)
xgb.fit(X_train_scaled, y_train)

# Evaluate each model
for model, name in zip([lr, rf, xgb], ["Logistic Regression", "Random Forest", "XGBoost"]):
    y_pred = model.predict(X_test_scaled)
    print(f"\n{name} Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print(classification_report(y_test, y_pred))

# Save best model and scaler
joblib.dump(rf, "/content/lung_cancer_model.pkl")
joblib.dump(scaler, "/content/scaler.pkl")
print("✅ Model and scaler saved to /content/")


     Name      Surname  Age  Smokes  AreaQ  Alkhol  Result
0    John         Wick   35       3      5       4       1
1    John  Constantine   27      20      2       5       1
2  Camela     Anderson   30       0      5       2       0
3    Alex       Telles   28       0      8       1       0
4   Diego     Maradona   68       4      5       6       1

Logistic Regression Accuracy: 0.92
              precision    recall  f1-score   support

           0       0.89      1.00      0.94         8
           1       1.00      0.75      0.86         4

    accuracy                           0.92        12
   macro avg       0.94      0.88      0.90        12
weighted avg       0.93      0.92      0.91        12


Random Forest Accuracy: 0.92
              precision    recall  f1-score   support

           0       0.89      1.00      0.94         8
           1       1.00      0.75      0.86         4

    accuracy                           0.92        12
   macro avg       0.94      0.88  

Parameters: { "use_label_encoder" } are not used.



In [2]:
# Load model and scaler
model = joblib.load("/content/lung_cancer_model.pkl")
scaler = joblib.load("/content/scaler.pkl")

# Example new patient input: [Age, Smokes, AreaQ, Alkhol]
new_data = [[40, 10, 6, 3]]

# Scale and predict
scaled = scaler.transform(new_data)
prediction = model.predict(scaled)

print("Lung Cancer Risk (1 = High, 0 = Low):", prediction[0])


Lung Cancer Risk (1 = High, 0 = Low): 0




In [3]:
from sklearn.ensemble import VotingClassifier
from google.colab import files

# ✅ Create soft voting ensemble
ensemble = VotingClassifier(
    estimators=[
        ('lr', lr),
        ('rf', rf),
        ('xgb', xgb)
    ],
    voting='soft'
)

# ✅ Fit the ensemble model
ensemble.fit(X_train_scaled, y_train)

# ✅ Evaluate ensemble
ensemble_pred = ensemble.predict(X_test_scaled)
print("\nEnsemble Accuracy:", accuracy_score(y_test, ensemble_pred))
print(classification_report(y_test, ensemble_pred))

# ✅ Save only the ensemble model
joblib.dump(ensemble, "/content/ensemble_model.pkl")
print("✅ Ensemble model saved at /content/ensemble_model.pkl")

# ✅ Download the model
files.download("/content/ensemble_model.pkl")


Parameters: { "use_label_encoder" } are not used.




Ensemble Accuracy: 0.9166666666666666
              precision    recall  f1-score   support

           0       0.89      1.00      0.94         8
           1       1.00      0.75      0.86         4

    accuracy                           0.92        12
   macro avg       0.94      0.88      0.90        12
weighted avg       0.93      0.92      0.91        12

✅ Ensemble model saved at /content/ensemble_model.pkl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [5]:
ensemble_model = joblib.load("/content/ensemble_model.pkl")
scaler = joblib.load("/content/scaler.pkl")

new_data = [[50, 5, 6, 2]]
scaled = scaler.transform(new_data)
prediction = ensemble_model.predict(scaled)

print("Lung Cancer Risk (1 = High, 0 = Low):", prediction[0])

Lung Cancer Risk (1 = High, 0 = Low): 0


