In [1]:
from pyspark.sql import SparkSession

# Create Spark session
spark = SparkSession.builder.appName("LungCancerPreprocessing").getOrCreate()

# Load the dataset
file_path = "/content/Lung_Cancer.csv"
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Show schema and a few records
df.printSchema()
df.show(5)


root
 |-- id: integer (nullable = true)
 |-- age: double (nullable = true)
 |-- gender: string (nullable = true)
 |-- country: string (nullable = true)
 |-- diagnosis_date: date (nullable = true)
 |-- cancer_stage: string (nullable = true)
 |-- family_history: string (nullable = true)
 |-- smoking_status: string (nullable = true)
 |-- bmi: double (nullable = true)
 |-- cholesterol_level: integer (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- asthma: integer (nullable = true)
 |-- cirrhosis: integer (nullable = true)
 |-- other_cancer: integer (nullable = true)
 |-- treatment_type: string (nullable = true)
 |-- end_treatment_date: date (nullable = true)
 |-- survived: integer (nullable = true)

+---+----+------+-----------+--------------+------------+--------------+--------------+----+-----------------+------------+------+---------+------------+--------------+------------------+--------+
| id| age|gender|    country|diagnosis_date|cancer_stage|family_history|smoking

In [None]:
# Step 1: Spark Session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("LungCancerPreprocessing").getOrCreate()

# Step 2: Load Dataset
df = spark.read.csv("/content/Lung_Cancer.csv", header=True, inferSchema=True)

# Step 3: Basic Cleanup
df = df.dropna()

# Step 4: String Indexing for Categorical Columns
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

# Identify categorical columns (excluding target)
categorical_cols = [col for col, dtype in df.dtypes if dtype == 'string' and col != "survived"]

# Create StringIndexers for each categorical column with '_indexed' suffix
indexers = [StringIndexer(inputCol=col, outputCol=col+"_indexed") for col in categorical_cols]
pipeline = Pipeline(stages=indexers)
df_indexed = pipeline.fit(df).transform(df)

# Step 5: Select and Rename Proper Columns
# Drop original string columns, keep numeric and indexed ones
indexed_cols = [col+"_indexed" for col in categorical_cols]
non_categorical_cols = [col for col in df.columns if col not in categorical_cols]

# Combine everything for export
final_cols = non_categorical_cols + indexed_cols

# Select only those columns
df_cleaned = df_indexed.select(*final_cols)

# Step 6: Encode Target Column
from pyspark.ml.feature import StringIndexer

if "survived" in df.columns:
    label_indexer = StringIndexer(inputCol="survived", outputCol="label")
    df_final = label_indexer.fit(df_cleaned).transform(df_cleaned)
else:
    raise Exception("❌ 'survived' column not found in the dataset!")

# Step 7: Undersampling to fix class imbalance
label_counts = df_final.groupBy("label").count().collect()
min_count = min([row["count"] for row in label_counts])

balanced_df = None

for row in label_counts:
    cls_df = df_final.filter(df_final["label"] == row["label"]).limit(min_count)
    balanced_df = cls_df if balanced_df is None else balanced_df.union(cls_df)

# Step 8: Reorder Columns as per your request
desired_order = [
    "id", "age", "bmi", "cholesterol_level", "hypertension", "asthma", "cirrhosis", "other_cancer",
    "gender_indexed", "country_indexed", "cancer_stage_indexed", "family_history_indexed",
    "smoking_status_indexed", "treatment_type_indexed", "survived"
]

# Step 9: Export Final CSV with Required Format
balanced_df.select(*desired_order) \
    .write.mode("overwrite") \
    .option("header", "true") \
    .csv("lung_cancer_balanced_output")


In [3]:
!pip install xgboost




In [None]:
import pandas as pd
import glob
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# 🔹 Step 1: Load all CSV files generated by Spark
csv_files = glob.glob("/content/lung_cancer_ml_ready_csv/*.csv")

# 🔹 Step 2: Read and combine them into one DataFrame
df_list = [pd.read_csv(f) for f in csv_files]
df = pd.concat(df_list, ignore_index=True)




#---------EDA----------------------#
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# 🔎 Basic Info
print("🧾 Shape of dataset:", df.shape)
print("\n📋 Column types:\n", df.dtypes)
print("\n🔍 Missing values:\n", df.isnull().sum())

# 📈 Distribution of target variable
print("\n🎯 Label Distribution:")
print(df['label'].value_counts(normalize=True))

plt.figure(figsize=(6,4))
sns.countplot(x='label', data=df)
plt.title("Distribution of Target Label (Survived)")
plt.show()

# 📊 Boxplots to check outliers
plt.figure(figsize=(12, 6))
df.iloc[:, :10].boxplot()
plt.title("📦 Boxplot of First 10 Features")
plt.xticks(rotation=45)
plt.show()

# 📉 Correlation Matrix
plt.figure(figsize=(15, 10))
corr = df.corr()
sns.heatmap(corr, cmap='coolwarm', annot=False)
plt.title("🔗 Correlation Heatmap")
plt.show()

# 📊 Histogram for a few features
df.iloc[:, :6].hist(figsize=(12, 8), bins=20)
plt.suptitle("📊 Feature Distributions", fontsize=16)
plt.show()


# 🔹 Step 3: Separate Features and Labels
X = df.drop("label", axis=1)
y = df["label"]

# 🔹 Step 4: Split into Train & Test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 🔹 Step 5: Train Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)

print("📊 Random Forest Classification Report:\n")
print(classification_report(y_test, rf_preds))
print("✅ Random Forest Accuracy:", round(accuracy_score(y_test, rf_preds), 4))

# 🔹 Step 6: Train XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)

print("\n📊 XGBoost Classification Report:\n")
print(classification_report(y_test, xgb_preds))
print("✅ XGBoost Accuracy:", round(accuracy_score(y_test, xgb_preds), 4))


🔍 Pehle Performance samjho:
✅ Accuracy:
Random Forest: 85.94%

XGBoost: 85.86%

📈 Classification Report ke hisaab se:
Class 0 (No Cancer):

Precision: 0.78 → Prediction mein thoda galti ho raha hai.

Recall: 1.00 → Matlab jo actual non-cancer patients the, unko correctly pakad liya.

Class 1 (Cancer):

Precision: 1.00 → Cancer bol diya to pakka cancer hai.

Recall: 0.72 → Matlab 28% actual cancer cases chhoot ja rahe hain.

In [None]:
# 🔸 Save trained Random Forest model
import pickle

with open("basic_random_forest_model.pkl", "wb") as f:
    pickle.dump(rf_model, f)

# 🔸 Save trained XGBoost model
with open("basic_xgboost_model.pkl", "wb") as f:
    pickle.dump(xgb_model, f)


In [None]:
import pandas as pd
import glob
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score

# 🔹 Step 1: Load all CSV files generated by Spark
csv_files = glob.glob("/content/lung_cancer_ml_ready_csv/*.csv")

# 🔹 Step 2: Read and combine them into one DataFrame
df_list = [pd.read_csv(f) for f in csv_files]
df = pd.concat(df_list, ignore_index=True)

# 🔹 Step 3: Separate Features and Labels
X = df.drop("label", axis=1)
y = df["label"]

# 🔹 Step 4: Split into Train & Test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# -----------------------------------------------
# 🔹 Step 5: Random Forest Hyperparameter Tuning
# -----------------------------------------------
param_grid_rf = {
    'n_estimators': [100, 300, 500],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'class_weight': ['balanced', None]
}

rf_random = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_grid_rf,
    n_iter=10,
    scoring='f1_macro',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

rf_random.fit(X_train, y_train)
best_rf = rf_random.best_estimator_
rf_preds = best_rf.predict(X_test)

print("📊 Random Forest (Tuned) Classification Report:\n")
print(classification_report(y_test, rf_preds))
print("✅ Random Forest Accuracy:", round(accuracy_score(y_test, rf_preds), 4))

# -----------------------------------------------
# 🔹 Step 6: XGBoost Hyperparameter Tuning
# -----------------------------------------------
param_grid_xgb = {
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 7, 10],
    'n_estimators': [100, 300, 500],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

xgb_random = RandomizedSearchCV(
    XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    param_distributions=param_grid_xgb,
    n_iter=10,
    scoring='f1_macro',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

xgb_random.fit(X_train, y_train)
best_xgb = xgb_random.best_estimator_
xgb_preds = best_xgb.predict(X_test)

print("\n📊 XGBoost (Tuned) Classification Report:\n")
print(classification_report(y_test, xgb_preds))
print("✅ XGBoost Accuracy:", round(accuracy_score(y_test, xgb_preds), 4))


In [None]:
import pickle

# ✅ Save model using 'wb' and pickle.dump()
#rand
with open("best_random_forest_model.pkl", "wb") as f:
    pickle.dump(best_rf, f)

with open("best_xgboost_model.pkl", "wb") as f:
    pickle.dump(best_xgb, f)


In [None]:
import pandas as pd
import joblib
from sklearn.preprocessing import OneHotEncoder

# Step 1: Load your cleaned dataset
df = pd.read_csv("resampled_lung_cancer_data.csv")  # CSV you got from PySpark

# Step 2: Split features and target
X = df.drop("Lung_Cancer", axis=1)
y = df["Lung_Cancer"]

# Step 3: OneHotEncoder for categorical columns
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()

encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
X_encoded = encoder.fit_transform(X[categorical_cols])

# Save encoder as preprocessing pipeline
joblib.dump(encoder, "preprocessing_pipeline.pkl")

# Save feature names after encoding
encoded_feature_names = encoder.get_feature_names_out(categorical_cols)
joblib.dump(encoded_feature_names.tolist(), "feature_names.pkl")

print("✅ preprocessing_pipeline.pkl and feature_names.pkl saved.")
