<a href="https://colab.research.google.com/github/Pawandeep786/MachineLearning_Algorithms_Pawandeep786/blob/main/DT_PCA_LungCancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [63]:
import pandas as pd

# Read the lung-cancer CSV that sits next to the notebook, then show the first
# rows followed by the column index as a quick sanity check.
df = pd.read_csv("Lung_Cancer__dataset.csv")
print(df.head(), df.columns, sep="\n")



     Name      Surname  Age  Smokes  AreaQ  Alkhol  Result
0    John         Wick   35       3      5       4       1
1    John  Constantine   27      20      2       5       1
2  Camela     Anderson   30       0      5       2       0
3    Alex       Telles   28       0      8       1       0
4   Diego     Maradona   68       4      5       6       1
Index(['Name', 'Surname', 'Age', 'Smokes', 'AreaQ', 'Alkhol', 'Result'], dtype='object')


In [64]:
# Step 1: Load data (the file name must match the file you uploaded)
df = pd.read_csv("Lung_Cancer__dataset.csv")

# Quick check: frame shape, column names, and missing values per column
print(df.shape, list(df.columns), df.isna().sum(), sep="\n")

# Rich preview of the first rows (last expression of the cell)
df.head(5)


(59, 7)
['Name', 'Surname', 'Age', 'Smokes', 'AreaQ', 'Alkhol', 'Result']
Name       0
Surname    0
Age        0
Smokes     0
AreaQ      0
Alkhol     0
Result     0
dtype: int64


Unnamed: 0,Name,Surname,Age,Smokes,AreaQ,Alkhol,Result
0,John,Wick,35,3,5,4,1
1,John,Constantine,27,20,2,5,1
2,Camela,Anderson,30,0,5,2,0
3,Alex,Telles,28,0,8,1,0
4,Diego,Maradona,68,4,5,6,1


In [65]:
# Show the column labels as a plain Python list.
print(list(df.columns))


['Name', 'Surname', 'Age', 'Smokes', 'AreaQ', 'Alkhol', 'Result']


In [66]:
# Step 2: identify columns
# Build the feature matrix explicitly so this cell runs on a fresh kernel:
# `Name` / `Surname` are person identifiers and `Result` is the 0/1 label,
# so only Age, Smokes, AreaQ and Alkhol remain as predictors.
X = df.drop(columns=["Name", "Surname", "Result"])

# "number" is pandas' own selector for every numeric dtype, so no numpy
# import is needed just to split the columns by kind.
numeric_cols = X.select_dtypes(include="number").columns.tolist()
categorical_cols = X.select_dtypes(exclude="number").columns.tolist()
numeric_cols, categorical_cols


(['Age', 'Smokes', 'AreaQ', 'Alkhol'], [])

In [67]:
# Step 5: Baseline DT pipeline
# NOTE(review): `preprocess`, X_train / X_test / y_train / y_test and the
# sklearn names used below are expected to exist already; no visible cell
# creates them (Steps 3-4 are absent) — confirm before Restart & Run All.
dt_baseline = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("clf", DecisionTreeClassifier(random_state=42)),
    ]
)

# Fit on the training split, then predict the held-out split.
y_pred_base = dt_baseline.fit(X_train, y_train).predict(X_test)

# Keyword arguments shared by the three class-averaged scorers.
weighted_kwargs = {"average": "weighted", "zero_division": 0}

print("=== Baseline Decision Tree ===")
print("Accuracy :", accuracy_score(y_test, y_pred_base))
for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1-score :", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred_base, **weighted_kwargs))

baseline_cm = confusion_matrix(y_test, y_pred_base)
baseline_report = classification_report(y_test, y_pred_base, zero_division=0)
print("\nConfusion Matrix:\n", baseline_cm)
print("\nClassification Report:\n", baseline_report)


=== Baseline Decision Tree ===
Accuracy : 1.0
Precision: 1.0
Recall   : 1.0
F1-score : 1.0

Confusion Matrix:
 [[6 0]
 [0 6]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00         6

    accuracy                           1.00        12
   macro avg       1.00      1.00      1.00        12
weighted avg       1.00      1.00      1.00        12



In [68]:
# Step 6: Baseline feature importance (approx)
# Pair every post-preprocessing column name with the importance score that the
# fitted baseline tree assigned to it.
baseline_steps = dt_baseline.named_steps
feature_names = baseline_steps["preprocess"].get_feature_names_out()
importances = baseline_steps["clf"].feature_importances_

# Highest importance first; keep at most the top 10 rows.
fi = (
    pd.DataFrame({"feature": feature_names, "importance": importances})
    .sort_values(by="importance", ascending=False)
    .head(10)
)
fi


Unnamed: 0,feature,importance
3,num__Alkhol,0.792048
2,num__AreaQ,0.143861
0,num__Age,0.064091
1,num__Smokes,0.0


In [69]:
# Step 7: PCA + DT pipeline
# NOTE(review): `preprocess`, the train/test split and the sklearn names used
# below are expected to exist already; no visible cell creates them — confirm.
# NOTE(review): this is the same `preprocess` object that the baseline pipeline
# was built with.
dt_with_pca = Pipeline(
    steps=[
        ("preprocess", preprocess),
        # A fractional n_components asks PCA for enough components to reach
        # that share of explained variance (95% here).
        ("pca", PCA(n_components=0.95, random_state=42)),
        ("clf", DecisionTreeClassifier(random_state=42)),
    ]
)

# Fit on the training split, then predict the held-out split.
y_pred_pca = dt_with_pca.fit(X_train, y_train).predict(X_test)

# Keyword arguments shared by the three class-averaged scorers.
pca_weighted_kwargs = {"average": "weighted", "zero_division": 0}

print("=== Decision Tree WITH PCA (>=95% variance) ===")
print("Accuracy :", accuracy_score(y_test, y_pred_pca))
for pca_metric_label, pca_metric_fn in (
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1-score :", f1_score),
):
    print(pca_metric_label, pca_metric_fn(y_test, y_pred_pca, **pca_weighted_kwargs))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_pca))

# Confirm how many components were kept and how much variance they retain.
pca_step = dt_with_pca.named_steps["pca"]
retained_variance = round(pca_step.explained_variance_ratio_.sum(), 4)
print("\n#Components:", pca_step.n_components_)
print("Explained variance retained:", retained_variance)


=== Decision Tree WITH PCA (>=95% variance) ===
Accuracy : 0.9166666666666666
Precision: 0.9285714285714285
Recall   : 0.9166666666666666
F1-score : 0.9160839160839161

Confusion Matrix:
 [[5 1]
 [0 6]]

#Components: 4
Explained variance retained: 1.0


In [70]:
# Step 8: Comparison summary (easy to copy)
def _weighted_summary(y_true, y_hat):
    """Return accuracy plus weighted precision / recall / F1 for one model.

    The dict keys and their insertion order are the ones printed in the
    comparison lines below.
    """
    shared = {"average": "weighted", "zero_division": 0}
    return {
        "Accuracy": accuracy_score(y_true, y_hat),
        "Precision(w)": precision_score(y_true, y_hat, **shared),
        "Recall(w)": recall_score(y_true, y_hat, **shared),
        "F1(w)": f1_score(y_true, y_hat, **shared),
    }


# Same held-out labels for both models; only the predictions differ.
base = _weighted_summary(y_test, y_pred_base)
pca = _weighted_summary(y_test, y_pred_pca)

print("Baseline DT:", base)
print("DT + PCA   :", pca)


Baseline DT: {'Accuracy': 1.0, 'Precision(w)': 1.0, 'Recall(w)': 1.0, 'F1(w)': 1.0}
DT + PCA   : {'Accuracy': 0.9166666666666666, 'Precision(w)': 0.9285714285714285, 'Recall(w)': 0.9166666666666666, 'F1(w)': 0.9160839160839161}
