In [5]:
# Sheet 3, Question 1:

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# Load the breast-cancer dataset shipped with scikit-learn and split it into
# the feature matrix and the label vector.
data = load_breast_cancer()
X, y = data.data, data.target

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scaling lives inside the pipeline, so fitting the pipeline on the training
# split fits the scaler on that split as well.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(max_iter=2000)),
])
pipe.fit(X_train, y_train)

# Score the held-out rows.
y_pred = pipe.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)


Accuracy: 0.9736842105263158


In [6]:
#Sheet 3 Question 2:

import pandas as pd
import numpy as np




# Take row 0 of the fitted logistic-regression coefficients from the 'model'
# step of the pipeline, alongside the dataset's feature names.
coef = pipe.named_steps['model'].coef_[0]
features = data.feature_names

# Rank features by the absolute value of their coefficient (the sign is
# dropped), largest first.
importance_df = pd.DataFrame(
    {'feature': features, 'importance': np.abs(coef)}
).sort_values(by='importance', ascending=False)

print(importance_df.head(10))


                 feature  importance
21         worst texture    1.350606
10          radius error    1.268178
28        worst symmetry    1.208200
7    mean concave points    1.119804
26       worst concavity    0.943053
13            area error    0.907186
20          worst radius    0.879840
23            worst area    0.841846
6         mean concavity    0.801458
27  worst concave points    0.778217


In [2]:
# Sheet 4: Question 4

from sklearn.datasets import load_breast_cancer
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Wrap the features in a DataFrame so column names are available for the
# importance tables below.
data = load_breast_cancer()

X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


def fit_and_report(title, model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and print its accuracies.

    Parameters
    ----------
    title : str
        Heading printed above the two accuracy lines.
    model : estimator
        Unfitted classifier exposing ``fit`` and ``score``.
    X_train, X_test, y_train, y_test
        The four pieces returned by ``train_test_split``.

    Returns
    -------
    tuple
        ``(train_accuracy, test_accuracy)`` as returned by ``model.score``.
    """
    model.fit(X_train, y_train)
    train_acc = model.score(X_train, y_train)
    test_acc = model.score(X_test, y_test)
    print(title)
    print("Training Accuracy:", train_acc)
    print("Test Accuracy:", test_acc)
    return train_acc, test_acc


def rank_importances(model, feature_names):
    """Return a feature/importance table for a fitted model, largest first.

    ``model`` must expose ``feature_importances_``; ``feature_names`` supplies
    the matching labels in the same order.
    """
    return pd.DataFrame({
        "feature": feature_names,
        "importance": model.feature_importances_
    }).sort_values("importance", ascending=False)


# --- Decision tree -----------------------------------------------------------
dt = DecisionTreeClassifier(random_state=42)
dt_train_acc, dt_test_acc = fit_and_report(
    "Decision Tree", dt, X_train, X_test, y_train, y_test
)
print("-" * 40)

# --- Random forest -----------------------------------------------------------
rf = RandomForestClassifier(random_state=42)
rf_train_acc, rf_test_acc = fit_and_report(
    "Random Forest", rf, X_train, X_test, y_train, y_test
)

rf_importances = rank_importances(rf, X.columns)

print("\nTop 5 Features (Random Forest):")
print(rf_importances.head(5))
print("-" * 40)

# --- Gradient boosting -------------------------------------------------------
gb = GradientBoostingClassifier(random_state=42)
gb_train_acc, gb_test_acc = fit_and_report(
    "Gradient Boosting", gb, X_train, X_test, y_train, y_test
)

gb_importances = rank_importances(gb, X.columns)

print("\nTop 5 Features (Gradient Boosting):")
print(gb_importances.head(5))
print("-" * 40)

# --- Side-by-side test accuracy ----------------------------------------------
print("Model Comparison:")
print("Decision Tree Test Accuracy:", dt_test_acc)
print("Random Forest Test Accuracy:", rf_test_acc)
print("Gradient Boosting Test Accuracy:", gb_test_acc)


Decision Tree
Training Accuracy: 1.0
Test Accuracy: 0.9473684210526315
----------------------------------------
Random Forest
Training Accuracy: 1.0
Test Accuracy: 0.9649122807017544

Top 5 Features (Random Forest):
                 feature  importance
23            worst area    0.153892
27  worst concave points    0.144663
7    mean concave points    0.106210
20          worst radius    0.077987
6         mean concavity    0.068001
----------------------------------------
Gradient Boosting
Training Accuracy: 1.0
Test Accuracy: 0.956140350877193

Top 5 Features (Gradient Boosting):
                 feature  importance
7    mean concave points    0.450528
27  worst concave points    0.240103
20          worst radius    0.075589
22       worst perimeter    0.051408
21         worst texture    0.039886
----------------------------------------
Model Comparison:
Decision Tree Test Accuracy: 0.9473684210526315
Random Forest Test Accuracy: 0.9649122807017544
Gradient Boosting Test Accuracy: 0.956140350877193

In [4]:
# Sheet 4 Assignment

from sklearn.datasets import load_breast_cancer
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Features as a labelled DataFrame (column names feed the importance tables),
# target as the raw label array.
data = load_breast_cancer()
X, y = pd.DataFrame(data.data, columns=data.feature_names), data.target

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Task 1: Decision Tree (Full & Pruned)
print(" Task 1: Decision Tree: ")

dt_full = DecisionTreeClassifier(random_state=42)
dt_pruned = DecisionTreeClassifier(max_depth=3, random_state=42)  # depth capped at 3

# Fit each tree and report its train/test accuracy under its own heading.
for heading, tree in (
    ("Full Decision Tree", dt_full),
    ("\nPruned Decision Tree (max_depth=3)", dt_pruned),
):
    tree.fit(X_train, y_train)
    print(heading)
    print("Train Accuracy:", tree.score(X_train, y_train))
    print("Test Accuracy:", tree.score(X_test, y_test))
print("-"*50)

# Task 2: Random Forest
print("Task 2: Random Forest (100 trees) :")
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
print("Train Accuracy:", rf.score(X_train, y_train))
print("Test Accuracy:", rf.score(X_test, y_test))

# Feature/importance table, largest importance first.
rf_importances = pd.DataFrame(
    {"feature": X.columns, "importance": rf.feature_importances_}
)
rf_importances = rf_importances.sort_values("importance", ascending=False)

print("\nTop 5 Features (Random Forest):")
print(rf_importances.head(5))
print("-"*50)

# Task 3: Gradient Boosting (default + parameter tuning)
print(" Task 3: Gradient Boosting :")
learning_rates = [0.01, 0.1]
n_estimators_list = [50, 100, 200]

# Every (learning rate, tree count) pair, learning rate varying slowest.
grid = [(rate, count) for rate in learning_rates for count in n_estimators_list]
for lr, n_est in grid:
    gb = GradientBoostingClassifier(
        learning_rate=lr, n_estimators=n_est, random_state=42
    )
    gb.fit(X_train, y_train)
    train_acc = gb.score(X_train, y_train)
    test_acc = gb.score(X_test, y_test)
    print(f"learning_rate={lr}, n_estimators={n_est} -> Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}")

# A separate model with library-default hyperparameters supplies the
# importance table and the Task 4 comparison.
gb_default = GradientBoostingClassifier(random_state=42)
gb_default.fit(X_train, y_train)
gb_importances = pd.DataFrame(
    {"feature": X.columns, "importance": gb_default.feature_importances_}
)
gb_importances = gb_importances.sort_values("importance", ascending=False)

print("\nTop 5 Features (Gradient Boosting):")
print(gb_importances.head(5))
print("-"*50)

# Task 4
print(" Task 4:")
for label, fitted in (
    ("Decision Tree Full Test Acc:", dt_full),
    ("Decision Tree Pruned Test Acc:", dt_pruned),
    ("Random Forest Test Acc:", rf),
    ("Gradient Boosting (default) Test Acc:", gb_default),
):
    print(label, fitted.score(X_test, y_test))


 Task 1: Decision Tree: 
Full Decision Tree
Train Accuracy: 1.0
Test Accuracy: 0.9473684210526315

Pruned Decision Tree (max_depth=3)
Train Accuracy: 0.978021978021978
Test Accuracy: 0.9473684210526315
--------------------------------------------------
Task 2: Random Forest (100 trees) :
Train Accuracy: 1.0
Test Accuracy: 0.9649122807017544

Top 5 Features (Random Forest):
                 feature  importance
23            worst area    0.153892
27  worst concave points    0.144663
7    mean concave points    0.106210
20          worst radius    0.077987
6         mean concavity    0.068001
--------------------------------------------------
 Task 3: Gradient Boosting :
learning_rate=0.01, n_estimators=50 -> Train Acc: 0.9780, Test Acc: 0.9561
learning_rate=0.01, n_estimators=100 -> Train Acc: 0.9868, Test Acc: 0.9561
learning_rate=0.01, n_estimators=200 -> Train Acc: 0.9934, Test Acc: 0.9561
learning_rate=0.1, n_estimators=50 -> Train Acc: 1.0000, Test Acc: 0.9561
learning_rate=0.1, n_