In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

1. Write a program that demonstrates the advantage of ensemble learning compared to a
single classifier.
Apply a Decision Tree and a Random Forest (RF) classifier on a given dataset.
Compare their performance using evaluation metrics such as accuracy, precision, recall, and
F1-score.
Explore the effect of changing the number of estimators (decision trees) in Random Forest.

In [3]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

wine = load_wine()
X_train, X_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size=0.2,random_state=42)
print("Training a single Decision Tree...")
tree_classifier = DecisionTreeClassifier(random_state=42)

tree_classifier.fit(X_train, y_train)

tree_predictions = tree_classifier.predict(X_test)

tree_accuracy = accuracy_score(y_test, tree_predictions)
print(f"Single Decision Tree Accuracy: {tree_accuracy * 100:.2f}%")


print("Training a Random Forest (an ensemble of 100 trees)...")
forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

forest_classifier.fit(X_train, y_train)
forest_predictions = forest_classifier.predict(X_test)
forest_accuracy = accuracy_score(y_test, forest_predictions)
print(f"Random Forest (Ensemble) Accuracy: {forest_accuracy * 100:.2f}%")
print(f"forest_accuracy -> {forest_accuracy:.4f}, tree_accuracy->{tree_accuracy:.4f}")


Data split: 142 training samples, 36 testing samples.

Training a single Decision Tree...
Single Decision Tree Accuracy: 94.44%

Training a Random Forest (an ensemble of 100 trees)...
Random Forest (Ensemble) Accuracy: 100.00%
forest_accuracy -> 1.0000, tree_accuracy->0.9444


2. Write a program that demonstrates the use of simple ensemble techniques: Max Voting,
Average Voting, and Weighted Average Voting (assign weights based on each model’s
performance).

In [11]:
import numpy as np
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

wine = load_wine()
X_train, X_test, y_train, y_test = train_test_split(wine.data,wine.target, test_size=0.2,random_state=42)

dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(n_estimators=5, random_state=42) # A small forest
lr = LogisticRegression(random_state=42, max_iter=10000)

dt.fit(X_train, y_train)
rf.fit(X_train, y_train)
lr.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)
y_pred_rf = rf.predict(X_test)
y_pred_lr = lr.predict(X_test)

predictions = np.array([y_pred_dt, y_pred_rf, y_pred_lr]).T

y_pred_max = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=predictions)


y_prob_dt = dt.predict_proba(X_test)
y_prob_rf = rf.predict_proba(X_test)
y_prob_lr = lr.predict_proba(X_test)


y_prob_avg = (y_prob_dt + y_prob_rf + y_prob_lr) / 3.0

y_pred_avg = np.argmax(y_prob_avg, axis=1)

acc_dt = accuracy_score(y_test, y_pred_dt)
acc_rf = accuracy_score(y_test, y_pred_rf)
acc_lr = accuracy_score(y_test, y_pred_lr)

weights = np.array([acc_dt, acc_rf, acc_lr])

normalized_weights = weights / weights.sum()

y_prob_weighted = (y_prob_dt * normalized_weights[0] + y_prob_rf * normalized_weights[1] + y_prob_lr * normalized_weights[2])

y_pred_weighted = np.argmax(y_prob_weighted, axis=1)

def evaluate_model(name, y_true, y_pred):
    print(f"\n--- {name} ---")
    print(f"Accuracy : {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred, average='weighted', zero_division=0):.4f}")
    print(f"Recall   : {recall_score(y_true, y_pred, average='weighted', zero_division=0):.4f}")
    print(f"F1-score : {f1_score(y_true, y_pred, average='weighted', zero_division=0):.4f}")

print("\n Model Evaluations ")

# Evaluate all our models
evaluate_model("Decision Tree (Single)", y_test, y_pred_dt)
evaluate_model("Random Forest (5 Trees)", y_test, y_pred_rf)
evaluate_model("Logistic Regression", y_test, y_pred_lr)
evaluate_model("Ensemble: Max Voting", y_test, y_pred_max)
evaluate_model("Ensemble: Average Voting", y_test, y_pred_avg)
evaluate_model("Ensemble: Weighted Average", y_test, y_pred_weighted)



 Model Evaluations 

--- Decision Tree (Single) ---
Accuracy : 0.9444
Precision: 0.9463
Recall   : 0.9444
F1-score : 0.9440

--- Random Forest (5 Trees) ---
Accuracy : 0.8889
Precision: 0.8889
Recall   : 0.8889
F1-score : 0.8889

--- Logistic Regression ---
Accuracy : 1.0000
Precision: 1.0000
Recall   : 1.0000
F1-score : 1.0000

--- Ensemble: Max Voting ---
Accuracy : 0.9722
Precision: 0.9741
Recall   : 0.9722
F1-score : 0.9718

--- Ensemble: Average Voting ---
Accuracy : 0.9722
Precision: 0.9741
Recall   : 0.9722
F1-score : 0.9722

--- Ensemble: Weighted Average ---
Accuracy : 0.9722
Precision: 0.9741
Recall   : 0.9722
F1-score : 0.9722


3.Write a program to show the difference between Hard Voting and Soft Voting classifiers in
ensemble learning using multiple base learners (e.g., Decision Tree, Logistic Regression, and
KNN)

In [14]:
import numpy as np
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

wine = load_wine()
X_train, X_test, y_train, y_test = train_test_split(wine.data,wine.target, test_size=0.3,random_state=42)

print(f"Using {len(X_train)} samples for training and {len(X_test)} for testing.\n")

model_1 = LogisticRegression(random_state=42, max_iter=10000)
model_2 = DecisionTreeClassifier(random_state=42)
model_3 = KNeighborsClassifier(n_neighbors=5)

hard_voting_clf = VotingClassifier(
    estimators=[('lr', model_1), ('dt', model_2), ('knn', model_3)],voting='hard')

print("Training the Hard Voting classifier...")
hard_voting_clf.fit(X_train, y_train)
y_pred_hard = hard_voting_clf.predict(X_test)
acc_hard = accuracy_score(y_test, y_pred_hard)

print(f"Hard Voting Accuracy: {acc_hard * 100:.2f}%")


soft_voting_clf = VotingClassifier(
    estimators=[('lr', model_1),('dt', model_2), ('knn', model_3)],voting='soft')

print("\nTraining the Soft Voting classifier...")
soft_voting_clf.fit(X_train, y_train)
y_pred_soft = soft_voting_clf.predict(X_test)
acc_soft = accuracy_score(y_test, y_pred_soft)

print(f"Soft Voting Accuracy: {acc_soft * 100:.2f}%")

print("\n--- Comparison ---")
print(f"Hard Voting (Max Votes) : {acc_hard * 100:.2f}%")
print(f"Soft Voting (Avg Probs): {acc_soft * 100:.2f}%")


Hard Voting Accuracy: 100.00%
Soft Voting Accuracy: 98.15%

--- Comparison ---
Hard Voting (Max Votes) : 100.00%
Soft Voting (Avg Probs): 98.15%


4. Write a program using the RandomForestRegressor model to make predictions on a
suitable regression dataset.
Enable and observe the oob_score (Out-of-Bag score) parameter.
Interpret the results and explain its significance.

In [26]:
import numpy as np
from sklearn.datasets import load_diabetes  # <-- CHANGED: Using the Diabetes dataset
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

diabetes = load_diabetes() 
X, y = diabetes.data, diabetes.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

rf_model = RandomForestRegressor(n_estimators=100,random_state=42,oob_score=True)
rf_model.fit(X_train, y_train)
oob_score = rf_model.oob_score_
print(f"\nModel's Out-of-Bag (OOB) Score: {oob_score:.4f}")

y_pred = rf_model.predict(X_test)

test_score = r2_score(y_test, y_pred)
print(f"Model's Test Set Score        : {test_score:.4f}")
print("\n--- What does this mean? ---")
print(f"""
What is the OOB Score?
- A Random Forest is a "forest" of many decision trees.
- Each tree is trained on a random *sample* of the training data.
- This means each tree *misses* about 1/3 of the training data. This "left out" data is called its "Out-of-Bag" (OOB) data.
- The OOB Score is a "test" performed *during training*. For each data point, the model uses all the trees that *did not* see that point to make a prediction.
- It then compares all these "OOB predictions" to the true answers.

Significance:
- The OOB Score is a reliable estimate of how well the model will perform on *new, unseen data*.
- It's like a "free" test score you get without needing a separate test set.
- **Notice how our OOB Score ({oob_score:.4f}) is very close to our Test Set Score ({test_score:.4f}).**
- This closeness gives us confidence that our model is good at generalizing and isn't just "memorizing" the training data.
""")


Model's Out-of-Bag (OOB) Score: 0.4294
Model's Test Set Score        : 0.4703

--- What does this mean? ---

What is the OOB Score?
- A Random Forest is a "forest" of many decision trees.
- Each tree is trained on a random *sample* of the training data.
- This means each tree *misses* about 1/3 of the training data. This "left out" data is called its "Out-of-Bag" (OOB) data.
- The OOB Score is a "test" performed *during training*. For each data point, the model uses all the trees that *did not* see that point to make a prediction.
- It then compares all these "OOB predictions" to the true answers.

Significance:
- The OOB Score is a reliable estimate of how well the model will perform on *new, unseen data*.
- It's like a "free" test score you get without needing a separate test set.
- **Notice how our OOB Score (0.4294) is very close to our Test Set Score (0.4703).**
- This closeness gives us confidence that our model is good at generalizing and isn't just "memorizing" the training da

5. Write a program to explore different Boosting techniques using suitable datasets:
Adaptive Boosting (AdaBoost) – binary classification.

In [28]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
weak_learner = DecisionTreeClassifier(max_depth=1, random_state=42)
weak_learner.fit(X_train, y_train)
y_pred_weak = weak_learner.predict(X_test)

acc_weak = accuracy_score(y_test, y_pred_weak)
print(f"Accuracy of one weak tree: {acc_weak * 100:.2f}%")
adaboost_model = AdaBoostClassifier(estimator=weak_learner,n_estimators=50,random_state=42)

adaboost_model.fit(X_train, y_train)
y_pred_adaboost = adaboost_model.predict(X_test)

acc_adaboost = accuracy_score(y_test, y_pred_adaboost)
print(f"Accuracy of AdaBoost model: {acc_adaboost * 100:.2f}%")

print("\n--- What does this mean? ---")
print(f"""
1. The single, weak tree (stump) was only {acc_weak * 100:.2f}% accurate.
2. The AdaBoost model, which combined 50 of these stumps, achieved {acc_adaboost * 100:.2f}% accuracy.

Significance:
AdaBoost (Adaptive Boosting) creates a powerful, accurate model
by "boosting" the performance of many simple, weak ones.

It trains them one after another, and each new tree focuses on
fixing the mistakes made by the previous trees. This sequential
learning process is what makes boosting models so effective.
""")


Accuracy of one weak tree: 89.47%
Accuracy of AdaBoost model: 97.66%

--- What does this mean? ---

1. The single, weak tree (stump) was only 89.47% accurate.
2. The AdaBoost model, which combined 50 of these stumps, achieved 97.66% accuracy.

Significance:
AdaBoost (Adaptive Boosting) creates a powerful, accurate model
by "boosting" the performance of many simple, weak ones.

It trains them one after another, and each new tree focuses on
fixing the mistakes made by the previous trees. This sequential
learning process is what makes boosting models so effective.

