<a href="https://colab.research.google.com/github/Odima-dev/Data-Science-and-Machine-Learning/blob/main/EnsembleLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Loading and Preparing Regression Dataset to be Used

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Loading train.csv
from google.colab import files
uploaded = files.upload()

df = pd.read_csv('train.csv')
X = df[['GrLivArea', 'YearBuilt']].values
y = df['SalePrice'].values

# Dividing train.csv into 80% of training data and 20% of verification data
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=0)

print(f"Training size: {X_train.shape}, Validation size: {X_val.shape}")

Saving train.csv to train.csv
Training size: (1168, 2), Validation size: (292, 2)


In [2]:
# Problem 1: Blending scratch mounting

# Fitting different models
model1 = LinearRegression()
model2 = SVR(kernel='rbf', C=1e3, gamma=0.1)
model3 = DecisionTreeRegressor(max_depth=5, random_state=0)

model1.fit(X_train, y_train)
model2.fit(X_train, y_train)
model3.fit(X_train, y_train)

# Predicting on validation set
pred1 = model1.predict(X_val)
pred2 = model2.predict(X_val)
pred3 = model3.predict(X_val)

# Blending: simple average
blended_pred = (pred1 + pred2 + pred3) / 3

# Evaluating
mse1 = mean_squared_error(y_val, pred1)
mse2 = mean_squared_error(y_val, pred2)
mse3 = mean_squared_error(y_val, pred3)
mse_blended = mean_squared_error(y_val, blended_pred)

print("\n[Blending Results]")
print(f"Linear Regression MSE: {mse1:.2f}")
print(f"SVR MSE: {mse2:.2f}")
print(f"Decision Tree MSE: {mse3:.2f}")
print(f"Blended Model MSE: {mse_blended:.2f}")



[Blending Results]
Linear Regression MSE: 2942066921.67
SVR MSE: 7206623146.12
Decision Tree MSE: 2169961248.67
Blended Model MSE: 2860614890.72


In [3]:
# Problem 2: Scratch mounting of bagging

np.random.seed(0)
n_models = 5
bagged_preds = []

for i in range(n_models):
    # Bootstrap sample
    indices = np.random.choice(len(X_train), size=len(X_train), replace=True)
    X_sample = X_train[indices]
    y_sample = y_train[indices]

    # Using simple regressor for bagging (Decision Tree)
    model = DecisionTreeRegressor(max_depth=5, random_state=i)
    model.fit(X_sample, y_sample)
    pred = model.predict(X_val)
    bagged_preds.append(pred)

# Average predictions
bagged_pred = np.mean(bagged_preds, axis=0)

# Evaluating
mse_single_tree = mean_squared_error(y_val, model3.predict(X_val))
mse_bagged = mean_squared_error(y_val, bagged_pred)

print("\n[Bagging Results]")
print(f"Single Decision Tree MSE: {mse3:.2f}")
print(f"Bagged Trees MSE: {mse_bagged:.2f}")



[Bagging Results]
Single Decision Tree MSE: 2169961248.67
Bagged Trees MSE: 1864051360.97


In [4]:
# Problem 3: Stacking scratch mounting

# Stage 0: Trainning base models
base1 = LinearRegression()
base2 = DecisionTreeRegressor(max_depth=5, random_state=1)

base1.fit(X_train, y_train)
base2.fit(X_train, y_train)

# Blended data: base model predictions (level 1 features)
base1_train_pred = base1.predict(X_train)
base2_train_pred = base2.predict(X_train)

stacked_X_train = np.vstack((base1_train_pred, base2_train_pred)).T

# Stage 1: Trainning meta model (simple Linear Regression)
meta = LinearRegression()
meta.fit(stacked_X_train, y_train)

# Applying to validation:
base1_val_pred = base1.predict(X_val)
base2_val_pred = base2.predict(X_val)

stacked_X_val = np.vstack((base1_val_pred, base2_val_pred)).T
stacked_pred = meta.predict(stacked_X_val)

# Evaluating
mse_stacked = mean_squared_error(y_val, stacked_pred)

print("\n[Stacking Results]")
print(f"Base1 Linear Regression MSE: {mean_squared_error(y_val, pred1):.2f}")
print(f"Base2 Decision Tree MSE: {mean_squared_error(y_val, pred3):.2f}")
print(f"Stacked Model MSE: {mse_stacked:.2f}")


[Stacking Results]
Base1 Linear Regression MSE: 2942066921.67
Base2 Decision Tree MSE: 2169961248.67
Stacked Model MSE: 2192399562.22
