<a href="https://colab.research.google.com/github/WalterPHD/Ai-Data/blob/main/Ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

#Loading the CSV
df = pd.read_csv('train.csv')

#Selecting columns
features = ['GrLivArea', 'YearBuilt']
target = 'SalePrice'

#Dropped rows with missing values in the selected columns
df = df[features + [target]].dropna()

#Separating features and target
X = df[features].values
y = df[target].values

#80/20 split
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

#Veryfying shape to confirm
print(f"Training set: {X_train.shape}, Validation set: {X_valid.shape}")


Training set: (1168, 2), Validation set: (292, 2)


# Problem 1 - Blending scratch mounting

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

#Loading and clean data
df = pd.read_csv('train.csv')
features = ['GrLivArea', 'YearBuilt']
target = 'SalePrice'
df = df[features + [target]].dropna()

#Feature + target split
X = df[features].values
y = df[target].values

#Log-transform target for model 1
y_log = np.log1p(y)

# 80/20 split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
_, _, y_log_train, y_log_valid = train_test_split(X, y_log, test_size=0.2, random_state=42)

#Linear Regression (with log transform + standardization)
scaler1 = StandardScaler()
X_train_std1 = scaler1.fit_transform(X_train)
X_valid_std1 = scaler1.transform(X_valid)

model1 = LinearRegression()
model1.fit(X_train_std1, y_log_train)
pred1 = np.expm1(model1.predict(X_valid_std1))

#Model 2: Support Vector Regressor (RBF kernel)
scaler2 = StandardScaler()
X_train_std2 = scaler2.fit_transform(X_train)
X_valid_std2 = scaler2.transform(X_valid)

model2 = SVR(kernel='rbf', C=100, epsilon=0.1)
model2.fit(X_train_std2, y_train)
pred2 = model2.predict(X_valid_std2)

#Model 3: Decision Tree (no scaling + raw data)
model3 = DecisionTreeRegressor(max_depth=5, random_state=42)
model3.fit(X_train, y_train)
pred3 = model3.predict(X_valid)

#Meta-Model: Ridge Regression on stacked predictions
meta_X_train = np.column_stack([pred1, pred2, pred3])
meta_model = Ridge(alpha=1.0)
meta_model.fit(meta_X_train, y_valid)

#Final Predictions
final_pred = meta_model.predict(meta_X_train)
mse_blending = mean_squared_error(y_valid, final_pred)

#Base Model for Comparison
base_model = LinearRegression()
base_model.fit(X_train_std1, y_train)
mse_base = mean_squared_error(y_valid, base_model.predict(X_valid_std1))

#Results
print(f"Base Linear Regression MSE: {mse_base:.2f}")
print(f"Blending Ensemble (Diverse Models) MSE: {mse_blending:.2f}")


Base Linear Regression MSE: 2495554898.67
Blending Ensemble (Diverse Models) MSE: 1616660017.89


# Problem 2 - Bagging scratch mounting

Scratch bagging and Linear model

In [3]:
class BaggingRegressor:
    def __init__(self, base_model_class, n_estimators=10, sample_size=None):
        self.base_model_class = base_model_class
        self.n_estimators = n_estimators
        self.sample_size = sample_size  # Number of samples in each bag
        self.models = []

    def fit(self, X, y):
        n_samples = X.shape[0]
        self.sample_size = self.sample_size or n_samples
        self.models = []

        for _ in range(self.n_estimators):
            indices = np.random.choice(n_samples, self.sample_size, replace=True)
            X_sample = X[indices]
            y_sample = y[indices]

            model = self.base_model_class()
            model.fit(X_sample, y_sample)
            self.models.append(model)

    def predict(self, X):
        predictions = np.array([model.predict(X) for model in self.models])
        return predictions.mean(axis=0)


Scratch LinearRegression for testing

In [4]:
import numpy as np

class ScratchLinearRegression:
    def __init__(self):
        self.coef_ = None
        self.intercept_ = None

    def fit(self, X, y):
        X_b = np.c_[np.ones((X.shape[0], 1)), X]
        theta = np.linalg.inv(X_b.T @ X_b) @ X_b.T @ y
        self.intercept_ = theta[0]
        self.coef_ = theta[1:]

    def predict(self, X):
        return X @ self.coef_ + self.intercept_


Comparison

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

#Loading dataset
df = pd.read_csv('train.csv')
features = ['GrLivArea', 'YearBuilt']
target = 'SalePrice'
df = df[features + [target]].dropna()

X = df[features].values
y = df[target].values

#80/20
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

#Single base model (for comparison)
single_model = ScratchLinearRegression()
single_model.fit(X_train, y_train)
y_pred_single = single_model.predict(X_valid)
mse_single = mean_squared_error(y_valid, y_pred_single)

#Bagging with 10 base models
bagging_model = BaggingRegressor(ScratchLinearRegression, n_estimators=10)
bagging_model.fit(X_train, y_train)
y_pred_bagging = bagging_model.predict(X_valid)
mse_bagging = mean_squared_error(y_valid, y_pred_bagging)

print(f"Single Model MSE: {mse_single:.2f}")
print(f"Bagging Model MSE: {mse_bagging:.2f}")


Single Model MSE: 2495554898.67
Bagging Model MSE: 2473851467.00


#Problem 3 - Stacking scratch mounting

Stacking scrattch code & LinerRegression code

In [6]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import pandas as pd
from sklearn.model_selection import train_test_split


class ScratchLinearRegression:
    def __init__(self):
        self.coef_ = None
        self.intercept_ = None

    def fit(self, X, y):
        X_b = np.c_[np.ones((X.shape[0], 1)), X]
        theta = np.linalg.inv(X_b.T @ X_b) @ X_b.T @ y
        self.intercept_ = theta[0]
        self.coef_ = theta[1:]

    def predict(self, X):
        return X @ self.coef_ + self.intercept_


class StackingRegressor:
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y):
        self.base_models_fitted = [[] for _ in self.base_models]
        kf = KFold(n_splits=self.n_folds, shuffle=True, random_state=42)

        # Out-of-fold predictions for training meta-model
        oof_predictions = np.zeros((X.shape[0], len(self.base_models)))

        for i, base_model_class in enumerate(self.base_models):
            for train_idx, val_idx in kf.split(X, y):
                X_train_k, X_val_k = X[train_idx], X[val_idx]
                y_train_k = y[train_idx]

                model = base_model_class()
                model.fit(X_train_k, y_train_k)
                y_pred = model.predict(X_val_k)

                oof_predictions[val_idx, i] = y_pred
                self.base_models_fitted[i].append(model)

        # Train meta-model on out-of-fold predictions
        self.meta_model.fit(oof_predictions, y)

    def predict(self, X):
        meta_features = np.zeros((X.shape[0], len(self.base_models)))

        for i, models in enumerate(self.base_models_fitted):
            preds = [model.predict(X) for model in models]
            meta_features[:, i] = np.mean(preds, axis=0)

        return self.meta_model.predict(meta_features)


# Compariison of Blending, Bagging, Stacking

In [None]:


#Single Model
base = ScratchLinearRegression()
base.fit(X_train, y_train)
pred_base = base.predict(X_valid)
mse_base = mean_squared_error(y_valid, pred_base)

#Bagging
bag = BaggingRegressor(ScratchLinearRegression, n_estimators=10)
bag.fit(X_train, y_train)
pred_bag = bag.predict(X_valid)
mse_bag = mean_squared_error(y_valid, pred_bag)

#Stacking
stack = StackingRegressor(
    base_models=[ScratchLinearRegression, ScratchLinearRegression],
    meta_model=Ridge(),
    n_folds=5
)
stack.fit(X_train, y_train)
pred_stack = stack.predict(X_valid)
mse_stack = mean_squared_error(y_valid, pred_stack)

#Results
print(f"Single Linear Regression MSE: {mse_base:.2f}")
print(f"Bagging Model MSE:             {mse_bag:.2f}")
print(f"Stacking Model MSE:            {mse_stack:.2f}")
