<a href="https://colab.research.google.com/github/Undasnr/DL-ML/blob/main/Ronny_Ensembe_Learning_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**1. Blending: Scratch Implementation**

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Read the housing data from disk
df = pd.read_csv('train.csv')

# Two numeric predictors, with the sale price as the regression target
X = df[['GrLivArea', 'YearBuilt']]
y = df['SalePrice']

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Data successfully loaded and split into training and validation sets.")
print(f"Training data shape: {X_train.shape}")
print(f"Validation data shape: {X_val.shape}")

# --- Single models -----------------------------------------------------------
# `fit` returns the estimator itself, so construction and fitting are chained.
# All three models see exactly the same training split.
lr_model = LinearRegression().fit(X_train, y_train)
# Linear-kernel SVR with C=100, as chosen for this example
svr_model = SVR(kernel='linear', C=100).fit(X_train, y_train)
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Validation predictions, one array per model (same order as the models above)
lr_preds, svr_preds, dt_preds = (
    fitted.predict(X_val) for fitted in (lr_model, svr_model, dt_model)
)

# Validation MSE of each model on its own, used as the comparison baseline
lr_mse, svr_mse, dt_mse = (
    mean_squared_error(y_val, preds) for preds in (lr_preds, svr_preds, dt_preds)
)

print("\nSingle Model MSEs (for comparison):")
print(f"Linear Regression MSE: {lr_mse:.2f}")
print(f"Support Vector Regressor MSE: {svr_mse:.2f}")
print(f"Decision Tree Regressor MSE: {dt_mse:.2f}")

# --- Blending ----------------------------------------------------------------
# The blend is the unweighted element-wise mean of the three prediction arrays.
base_predictions = (lr_preds, svr_preds, dt_preds)
blended_preds = sum(base_predictions) / len(base_predictions)

# Score the blend on the same validation targets as the single models
blended_mse = mean_squared_error(y_val, blended_preds)

print("\n--- Blending Results ---")
print(f"Blended Model MSE: {blended_mse:.2f}")

Data successfully loaded and split into training and validation sets.
Training data shape: (1168, 2)
Validation data shape: (292, 2)

Single Model MSEs (for comparison):
Linear Regression MSE: 2495554898.67
Support Vector Regressor MSE: 2682656031.23
Decision Tree Regressor MSE: 2184045784.67

--- Blending Results ---
Blended Model MSE: 1991209925.52


**2. Bagging: Scratch Implementation**

In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Loading the dataset
df = pd.read_csv('train.csv')

# Selecting the features and target
X = df[['GrLivArea', 'YearBuilt']]
y = df['SalePrice']

# Splitting the data into 80% training and 20% validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data successfully loaded and split.")
print("-" * 30)

# Baseline: a single Decision Tree Regressor trained on the full training split
dt_model_single = DecisionTreeRegressor(random_state=42)
dt_model_single.fit(X_train, y_train)

# Making predictions and calculating MSE on the validation set
dt_preds_single = dt_model_single.predict(X_val)
dt_mse_single = mean_squared_error(y_val, dt_preds_single)

print("Single Model (Decision Tree) Performance:")
print(f"MSE: {dt_mse_single:.2f}")
print("-" * 30)

# Bagging implementation
n_models = 10
bagged_predictions = []

# Dedicated, seeded generator for the bootstrap draws so that the bagged MSE
# is identical on every run (the global NumPy RNG is unseeded by default).
bootstrap_rng = np.random.default_rng(42)

# Getting the number of samples in the training set
n_samples_train = X_train.shape[0]

# Converting dataframes to numpy arrays for easier positional indexing.
# The validation features are converted as well: the bagged trees are fitted on
# plain arrays, so predicting on a DataFrame would make scikit-learn warn about
# feature names that were not present at fit time.
X_train_np = X_train.to_numpy()
y_train_np = y_train.to_numpy()
X_val_np = X_val.to_numpy()

for i in range(n_models):
    # Creating a bootstrap sample: draw n row positions with replacement
    bootstrap_indices = bootstrap_rng.choice(n_samples_train, size=n_samples_train, replace=True)
    X_bootstrap = X_train_np[bootstrap_indices]
    y_bootstrap = y_train_np[bootstrap_indices]

    # Training a new Decision Tree model on the bootstrap sample
    bagged_model = DecisionTreeRegressor(random_state=i)  # Use a different random_state for diversity
    bagged_model.fit(X_bootstrap, y_bootstrap)

    # Making predictions on the original (un-resampled) validation set
    bagged_preds_i = bagged_model.predict(X_val_np)
    bagged_predictions.append(bagged_preds_i)

# Converting the list of predictions to a (n_models, n_validation_rows) array
bagged_predictions = np.array(bagged_predictions)

# Average the predictions across all models (axis 0 is the model axis)
final_bagged_preds = np.mean(bagged_predictions, axis=0)

# Calculating the MSE for the bagged model
bagged_mse = mean_squared_error(y_val, final_bagged_preds)

print("Bagged Model Performance:")
print(f"Bagged Model MSE (from {n_models} trees): {bagged_mse:.2f}")
print("-" * 30)

Data successfully loaded and split.
------------------------------
Single Model (Decision Tree) Performance:
MSE: 2184045784.67
------------------------------
Bagged Model Performance:
Bagged Model MSE (from 10 trees): 1841521921.24
------------------------------




**3. Stacking: Scratch Implementation**

In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

def train_stacked_ensemble(X_train, y_train, X_val, y_val, n_splits=3, random_state=42):
    """
    Scratch implementation of a two-stage stacking ensemble for regression.

    Stage 0 trains a Linear Regression and a Decision Tree on each K-Fold
    sub-training set and records their out-of-fold predictions as
    meta-features. Stage 1 fits a Linear Regression meta-model on those
    meta-features. For the validation set, every fold's base models predict
    on all of X_val, the predictions are averaged per base-model type, and the
    meta-model turns the averaged meta-features into the final prediction.

    Progress messages are printed to stdout.

    Args:
        X_train (pd.DataFrame): The training features (indexed with ``.iloc``).
        y_train (pd.Series): The training target (indexed with ``.iloc``).
        X_val (pd.DataFrame): The validation features.
        y_val (pd.Series): The validation target, used only to score the result.
        n_splits (int): Number of K-Fold splits used in Stage 0. Defaults to 3.
        random_state (int): Seed for the K-Fold shuffle and for the Decision
            Tree base model. Defaults to 42.

    Returns:
        tuple: A tuple containing the final stacked predictions and the stacked MSE.
    """
    # Stage 0: Training base models and generating blended data (meta-features)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # One meta-feature column per base-model type (Linear Regression, Decision Tree)
    n_base_models = 2

    # Out-of-fold predictions: each training row is filled exactly once, by the
    # models of the fold in which that row was held out
    blended_data = np.zeros((X_train.shape[0], n_base_models))

    # Creating a list to hold the trained base models for later use on validation data
    base_models = []

    print("Starting Stage 0: Training base models on K-Folds...")
    for i, (train_index, val_index) in enumerate(kf.split(X_train)):
        print(f"  Training on Fold {i+1}...")

        # Splitting training data into a sub-training set and a hold-out set.
        # Only the hold-out features are needed; their targets are not used here.
        X_sub_train, X_hold_out = X_train.iloc[train_index], X_train.iloc[val_index]
        y_sub_train = y_train.iloc[train_index]

        # Base Model 1: Linear Regression
        lr_model = LinearRegression()
        lr_model.fit(X_sub_train, y_sub_train)
        lr_preds = lr_model.predict(X_hold_out)

        # Base Model 2: Decision Tree Regressor
        # The same model type and seed are used in each fold to maintain consistency
        dt_model = DecisionTreeRegressor(random_state=random_state)
        dt_model.fit(X_sub_train, y_sub_train)
        dt_preds = dt_model.predict(X_hold_out)

        # Storing predictions in the blended data array (positional row indices)
        blended_data[val_index, 0] = lr_preds
        blended_data[val_index, 1] = dt_preds

        # Saving the trained models for later use on the validation data
        base_models.append((lr_model, dt_model))
        print(f"  Fold {i+1} complete.")

    print("Stage 0 complete. Blended data (meta-features) created.")
    print(f"Blended data shape: {blended_data.shape}")
    print("-" * 30)

    # Stage 1: Training the meta-model on the blended data
    # Using a simple Linear Regression as our meta-model
    print("Starting Stage 1: Training meta-model...")
    meta_model = LinearRegression()
    meta_model.fit(blended_data, y_train)
    print("Stage 1 complete. Meta-model trained.")
    print("-" * 30)

    # Making predictions on the original validation set (X_val)
    # Generating "blend test" data for the validation set
    blend_test_data = np.zeros((X_val.shape[0], n_base_models))

    print("Generating predictions on validation set...")
    # For each base model from each fold, make a prediction on the entire X_val set
    for lr_model, dt_model in base_models:
        blend_test_data[:, 0] += lr_model.predict(X_val)
        blend_test_data[:, 1] += dt_model.predict(X_val)

    # Average over the folds actually trained to get the final blend test data
    blend_test_data /= len(base_models)

    # Using the meta-model to make the final prediction
    final_stacked_preds = meta_model.predict(blend_test_data)

    # Calculating the MSE for the stacked model
    stacked_mse = mean_squared_error(y_val, final_stacked_preds)

    return final_stacked_preds, stacked_mse

# Main Script
try:
    # Loading the dataset
    df = pd.read_csv('train.csv')
    print("Dataset loaded successfully.")
    print(f"Initial number of rows: {len(df)}")

    # Explicitly ensure columns are numeric; values that cannot be parsed become NaN
    required_columns = ['GrLivArea', 'YearBuilt', 'SalePrice']
    for column in required_columns:
        df[column] = pd.to_numeric(df[column], errors='coerce')

    # Dropping any rows with missing values in the columns used below.
    # Assignment (rather than inplace=True) keeps the step explicit.
    df = df.dropna(subset=required_columns)

    print(f"Number of rows after cleaning: {len(df)}")

    if len(df) == 0:
        print("\nError: All rows were removed during the cleaning process.")
        print("This indicates that the specified columns have missing or invalid data in every row.")
        # Raise an error to stop execution
        raise ValueError("Empty DataFrame after cleaning.")

except Exception as e:
    print(f"An error occurred: {e}")
    # Re-raise instead of calling exit(): in a notebook exit() shuts down the
    # kernel and discards all state, and in a script it would end the process
    # with a success status. Re-raising stops the cell and keeps the traceback.
    raise

# Selecting the features and target
X = df[['GrLivArea', 'YearBuilt']]
y = df['SalePrice']

# Splitting the data into 80% training and 20% validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nData successfully split for stacking.")
print("-" * 30)

# Getting baseline MSE for a single model (Decision Tree for comparison)
dt_model_single = DecisionTreeRegressor(random_state=42)
dt_model_single.fit(X_train, y_train)
dt_mse_single = mean_squared_error(y_val, dt_model_single.predict(X_val))

# Training and evaluating the stacked ensemble
final_stacked_preds, stacked_mse = train_stacked_ensemble(X_train, y_train, X_val, y_val)

print("Stacking Performance:")
print(f"Single Model (Decision Tree) MSE: {dt_mse_single:.2f}")
print(f"Stacked Model MSE: {stacked_mse:.2f}")

Dataset loaded successfully.
Initial number of rows: 1460
Number of rows after cleaning: 1460

Data successfully split for stacking.
------------------------------
Starting Stage 0: Training base models on K-Folds...
  Training on Fold 1...
  Fold 1 complete.
  Training on Fold 2...
  Fold 2 complete.
  Training on Fold 3...
  Fold 3 complete.
Stage 0 complete. Blended data (meta-features) created.
Blended data shape: (1168, 2)
------------------------------
Starting Stage 1: Training meta-model...
Stage 1 complete. Meta-model trained.
------------------------------
Generating predictions on validation set...
Stacking Performance:
Single Model (Decision Tree) MSE: 2184045784.67
Stacked Model MSE: 1976573063.77
