## Loading in dataset for checking templates

In [1]:
import seaborn as sns
import pandas as pd
import numpy as np

# Loading the dataset
# seaborn's example 'titanic' dataset; every later cell in this notebook works from this df
df = sns.load_dataset('titanic')



## Preprocessing for model - refer to preprocessing template

In [2]:
from sklearn.impute import SimpleImputer

def impute_missing_values(df, strategy='mean', fill_value=None, columns=None):
  '''
  Fills in missing values for the selected columns using sklearn's SimpleImputer.

  Parameters:
    df: Pandas df containing the columns to impute
    strategy: Type-str, Default-mean. Passed straight through to SimpleImputer
    fill_value: Default-None. Passed straight through to SimpleImputer
    columns: Type-list, Default-None. The columns to impute. None means every column in df

  Returns:
    A new df with the selected columns imputed. The df that was passed in is left untouched.
  '''

  if columns is None:
    columns = df.columns # The imputer is applied to all columns in the dataset

  imputed_df = df.copy() # Working on a copy so the caller's df is never modified as a side effect

  imputer = SimpleImputer(strategy=strategy, fill_value=fill_value) # Initializing the imputer
  imputed_df[columns] = imputer.fit_transform(imputed_df[columns]) # Applying the imputer to selected columns

  return imputed_df


from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder


def encode_categorical(df, columns, method='onehot'):
    '''
    Encodes categorical columns using either one-hot or label encoding.

    Parameters:
      df: Pandas df containing the columns to encode
      columns: Type-list, The columns to encode
      method: Type-str, Default-onehot, Options - onehot, label.
              onehot replaces each column with its dummy columns (first category dropped).
              label replaces each column's values with integer codes.

    Returns:
      A new df with the encoded columns. The df that was passed in is left untouched.

    Raises:
      ValueError: If method is not one of the supported options
    '''

    # Failing fast so a typo in method cannot silently return the data un-encoded
    if method not in ('onehot', 'label'):
        raise ValueError(f"Unknown method '{method}'. Expected 'onehot' or 'label'.")

    df = df.copy() # Copy first so neither branch writes into the caller's df

    if method == 'onehot':
        for col in columns:
            # Dense output instead of the default sparse matrix. Dropping the first category avoids
            # perfectly collinear dummy columns
            encoder = OneHotEncoder(sparse_output=False, drop='first')
            encoded = encoder.fit_transform(df[[col]]) # The encoded column
            encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out([col]), index=df.index)
            # The original column is removed and its encoded columns are appended at the end
            df = pd.concat([df.drop(columns=col), encoded_df], axis=1)

    else:
        for col in columns:
            encoder = LabelEncoder()
            df[col] = encoder.fit_transform(df[col]) # Replacing the original column with the label encoded values

    return df


def remove_low_correlation_features(df, target_column, threshold=0.1):
    '''
    Drops the numeric features whose absolute correlation with the target is below a threshold.

    Parameters:
      df: Pandas df holding the features and the target. Non-numeric columns are discarded first
      target_column: Type-str, The name of the (numeric) target column
      threshold: Type-float, Default-0.1. Features with an absolute correlation under this value are dropped

    Returns:
      The numeric df without the weakly correlated features
      A pandas Series of every numeric column's absolute correlation with the target
    '''

    numeric_df = df.select_dtypes(include=['number']) # Only numeric columns can be correlated

    corr_matrix = numeric_df.corr()
    abs_corr = corr_matrix[target_column].abs() # Strength of the relationship, ignoring its direction

    # NaN correlations never compare as smaller than the threshold, so those columns are kept
    weak_features = [name for name, value in abs_corr.items() if value < threshold]

    print(f'Features to drop (correlation with {target_column} < {threshold}): {weak_features}')

    reduced_df = numeric_df.drop(columns=weak_features)
    return reduced_df, abs_corr


from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

def scale_data(df, scaler_type="standard", columns=None):
  '''
  Scales the selected columns of a df with one of sklearn's scalers.

  Parameters:
    df: Pandas df containing the columns to scale
    scaler_type: Type-str, Default-standard, Options - standard, minmax, robust. Which sklearn scaler to use
    columns: Type-list, Default-None. The columns to scale. None means every numeric column in df

  Returns:
    scaled_df: A new df holding only the scaled columns, on the same index as df

  Raises:
    ValueError: If scaler_type is not one of the supported options

  Note:
    The scaler is fitted on whatever df is passed in. Fitting it on data that is later split into
    train and test lets the test rows influence the scaling.
  '''

  supported_scalers = ('standard', 'minmax', 'robust')
  if scaler_type not in supported_scalers:
    raise ValueError(f"Unknown scaler_type '{scaler_type}'. Expected one of {supported_scalers}.")

  # Classes rather than instances so only the requested scaler gets built
  scalers = {
      'standard': StandardScaler,
      'minmax': MinMaxScaler,
      'robust': RobustScaler
  }
  scaler = scalers[scaler_type]()

  if columns is None:
    columns = df.select_dtypes(include=['number']).columns # Default to numeric columns if no specific columns are provided

  # Keeping the original index so the scaled features still line up row-for-row with a target
  # taken from the same df
  scaled_df = pd.DataFrame(scaler.fit_transform(df[columns]), columns=columns, index=df.index)

  return scaled_df


In [3]:
# Columns removed before any modelling, and the categorical columns that get one-hot encoded
unused_columns = ['embarked', 'embark_town', 'deck', 'alive']
onehot_columns = ['sex', 'adult_male', 'who', 'alone']

df = df.drop(columns=unused_columns)
df = impute_missing_values(df, strategy="mean", columns=['age'])
df = encode_categorical(df, columns=onehot_columns, method='onehot')
df = encode_categorical(df, columns=['class'], method='label')

# Only numeric columns that correlate with the target survive this step
df, corr_with_target = remove_low_correlation_features(df, target_column='survived', threshold=0.1)

# Target first, then the scaled feature matrix built from everything else
y = df['survived']
X = scale_data(df.drop(columns=['survived']))
print('\nScaled X Values\n', X.head())

Features to drop (correlation with survived < 0.1): ['age', 'sibsp', 'parch']

Scaled X Values
      pclass      fare     class  sex_male  adult_male_True   who_man  \
0  0.827377 -0.502445  0.827377  0.737695         0.811922  0.811922   
1 -1.566107  0.786845 -1.566107 -1.355574        -1.231645 -1.231645   
2  0.827377 -0.488854  0.827377 -1.355574        -1.231645 -1.231645   
3 -1.566107  0.420730 -1.566107 -1.355574        -1.231645 -1.231645   
4  0.827377 -0.486337  0.827377  0.737695         0.811922  0.811922   

   who_woman  alone_True  
0  -0.661133   -1.231645  
1   1.512555   -1.231645  
2   1.512555    0.811922  
3   1.512555   -1.231645  
4  -0.661133    0.811922  


## Single Model + HPT

In [4]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, mean_absolute_error, r2_score

def single_model_hpt(X, y, model, param_grid, search_type='random', model_type='class', cv=3):
  '''
  Performs hyperparameter tuning for a single classification or regression model using either grid or random search.

  The data is split 80/20 (random_state=42). The search runs on the training part only and the
  best model is then reported against the held-out part.

  Parameters:
    X: Pandas df or numpy array of the features
    y: Pandas df or numpy array of the target
    model: The initialized model
    param_grid: A dictionary of hyperparameters to tune for the model
    search_type: Type-str, Default-random, Options - grid, random. The type of search to be carried out on the hyperparams
    model_type: Type-str, Default-class, Options - class, reg. Whether the model type is a classification or regression model
    cv: Type-int, Default-3. The number of cross validation folds. Can change after doing k fold cv, might incorporate in later.

  Returns:
    best_model: The tuned model with the best hyperparams
    best_params: A dictionary of the chosen best hyperparams
    best_score: A float of the best cross-validation score. Accuracy for class, negated MAE for reg (closer to 0 is better)

  Raises:
    ValueError: If search_type or model_type is not one of the supported options
  '''

  # Failing fast on bad options instead of hitting an unbound `search` variable further down
  if search_type not in ('random', 'grid'):
    raise ValueError(f"Unknown search_type '{search_type}'. Expected 'random' or 'grid'.")
  if model_type not in ('class', 'reg'):
    raise ValueError(f"Unknown model_type '{model_type}'. Expected 'class' or 'reg'.")

  # Splitting into the train_test_split
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  # One scoring choice shared by both search types
  scoring = 'accuracy' if model_type == 'class' else 'neg_mean_absolute_error'

  # Selecting the search method
  if search_type == 'random':
    # random_state pins which candidates are sampled so reruns give the same result
    search = RandomizedSearchCV(model, param_grid, cv=cv, scoring=scoring, verbose=1, n_jobs=-1, random_state=42)
  else:
    search = GridSearchCV(model, param_grid, cv=cv, scoring=scoring, verbose=1, n_jobs=-1)

  # Fitting the search to the features
  search.fit(X_train, y_train)

  # Results of the search
  best_model = search.best_estimator_
  best_params = search.best_params_
  best_score = search.best_score_

  print("Best Parameters:", best_params)
  print("Best Cross-Validation Score:", best_score)

  # Test set evaluation
  y_pred = best_model.predict(X_test)

  # Evaluation Metrics
  if model_type == 'class':
    print("\nTest Set Classification Report:")
    print(classification_report(y_test, y_pred))
  else:
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print("\nTest Set Regression Metrics:")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"R2 Score: {r2}")

  return best_model, best_params, best_score



### Testing the single model hpt

In [5]:
from sklearn.tree import DecisionTreeClassifier

# Search space for the decision tree
param_grid = dict(
    max_depth=[2, 4, 6, 8, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Seeded so the tree itself is reproducible
model = DecisionTreeClassifier(random_state=42)

# Randomized search over the grid; keeps the best estimator, its params and its CV score
best_model, best_params, best_score = single_model_hpt(X, y, model, param_grid, search_type="random", model_type="class")


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 10}
Best Cross-Validation Score: 0.8160302095521753

Test Set Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       105
           1       0.80      0.76      0.78        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179



In [6]:
from sklearn.metrics import accuracy_score

# Same test_size and random_state as the split inside single_model_hpt, so this is the same hold-out set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Score the tuned model on the held-out rows
y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy)


Test Accuracy: 0.8212290502793296


## Multi Model Comparison + HPT

In [7]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, mean_absolute_error, r2_score

def compare_models_hpt(X, y, models, search_type='random', model_type='class', cv=3):
  '''
  Performs hyperparameter tuning on multiple models and selects the best one with tuned hp's.

  The data is split 80/20 (random_state=42). Every model is tuned on the training part only and
  the overall winner (highest cross-validation score) is then reported against the held-out part.

  Parameters:
    X: Pandas df or numpy array of the features
    y: Pandas df or numpy array of the target
    models: Type-dict, A dict where the keys are the model names and the values are (model, param_grid) tuples
    search_type: Type-str, Default-random, Options - grid, random. The type of search to be carried out on the hyperparams
    model_type: Type-str, Default-class, Options - class, reg. Whether the model type is a classification or regression model
    cv: Type-int, Default-3. The number of cross validation folds. Can change after doing k fold cv, might incorporate in later.

  Returns:
    best_model: The tuned model with the best cross-validation score
    best_model_name: The key in models that best_model came from
    best_params: A dictionary of the chosen best hyperparams
    best_score: A float of the best cross-validation score. Accuracy for class, negated MAE for reg (closer to 0 is better)

  Raises:
    ValueError: If search_type or model_type is not one of the supported options, or if models is empty
  '''

  # Failing fast on bad options instead of hitting an unbound `search` variable further down
  if search_type not in ('random', 'grid'):
    raise ValueError(f"Unknown search_type '{search_type}'. Expected 'random' or 'grid'.")
  if model_type not in ('class', 'reg'):
    raise ValueError(f"Unknown model_type '{model_type}'. Expected 'class' or 'reg'.")
  if not models:
    raise ValueError("models must contain at least one (model, param_grid) entry.")

  # Splitting into the train_test_split
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  # One scoring choice shared by every search
  scoring = 'accuracy' if model_type == 'class' else 'neg_mean_absolute_error'

  results = [] # Empty list to the store the results of each model

  for model_name, (candidate, param_grid) in models.items():
    print(f"Running hyperparameter tuning for {model_name}...")

    # Selecting search method
    if search_type == 'random':
      # random_state pins which candidates are sampled so reruns give the same result
      search = RandomizedSearchCV(candidate, param_grid, cv=cv, scoring=scoring, verbose=1, n_jobs=-1, random_state=42)
    else:
      search = GridSearchCV(candidate, param_grid, cv=cv, scoring=scoring, verbose=1, n_jobs=-1)

    # Fitting the search
    search.fit(X_train, y_train)

    print(f"Best Parameters for {model_name}: {search.best_params_}")
    print(f"Best Cross-Validation Score for {model_name}: {search.best_score_}")

    # Appending the results to the list
    results.append((model_name, search.best_estimator_, search.best_params_, search.best_score_))

  # Finding the best model based on the scores (index 3 of each result tuple)
  best_model_name, best_model, best_params, best_score = max(results, key=lambda result: result[3])

  print(f"\nBest Model: {best_model_name}")
  print(f"Best Parameters: {best_params}")
  print(f"Best Cross-Validation Score: {best_score}")

  # Test set evaluation for the best model
  y_pred = best_model.predict(X_test)

  if model_type == 'class':
    print("\nTest Set Classification Report:")
    print(classification_report(y_test, y_pred))
  else:
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print("\nTest Set Regression Metrics:")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"R2 Score: {r2}")

  return best_model, best_model_name, best_params, best_score



### Testing the multi model on classification and regression models

In [9]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Search spaces, one per candidate model
tree_grid = {'max_depth': [2, 4, 6, 8, 10], 'min_samples_split': [2, 5, 10]}
forest_grid = {'n_estimators': [10, 50, 100], 'max_depth': [None, 10, 20]}

# name -> (seeded model, search space)
models = {
    'DecisionTree': (DecisionTreeClassifier(random_state=42), tree_grid),
    'RandomForest': (RandomForestClassifier(random_state=42), forest_grid),
}

# Exhaustive grid search over every candidate; the one with the best CV score wins
best_model, best_model_name, best_params, best_score = compare_models_hpt(X, y, models, search_type="grid", model_type="class", cv=3)


# NOTE: the triple-quoted block below is a bare string expression, not a comment.
# None of the code inside it runs; as the last expression in the cell it is echoed back as the cell's output.
'''
This example is a classification one so regression test not needed but here as an example.

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Define models and parameter grids
models = {
    'LinearRegression': (
        LinearRegression(),
        {}
    ),
    'RandomForestRegressor': (
        RandomForestRegressor(random_state=42),
        {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}
    )
}

# Comparing the regression models and select the best one
best_model, best_model_name, best_params, best_score = compare_models_hpt(X, y, models, search_type="random", model_type="reg", cv=3)
'''



Running hyperparameter tuning for DecisionTree...
Fitting 3 folds for each of 15 candidates, totalling 45 fits
Best Parameters for DecisionTree: {'max_depth': 4, 'min_samples_split': 2}
Best Cross-Validation Score for DecisionTree: 0.8174484983866964
Running hyperparameter tuning for RandomForest...
Fitting 3 folds for each of 9 candidates, totalling 27 fits
Best Parameters for RandomForest: {'max_depth': 10, 'n_estimators': 50}
Best Cross-Validation Score for RandomForest: 0.7921261331536834

Best Model: DecisionTree
Best Parameters: {'max_depth': 4, 'min_samples_split': 2}
Best Cross-Validation Score: 0.8174484983866964

Test Set Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.88      0.85       105
           1       0.81      0.74      0.77        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179



'\nThis example is a classification one so regression test not needed but here as an example.\n\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.ensemble import RandomForestRegressor\n\n# Define models and parameter grids\nmodels = {\n    \'LinearRegression\': (\n        LinearRegression(),\n        {}\n    ),\n    \'RandomForestRegressor\': (\n        RandomForestRegressor(random_state=42),\n        {\'n_estimators\': [50, 100, 200], \'max_depth\': [None, 10, 20]}\n    )\n}\n\n# Comparing the regression models and select the best one\nbest_model, best_model_name, best_params, best_score = compare_models_hpt(X, y, models, search_type="random", model_type="reg", cv=3)\n'

In [10]:
from sklearn.metrics import accuracy_score

# Same test_size and random_state as the split inside compare_models_hpt, so this is the same hold-out set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Accuracy of the winning model on the held-out rows
y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.8212290502793296


## Stacked Model - No HPT

In [11]:
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, mean_absolute_error, r2_score

def stacked_model(X, y, base_models, meta_model, model_type='class', n_folds=3):
  '''
  Stacks multiple base models with a meta-model for the final prediction.

  Each base model produces out-of-fold predictions through K-Fold cross validation. Those
  predictions (the Level 1 data) are the features the meta-model is trained on. Every base model
  is then refitted on all of X so it can be used for predictions after this function returns.

  Parameters:
    X: Pandas df or numpy array of the features
    y: Pandas df or numpy array of the target
    base_models: Type-list, The list of the base models to be stacked and trained. They are fitted in place
    meta_model: The model used for the final prediction. Takes the predicted data from the base_models and estimates on that
    model_type: Type-str, Default-class, Options - class, reg. Whether the model type is a classification or regression model
    n_folds: Type-int, Default-3, The number of cross validation folds - Can tune later

  Returns:
    meta_model: The trained meta model
    base_model_preds: Out-of-fold predictions from the base models (Level 1 data), one column per base model
    final_score: The meta-model's score on the same Level 1 data it was trained on (not a held-out test score).
                 A classification report string for class, a dict with MAE and R2 for reg

  Raises:
    ValueError: If model_type is not one of the supported options
  '''

  # Failing fast instead of training everything and then hitting an unbound `final_score`
  if model_type not in ('class', 'reg'):
    raise ValueError(f"Unknown model_type '{model_type}'. Expected 'class' or 'reg'.")

  # Ensuring X and y are NumPy arrays
  X = np.array(X)
  y = np.array(y)

  # k-fold cross validation. The fixed random_state gives every base model the same folds
  kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

  # An array of zeros made to store the predictions from each of the base models
  base_model_preds = np.zeros((X.shape[0], len(base_models)))

  # Training each base model using the K-Fold CV
  for i, model in enumerate(base_models):
    print(f"Training base model {i+1}: {model.__class__.__name__}")

    for train_idx, val_idx in kf.split(X):
      model.fit(X[train_idx], y[train_idx])
      # Each row is predicted by a model that did not see it during fitting
      base_model_preds[val_idx, i] = model.predict(X[val_idx])

    # Refit on every row: otherwise the model handed back to the caller only knows the last fold's training part
    model.fit(X, y)

  # Training the meta-model on the predictions of base models (Level 1 data)
  meta_model.fit(base_model_preds, y)
  meta_preds = meta_model.predict(base_model_preds)

  if model_type == 'class':
    final_score = classification_report(y, meta_preds)
    print(f"\nFinal Meta-Model Accuracy:\n {final_score}")
  else:
    mae = mean_absolute_error(y, meta_preds)
    r2 = r2_score(y, meta_preds)
    final_score = {"MAE": mae, "R2": r2}
    print(f"\nFinal Meta-Model Regression Metrics:\n {final_score}")

  return meta_model, base_model_preds, final_score


### Testing the stacked model

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Level 0: the learners whose predictions feed the meta-model
tree_learner = DecisionTreeClassifier(max_depth=5, random_state=42)
forest_learner = RandomForestClassifier(n_estimators=50, random_state=42)
base_models = [tree_learner, forest_learner]

# Level 1: learns how to combine the base predictions
meta_model = LogisticRegression()

# Fit the whole stack on X, y
meta_model, base_model_preds, final_score = stacked_model(X, y, base_models, meta_model, model_type='class', n_folds=3)


# NOTE: the triple-quoted block below is a bare string expression, not a comment.
# None of the code inside it runs; as the last expression in the cell it is echoed back as the cell's output.
# 'target_column' inside it is a placeholder name, not a column of the titanic df.
'''
This example is a classification one so regression test not needed but here as an example.

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Feature matrix and target
X = df.drop(columns=['target_column']).values
y = df['target_column'].values

# Base models
base_models = [
    LinearRegression(),
    RandomForestRegressor(n_estimators=100, random_state=42)
]

# Meta-model
meta_model = RandomForestRegressor(n_estimators=50, random_state=42)

# Perform stacking
meta_model, base_model_preds, final_score = stacked_model(X, y, base_models, meta_model, model_type='reg', n_folds=3)
'''


Training base model 1: DecisionTreeClassifier
Training base model 2: RandomForestClassifier

Final Meta-Model Accuracy:
               precision    recall  f1-score   support

           0       0.83      0.91      0.87       549
           1       0.83      0.69      0.75       342

    accuracy                           0.83       891
   macro avg       0.83      0.80      0.81       891
weighted avg       0.83      0.83      0.82       891



"\nThis example is a classification one so regression test not needed but here as an example.\n\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.ensemble import RandomForestRegressor\n\n# Feature matrix and target\nX = df.drop(columns=['target_column']).values\ny = df['target_column'].values\n\n# Base models\nbase_models = [\n    LinearRegression(),\n    RandomForestRegressor(n_estimators=100, random_state=42)\n]\n\n# Meta-model\nmeta_model = RandomForestRegressor(n_estimators=50, random_state=42)\n\n# Perform stacking\nmeta_model, base_model_preds, final_score = stacked_model(X, y, base_models, meta_model, model_type='reg', n_folds=3)\n"

In [None]:
from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Splitting into the train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The stack fitted in the previous cell was trained on every row of X, test rows included, so
# scoring it on X_test would be scoring on training data. Unfitted copies of the same models are
# trained on the training split only, which keeps the accuracy below an honest hold-out estimate.
holdout_base_models = [clone(model) for model in base_models]
holdout_meta_model, holdout_level1_preds, holdout_train_report = stacked_model(
    X_train, y_train, holdout_base_models, clone(meta_model), model_type='class', n_folds=3
)

# Generating predictions from base models for the test set
X_test_values = np.asarray(X_test) # stacked_model fits on plain arrays, so predict on plain arrays too
base_model_preds_test = np.zeros((X_test_values.shape[0], len(holdout_base_models)))

for i, model in enumerate(holdout_base_models):
    base_model_preds_test[:, i] = model.predict(X_test_values)

# Make final predictions using the meta-model
y_test_pred = holdout_meta_model.predict(base_model_preds_test)

# Evaluate performance
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy)


## Voting Model - No HPT

In [14]:
from sklearn.ensemble import VotingClassifier, VotingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, classification_report

def voting_model(X, y, models, model_type='class', voting='soft'):
  '''
  A voting ensemble model which combines multiple models and uses a majority vote (for classification) or an average (for regression) to make predictions.

  The data is split 80/20 (random_state=42). The ensemble is trained on the training part and
  evaluated on the held-out part.

  Parameters:
    X: Pandas df or numpy array of the features
    y: Pandas df or numpy array of the target
    models: Type-list of tuples, A list of the (name, model) tuples that will vote
    model_type: Type-str, Default-class, Options - class, reg. Whether the model type is a classification or regression model
    voting: Type-str, Default-soft, Options - soft, hard. The voting style. Only used for class, it is ignored for reg

  Info:
    Hard Voting: Each model predicts a class (for classification tasks), and the final prediction is the class with the majority vote
    Soft Voting: Instead of predicting classes directly, models output probabilities for each class. The final prediction is based on the average probability across models

  Returns:
    model: The trained voting model
    Evaluation metrics: A dict of test set metrics. {"Accuracy": ...} for class, {"MAE": ..., "R2": ...} for reg

  Raises:
    ValueError: If model_type is not one of the supported options
  '''

  # Failing fast instead of hitting an unbound `model` variable further down
  if model_type not in ('class', 'reg'):
    raise ValueError(f"Unknown model_type '{model_type}'. Expected 'class' or 'reg'.")

  # Train test split
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  # Creating the voting model
  if model_type == 'class':
    model = VotingClassifier(estimators=models, voting=voting, n_jobs=-1)
  else:
    model = VotingRegressor(estimators=models, n_jobs=-1)

  # Training the model
  model.fit(X_train, y_train)

  # Predicting and evaluating
  y_pred = model.predict(X_test)

  if model_type == 'class':
    # Imported here so the function does not rely on some other cell having imported it first
    from sklearn.metrics import accuracy_score

    accuracy = accuracy_score(y_test, y_pred)
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    return model, {"Accuracy": accuracy}

  mae = mean_absolute_error(y_test, y_pred)
  r2 = r2_score(y_test, y_pred)
  print(f"Mean Absolute Error: {mae}, R2 Score: {r2}")
  return model, {"MAE": mae, "R2": r2}




### Testing the voting model

In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Define estimators
models = [
    ('DecisionTree', DecisionTreeClassifier(max_depth=5, random_state=42)),
    ('RandomForest', RandomForestClassifier(n_estimators=50, random_state=42)),
    ('LogisticRegression', LogisticRegression())
]

# Train and evaluate VotingClassifier
# The fitted ensemble gets its own name: assigning it to `voting_model` would replace the function
# and make this cell fail on a second run
voting_clf, metrics = voting_model(X, y, models, model_type='class', voting='soft')



Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.91      0.87       105
           1       0.86      0.73      0.79        74

    accuracy                           0.84       179
   macro avg       0.84      0.82      0.83       179
weighted avg       0.84      0.84      0.84       179



## Bagging Model - No HPT

In [16]:
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, classification_report

def bagging_model(X, y, base_model, model_type='class', n_estimators=10, max_samples=0.8):
  '''
  Bagging model to improve a models performance by training the same base model on different random subsets of the data.

  The data is split 80/20 (random_state=42). The ensemble is trained on the training part and
  evaluated on the held-out part.

  Parameters:
    X: Pandas df or numpy array of the features
    y: Pandas df or numpy array of the target
    base_model: The model that will be bagged and improved
    model_type: Type-str, Default-class, Options - class, reg. Whether the model type is a classification or regression model
    n_estimators: Type-int, Default-10, The number of base estimators
    max_samples: Type-float, Default-0.8, The fraction of samples for each estimator

  Returns:
    model: The trained bagging model
    Evaluation metrics: A dict of test set metrics. {"Accuracy": ...} for class, {"MAE": ..., "R2": ...} for reg

  Raises:
    ValueError: If model_type is not one of the supported options
  '''

  # Failing fast instead of hitting an unbound `model` variable further down
  if model_type not in ('class', 'reg'):
    raise ValueError(f"Unknown model_type '{model_type}'. Expected 'class' or 'reg'.")

  # Train test split
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  # Creating the bagging model. The base model is passed positionally
  if model_type == 'class':
    model = BaggingClassifier(base_model, n_estimators=n_estimators, max_samples=max_samples, random_state=42)
  else:
    model = BaggingRegressor(base_model, n_estimators=n_estimators, max_samples=max_samples, random_state=42)

  # Training the model
  model.fit(X_train, y_train)

  # Predicting and evaluating
  y_pred = model.predict(X_test)

  if model_type == 'class':
    # Imported here so the function does not rely on some other cell having imported it first
    from sklearn.metrics import accuracy_score

    accuracy = accuracy_score(y_test, y_pred)
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    return model, {"Accuracy": accuracy}

  mae = mean_absolute_error(y_test, y_pred)
  r2 = r2_score(y_test, y_pred)
  print(f"Mean Absolute Error: {mae}, R2 Score: {r2}")
  return model, {"MAE": mae, "R2": r2}


### Testing the bagging model

In [17]:
from sklearn.tree import DecisionTreeClassifier

# Example for BaggingClassifier
# The fitted ensemble gets its own name: assigning it to `bagging_model` would replace the function
# and make this cell fail on a second run
bagging_clf, metrics = bagging_model(X, y, base_model=DecisionTreeClassifier(), model_type='class')


Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.88      0.85       105
           1       0.81      0.74      0.77        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179

