# Notebook 3: Baseline Models

**Notebook Purpose**
This notebook establishes baseline performance benchmarks:
1. Train simple, default, baseline models (mean predictor, linear regression, decision tree)
2. Train Random Forest with default hyperparameters
3. Evaluate all models on validation set
4. Establish performance benchmarks for comparison with optimized models

**Key Outputs**
- Baseline performance metrics (RMSE, MAE, R²)
- Performance benchmarks for evaluating more complex models

**Data Leakage Prevention**
All models evaluated on validation set only. Test set remains untouched. These baselines help determine if more complex models provide meaningful improvements.

## Library Imports

In [None]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Data Ingestion

In [None]:
#load in baseline data
training_set_unedited_file_path = '/content/drive/MyDrive/Colab Notebooks/MADS/SIADS 699/Data/Data Splits/Unprocessed/Training Set.csv'
validation_set_unedited__file_path = '/content/drive/MyDrive/Colab Notebooks/MADS/SIADS 699/Data/Data Splits/Unprocessed/Validation Set.csv'
testing_set_unedited__file_path = '/content/drive/MyDrive/Colab Notebooks/MADS/SIADS 699/Data/Data Splits/Unprocessed/Testing Set.csv'


training_df_base = pd.read_csv(training_set_unedited_file_path)
validation_df_base = pd.read_csv(validation_set_unedited__file_path)
testing_df_base = pd.read_csv(testing_set_unedited__file_path)

training_df_base.head()

Unnamed: 0,Cast_id,Latitude,Longitude,Year,Month,Day,Bottom_depth,Depth,Oxygen,Date,...,Rrs_412,Rrs_443,Rrs_469,Rrs_488,Rrs_531,Rrs_547,Rrs_555,Rrs_645,Rrs_667,Rrs_678
0,16494571,54.623,13.028,2002,7,4,12.0,1.5,278.0,2002-07-04,...,0.002476,0.00197,0.002248,0.0022,0.002864,0.002758,0.002608,0.000644,0.000538,0.000604
1,13547703,54.596,18.7737,2002,7,4,49.0,0.0,322.0,2002-07-04,...,0.002618,0.002244,0.002216,0.00224,0.00292,0.00279,0.00266,0.0005,0.000304,0.000334
2,13547702,54.5777,18.7477,2002,7,4,53.0,0.0,320.0,2002-07-04,...,0.002444,0.001922,0.002112,0.00193,0.002808,0.00287,0.00273,0.00056,0.000338,0.00035
3,13547701,54.57,18.68,2002,7,4,31.0,0.0,328.0,2002-07-04,...,0.00237,0.00186,0.002052,0.001876,0.002746,0.002794,0.002656,0.000522,0.000308,0.000322
4,13547700,54.5782,18.661,2002,7,4,33.0,0.0,328.0,2002-07-04,...,0.00237,0.00186,0.002052,0.001876,0.002746,0.002794,0.002656,0.000522,0.000308,0.000322


In [None]:
#load in edited data
training_set_file_path = '/content/drive/MyDrive/Colab Notebooks/MADS/SIADS 699/Data/Data Splits/Processed/Training Set.csv'
validation_set_file_path = '/content/drive/MyDrive/Colab Notebooks/MADS/SIADS 699/Data/Data Splits/Processed/Validation Set.csv'
testing_set_file_path = '/content/drive/MyDrive/Colab Notebooks/MADS/SIADS 699/Data/Data Splits/Processed/Testing Set.csv'

training_df = pd.read_csv(training_set_file_path)
validation_df = pd.read_csv(validation_set_file_path)
testing_df = pd.read_csv(testing_set_file_path)

training_df.head()

Unnamed: 0,Latitude,Longitude,Year,Month,Day,chlor_a,poc,sst,Rrs_412,Rrs_443,...,sst_squared,sst_cubed,log_chlor_a,log_poc,ratio_443_547,ratio_443_555,sst_chlor_interaction,abs_latitude,season,Oxygen
0,54.623,13.028,2002,7,4,3.398178,274.399994,15.715,0.002476,0.00197,...,246.96123,3880.995764,1.223239,5.614587,0.714286,0.755368,19.223206,54.623,Summer,278.0
1,54.596,18.7737,2002,7,4,3.311782,242.399994,16.369999,0.002618,0.002244,...,267.976865,4386.780994,1.197486,5.490589,0.804301,0.843609,19.602851,54.596,Summer,322.0
2,54.5777,18.7477,2002,7,4,5.786841,294.200012,16.1,0.002444,0.001922,...,259.210012,4173.281297,1.755587,5.68426,0.669687,0.704029,28.264945,54.5777,Summer,320.0
3,54.57,18.68,2002,7,4,5.830627,295.600006,16.01,0.00237,0.00186,...,256.320107,4103.684977,1.763125,5.689007,0.665712,0.700301,28.227624,54.57,Summer,328.0
4,54.5782,18.661,2002,7,4,5.830627,295.600006,16.01,0.00237,0.00186,...,256.320107,4103.684977,1.763125,5.689007,0.665712,0.700301,28.227624,54.5782,Summer,328.0


## Baseline Models

### Linear Regression

In [None]:
#first lets drop uneed columns
def drop_cols(df, cols):
  df = df.drop(cols, axis=1)
  return df
for df in [training_df, validation_df, testing_df]:
    df=drop_cols(df, ['Year','Month',	'Day'])

In [None]:
from re import X
#next lets set up a normalization and basleine linear regression model
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.impute import SimpleImputer

def linear_regression_model(training_df,validation_df,target_col='Oxygen'):
  X_train=training_df.drop(target_col, axis=1)
  y_train=training_df[target_col]
  X_val=validation_df.drop(target_col, axis=1)
  y_val=validation_df[target_col]
  preprocessor = ColumnTransformer(
      transformers=[
          ("num", Pipeline([
              ("imputer", SimpleImputer(strategy="median")),
              ("scaler", StandardScaler())
          ]), selector(dtype_include=np.number)),
          ("cat", Pipeline([
              ("imputer", SimpleImputer(strategy="most_frequent")),
              ("ohe", OneHotEncoder(handle_unknown="ignore"))
          ]), selector(dtype_exclude=np.number)),
      ],
      remainder="drop"
  )



  lin_reg = LinearRegression()
  model = Pipeline([
      ("preprocessor", preprocessor),
      ("regressor", lin_reg)
  ])
  model.fit(X_train, y_train)
  y_pred_train = model.predict(X_train)
  rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
  mae_train = mean_absolute_error(y_train, y_pred_train)
  r2_train = r2_score(y_train, y_pred_train)
  y_pred = model.predict(X_val)
  rmse = np.sqrt(mean_squared_error(y_val, y_pred))
  mae = mean_absolute_error(y_val, y_pred)
  r2 = r2_score(y_val, y_pred)
  print(f'RMSE for training: {rmse_train}')
  print(f'R2 for training: {r2_train}')
  print(f'MAE for training: {mae_train}')

  print(f'RMSE: {rmse}')
  print(f'R2: {r2}')
  print(f'MAE: {mae}')
  return model


In [None]:
model_lin_reg = linear_regression_model(training_df, validation_df, target_col='Oxygen')

RMSE for training: 38.84510550925362
R2 for training: 0.5933968568368717
MAE for training: 22.665876029446952
RMSE: 42.27706291131593
R2: 0.5419247784922656
MAE: 26.03985155729286


Random forest

In [None]:

from sklearn.ensemble import RandomForestRegressor
def random_forest_model(training_df,validation_df,target_col='Oxygen'):
  X_train=training_df.drop(target_col, axis=1)
  y_train=training_df[target_col]
  X_val=validation_df.drop(target_col, axis=1)
  y_val=validation_df[target_col]
  preprocessor = ColumnTransformer(
      transformers=[
          ("num", Pipeline([
              ("imputer", SimpleImputer(strategy="median")),
              ("scaler", StandardScaler())
          ]), selector(dtype_include=np.number)),
          ("cat", Pipeline([
              ("imputer", SimpleImputer(strategy="most_frequent")),
              ("ohe", OneHotEncoder(handle_unknown="ignore"))
          ]), selector(dtype_exclude=np.number)),
      ],
      remainder="drop"
  )



  rand_forest = RandomForestRegressor()
  model = Pipeline([
      ("preprocessor", preprocessor),
      ("regressor", rand_forest)
  ])
  model.fit(X_train, y_train)
  y_pred_train = model.predict(X_train)
  rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
  mae_train = mean_absolute_error(y_train, y_pred_train)
  r2_train = r2_score(y_train, y_pred_train)
  y_pred = model.predict(X_val)
  rmse = np.sqrt(mean_squared_error(y_val, y_pred))
  mae = mean_absolute_error(y_val, y_pred)
  r2 = r2_score(y_val, y_pred)
  print(f'RMSE for training: {rmse_train}')
  print(f'R2 for training: {r2_train}')
  print(f'MAE for training: {mae_train}')

  print(f'RMSE: {rmse}')
  print(f'R2: {r2}')
  print(f'MAE: {mae}')
  return model


In [None]:
model_rf = random_forest_model(training_df, validation_df, target_col='Oxygen')

RMSE for training: 11.576293663558301
R2 for training: 0.9638891932304414
MAE for training: 5.601697619306606
RMSE: 40.42544295073941
R2: 0.5811709869663576
MAE: 23.926049005536374


LGBM

In [None]:
!pip install lightgbm
from lightgbm import LGBMRegressor

def LGBM_model(training_df,validation_df,target_col='Oxygen'):
  X_train=training_df.drop(target_col, axis=1)
  y_train=training_df[target_col]
  X_val=validation_df.drop(target_col, axis=1)
  y_val=validation_df[target_col]
  preprocessor = ColumnTransformer(
      transformers=[
          ("num", Pipeline([
              ("imputer", SimpleImputer(strategy="median")),
              ("scaler", StandardScaler())
          ]), selector(dtype_include=np.number)),
          ("cat", Pipeline([
              ("imputer", SimpleImputer(strategy="most_frequent")),
              ("ohe", OneHotEncoder(handle_unknown="ignore"))
          ]), selector(dtype_exclude=np.number)),
      ],
      remainder="drop"
  )



  LGBM = LGBMRegressor()
  model = Pipeline([
      ("preprocessor", preprocessor),
      ("regressor", LGBM)
  ])
  model.fit(X_train, y_train)
  y_pred_train = model.predict(X_train)
  rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
  mae_train = mean_absolute_error(y_train, y_pred_train)
  r2_train = r2_score(y_train, y_pred_train)
  y_pred = model.predict(X_val)
  rmse = np.sqrt(mean_squared_error(y_val, y_pred))
  mae = mean_absolute_error(y_val, y_pred)
  r2 = r2_score(y_val, y_pred)
  print(f'RMSE for training: {rmse_train}')
  print(f'R2 for training: {r2_train}')
  print(f'MAE for training: {mae_train}')

  print(f'RMSE: {rmse}')
  print(f'R2: {r2}')
  print(f'MAE: {mae}')
  return model



In [None]:
model_lgbm = LGBM_model(training_df, validation_df, target_col='Oxygen')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015678 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6185
[LightGBM] [Info] Number of data points in the train set: 31728, number of used features: 31
[LightGBM] [Info] Start training from score 268.322485




RMSE for training: 27.380447986395456
R2 for training: 0.7979870457130018
MAE for training: 15.508172309416235
RMSE: 39.671147787357654
R2: 0.5966549646904631
MAE: 23.047640771951556




Neural Network

In [None]:
from sklearn.neural_network import MLPRegressor

def NN_model(training_df,validation_df,target_col='Oxygen'):
  X_train=training_df.drop(target_col, axis=1)
  y_train=training_df[target_col]
  X_val=validation_df.drop(target_col, axis=1)
  y_val=validation_df[target_col]
  preprocessor = ColumnTransformer(
      transformers=[
          ("num", Pipeline([
              ("imputer", SimpleImputer(strategy="median")),
              ("scaler", StandardScaler())
          ]), selector(dtype_include=np.number)),
          ("cat", Pipeline([
              ("imputer", SimpleImputer(strategy="most_frequent")),
              ("ohe", OneHotEncoder(handle_unknown="ignore"))
          ]), selector(dtype_exclude=np.number)),
      ],
      remainder="drop"
  )



  NN = MLPRegressor(
        hidden_layer_sizes=(256, 128,64),
        activation='relu',
        solver='adam',
        max_iter=500,
        random_state=42
    )

  model = Pipeline([
      ("preprocessor", preprocessor),
      ("regressor", NN)
  ])
  model.fit(X_train, y_train)
  y_pred_train = model.predict(X_train)
  rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
  mae_train = mean_absolute_error(y_train, y_pred_train)
  r2_train = r2_score(y_train, y_pred_train)
  y_pred = model.predict(X_val)
  rmse = np.sqrt(mean_squared_error(y_val, y_pred))
  mae = mean_absolute_error(y_val, y_pred)
  r2 = r2_score(y_val, y_pred)
  print(f'RMSE for training: {rmse_train}')
  print(f'R2 for training: {r2_train}')
  print(f'MAE for training: {mae_train}')

  print(f'RMSE: {rmse}')
  print(f'R2: {r2}')
  print(f'MAE: {mae}')
  return model

In [None]:
model_nn = NN_model(training_df, validation_df, target_col='Oxygen')

RMSE for training: 26.30576713762494
R2 for training: 0.8135338244501624
MAE for training: 15.693301819944438
RMSE: 55.67967303834677
R2: 0.20545127228900528
MAE: 38.59738871965813


XGboosting

In [None]:
from xgboost import XGBRegressor
def XGB_model(training_df,validation_df,target_col='Oxygen'):
  X_train=training_df.drop(target_col, axis=1)
  y_train=training_df[target_col]
  X_val=validation_df.drop(target_col, axis=1)
  y_val=validation_df[target_col]
  preprocessor = ColumnTransformer(
      transformers=[
          ("num", Pipeline([
              ("imputer", SimpleImputer(strategy="median")),
              ("scaler", StandardScaler())
          ]), selector(dtype_include=np.number)),
          ("cat", Pipeline([
              ("imputer", SimpleImputer(strategy="most_frequent")),
              ("ohe", OneHotEncoder(handle_unknown="ignore"))
          ]), selector(dtype_exclude=np.number)),
      ],
      remainder="drop"
  )



  xgb = XGBRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        random_state=42,
        objective='reg:squarederror',
        n_jobs=-1
    )


  model = Pipeline([
      ("preprocessor", preprocessor),
      ("regressor", xgb)
  ])
  model.fit(X_train, y_train)
  y_pred_train = model.predict(X_train)
  rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
  mae_train = mean_absolute_error(y_train, y_pred_train)
  r2_train = r2_score(y_train, y_pred_train)
  y_pred = model.predict(X_val)
  rmse = np.sqrt(mean_squared_error(y_val, y_pred))
  mae = mean_absolute_error(y_val, y_pred)
  r2 = r2_score(y_val, y_pred)
  print(f'RMSE for training: {rmse_train}')
  print(f'R2 for training: {r2_train}')
  print(f'MAE for training: {mae_train}')

  print(f'RMSE: {rmse}')
  print(f'R2: {r2}')
  print(f'MAE: {mae}')
  return model

In [None]:
model_xgb = XGB_model(training_df, validation_df, target_col='Oxygen')

RMSE for training: 21.190632474654613
R2 for training: 0.8789998359173186
MAE for training: 12.41855064797849
RMSE: 39.61031102105686
R2: 0.597891096943769
MAE: 23.534162071890393
