# **Baseline Notebook**



---
## Setup Environment

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
!pip install -q utstd

from utstd.folders import *
from utstd.ipyrenders import *

at = AtFolder(
    course_code=36106,
    assignment="AT3",
)
at.run()

import warnings
warnings.simplefilter(action='ignore')

---
## Student Information

In [None]:

# Identity fields; each one is rendered by a print_tile(...) cell further down.
group_name = "Group 12"
student_name = "Victor Rono"
student_id = "25669944"

In [None]:
# Do not modify this code
print_tile(size="h1", key='group_name', value=group_name)

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
print_tile(size="h1", key='student_name', value=student_name)

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
print_tile(size="h1", key='student_id', value=student_id)

---
## 0. Python Packages

### 0.a Install Additional Packages

> If you are using additional packages, you need to install them here using the command: `! pip install <package_name>`

### 0.b Import Packages

In [None]:

import pandas as pd
import altair as alt

---
## A. Assess Baseline Model

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
# Load data
try:
  X_train = pd.read_csv(at.folder_path / 'X_train.csv')
  y_train = pd.read_csv(at.folder_path / 'y_train.csv')

  X_val = pd.read_csv(at.folder_path / 'X_val.csv')
  y_val = pd.read_csv(at.folder_path / 'y_val.csv')

  X_test = pd.read_csv(at.folder_path / 'X_test.csv')
  y_test = pd.read_csv(at.folder_path / 'y_test.csv')
except Exception as e:
  print(e)

### A.1 Generate Predictions with Baseline Model

In [None]:
import pandas as pd
import numpy as np # Import numpy

from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, f1_score

RANDOM_STATE = 42

# --- normalize y to Series ---
def _norm_y(y):
    if y is None: return None
    return y.iloc[:,0] if isinstance(y, pd.DataFrame) else pd.Series(y)

# --- targets as 1-D Series (a split that was never loaded becomes None) ---
y_train, y_val, y_test = (
    _norm_y(globals().get(split_name))
    for split_name in ("y_train", "y_val", "y_test")
)

# --- split feature columns by dtype: numeric vs. everything else ---
numeric_mask = [np.issubdtype(col_dtype, np.number) for col_dtype in X_train.dtypes]
num_cols = [name for name, is_num in zip(X_train.columns, numeric_mask) if is_num]
cat_cols = [name for name, is_num in zip(X_train.columns, numeric_mask) if not is_num]

# --- preprocessing: median-impute + scale numerics, mode-impute + one-hot categoricals ---
numeric_steps = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(with_mean=False),  # no centering, so sparse output stays sparse
)
categorical_steps = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_steps, num_cols),
        ("cat", categorical_steps, cat_cols),
    ],
    remainder="drop",
    sparse_threshold=1.0,
)

# --- pick the estimator based on how many classes the training target has ---
n_classes = pd.Series(y_train).nunique(dropna=True)

if n_classes < 2:
    # Single-class training target: LogisticRegression would refuse to fit
    # ("This solver needs samples of at least 2 classes..."), so use the
    # most-frequent dummy instead.
    estimator = DummyClassifier(strategy="most_frequent", random_state=RANDOM_STATE)
    note = "DummyClassifier (most_frequent) — single-class training set"
else:
    estimator = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
    note = "LogisticRegression"

clf = make_pipeline(preprocess, estimator)
clf.fit(X_train, y_train)

# --- predictions for every split ---
y_pred_train, y_pred_val, y_pred_test = (
    clf.predict(features) for features in (X_train, X_val, X_test)
)

# --- validation metrics; a failure is reported instead of aborting the cell ---
try:
    acc = accuracy_score(y_val, y_pred_val)
    f1 = f1_score(y_val, y_pred_val, average="macro", zero_division=0)
    print({"model": note, "val_accuracy": round(acc, 4), "val_macroF1": round(f1, 4)})
except Exception as e:
    print({"model": note, "metric_info": "Could not compute classification metrics", "error": str(e)})

print("Predictions ready → y_pred_train / y_pred_val / y_pred_test")

### A.2 Selection of Performance Metrics

> Provide some explanations on why you believe the performance metrics you chose are appropriate


In [None]:
import numpy as np
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score, confusion_matrix,
    mean_absolute_error, mean_squared_error, r2_score
)

def _is_classification(y: pd.Series) -> bool:
    if y is None:
        return False
    if y.dtype == "object" or str(y.dtype) == "category":
        return True
    return pd.Series(y).nunique(dropna=True) <= 20

# Validation targets and baseline predictions must already exist in the kernel.
assert "y_val" in globals() and y_val is not None, "Need y_val from previous step."
assert "y_pred_val" in globals(), "Need y_pred_val from baseline step."

y_true = y_val if isinstance(y_val, pd.Series) else pd.Series(y_val)
y_pred = y_pred_val if isinstance(y_pred_val, pd.Series) else pd.Series(y_pred_val)

# ---- classification vs regression ----
if _is_classification(y_true):
    # Metrics chosen:
    # - Accuracy: simple baseline understanding
    # - Macro-F1: balances precision/recall per class and treats classes equally (good for imbalance)
    # - Macro Precision/Recall: show trade-off explicitly
    acc = accuracy_score(y_true, y_pred)
    f1m = f1_score(y_true, y_pred, average="macro", zero_division=0)
    prm = precision_score(y_true, y_pred, average="macro", zero_division=0)
    rcl = recall_score(y_true, y_pred, average="macro", zero_division=0)

    # quick class balance view (helps justify macro-F1)
    class_counts = y_true.value_counts(dropna=False).to_dict()

    print({"val_accuracy": round(acc, 4),
           "val_macroF1": round(f1m, 4),
           "val_macroPrecision": round(prm, 4),
           "val_macroRecall": round(rcl, 4)})
    print("Class distribution (val):", class_counts)

    # Confusion matrix for a quick sanity check. The label set is the union of
    # true and predicted classes and is passed explicitly, so the matrix shape
    # always matches the row/column names even when the model predicts a class
    # that is absent from y_true.
    try:
        labels = sorted(set(y_true.unique()) | set(y_pred.unique()))
        cm = pd.DataFrame(confusion_matrix(y_true, y_pred, labels=labels),
                          index=[f"true_{c}" for c in labels],
                          columns=[f"pred_{c}" for c in labels])
        display(cm)
    except Exception as exc:
        # Best-effort display: report the reason instead of hiding it.
        print("Could not build confusion matrix:", exc)

    performance_metrics_explanations = f"""
We report **Accuracy** and **Macro-F1** (with Macro-Precision/Recall) on the validation
set. Accuracy provides an intuitive baseline. However, the class distribution is
{class_counts}, indicating potential imbalance; therefore **Macro-F1** is preferred as
it averages F1 **per class** and treats minority classes equally. Macro-Precision and
Macro-Recall expose the precision–recall trade-off, which is important for deciding how
to tune thresholds later (e.g., prioritising recall when missing a positive is costly).
"""

else:
    # Metrics chosen:
    # - MAE: easy to interpret in original units
    # - RMSE: penalises large errors (sensitive to outliers)
    # - R²: goodness of fit (unitless)
    mae  = mean_absolute_error(y_true, y_pred)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root of the MSE works on every version.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2   = r2_score(y_true, y_pred)

    print({"val_MAE": round(mae, 4), "val_RMSE": round(rmse, 4), "val_R2": round(r2, 4)})

In [None]:

# The metrics cell above already builds the classification rationale when the
# target is categorical. Only substitute the regression rationale when the
# target is NOT a classification target; assigning it unconditionally would
# replace the classification text with an explanation of MAE/RMSE/R².
if not _is_classification(y_true):
    performance_metrics_explanations = """
**MAE** is directly
interpretable in the target's units and is robust to outliers. **RMSE** penalises
large errors more heavily, which is useful when big misses are costly. **R²** shows
overall goodness-of-fit (unitless) and complements the scale-dependent error metrics.
Together these provide a balanced view of typical error size, outlier sensitivity, and fit quality.
"""

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
print_tile(size="h3", key='performance_metrics_explanations', value=performance_metrics_explanations)

### A.3 Baseline Model Performance

> Provide some explanations on model performance


In [None]:
import numpy as np
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score, confusion_matrix,
    mean_absolute_error, mean_squared_error, r2_score
)

def _is_classification(y: pd.Series) -> bool:
    if y is None:
        return False
    if y.dtype == "object" or str(y.dtype) == "category":
        return True
    return pd.Series(y).nunique(dropna=True) <= 20

# Validation targets and baseline predictions must already exist in the kernel.
assert "y_val" in globals() and y_val is not None, "Need y_val from previous step."
assert "y_pred_val" in globals(), "Need y_pred_val from baseline step."

y_true = y_val if isinstance(y_val, pd.Series) else pd.Series(y_val)
y_pred = y_pred_val if isinstance(y_pred_val, pd.Series) else pd.Series(y_pred_val)

# classification vs regression
if _is_classification(y_true):
    acc = accuracy_score(y_true, y_pred)
    f1m = f1_score(y_true, y_pred, average="macro", zero_division=0)
    prm = precision_score(y_true, y_pred, average="macro", zero_division=0)
    rcl = recall_score(y_true, y_pred, average="macro", zero_division=0)

    # quick class balance view (helps justify macro-F1)
    class_counts = y_true.value_counts(dropna=False).to_dict()

    print({"val_accuracy": round(acc, 4),
           "val_macroF1": round(f1m, 4),
           "val_macroPrecision": round(prm, 4),
           "val_macroRecall": round(rcl, 4)})
    print("Class distribution (val):", class_counts)

    # Confusion matrix for a quick sanity check. The label set is the union of
    # true and predicted classes and is passed explicitly, so the matrix shape
    # always matches the row/column names even when the model predicts a class
    # that is absent from y_true.
    try:
        labels = sorted(set(y_true.unique()) | set(y_pred.unique()))
        cm = pd.DataFrame(confusion_matrix(y_true, y_pred, labels=labels),
                          index=[f"true_{c}" for c in labels],
                          columns=[f"pred_{c}" for c in labels])
        display(cm)
    except Exception as exc:
        # Best-effort display: report the reason instead of hiding it.
        print("Could not build confusion matrix:", exc)

else:
    # Metrics chosen:
    # - MAE: easy to interpret in original units
    # - RMSE: penalises large errors (sensitive to outliers)
    # - R²: goodness of fit (unitless)
    mae  = mean_absolute_error(y_true, y_pred)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root of the MSE works on every version.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2   = r2_score(y_true, y_pred)

    print({"val_MAE": round(mae, 4), "val_RMSE": round(rmse, 4), "val_R2": round(r2, 4)})

In [None]:
# Written interpretation of the baseline's validation results; the following
# cell renders this string with print_tile(...).
baseline_performance_explanations = """
The baseline model achieved perfect scores (Accuracy, Macro-F1, Macro Precision, and Macro Recall all equal to 1.0) on the validation set. However, the class distribution on the validation set shows that only a single class (class 0) is present. This indicates that the model, which was a DummyClassifier predicting the most frequent class due to the training data also containing only one class, is simply predicting the majority class for all instances in the validation set. Therefore, these perfect scores do not represent a meaningful evaluation of the model's ability to distinguish between different classes, as there is only one class to predict. This highlights an issue with the dataset split where the validation set does not contain instances of all classes.
"""

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
print_tile(size="h3", key='baseline_performance_explanations', value=baseline_performance_explanations)