In [8]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from math import sqrt

# Global-mean baseline: every test row receives the same prediction, the mean
# of the target column over the training rows.
target = "Cases"
df = pd.read_csv("../data/cleaned/final_merged_dataset.csv")

# Temporal split on the Year column: up to and including 2020 is training,
# anything later is held out for evaluation.
train_df = df.loc[df["Year"].le(2020)].copy()
test_df = df.loc[df["Year"].gt(2020)].copy()

# A single constant learned from the training rows only.
global_mean = train_df[target].mean()
test_df["pred"] = global_mean

y_test, y_pred = test_df[target], test_df["pred"]

# Score the constant prediction against the held-out target values.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = sqrt(mse)  # RMSE is reported in the same units as the target

print("===== GLOBAL MEAN BASELINE =====")
print(f"RMSE: {rmse:.4f}")
print(f"MAE : {mae:.4f}")
print(f"R²  : {r2:.4f}")


===== GLOBAL MEAN BASELINE =====
RMSE: 7021.2697
MAE : 3281.2726
R²  : -0.0008


In [9]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from math import sqrt

# State-wise mean baseline: each test row is predicted as the training-set mean
# of the target for that row's State.

# Load data
df = pd.read_csv("../data/cleaned/final_merged_dataset.csv")

target = "Cases"

# Train-test split on Year: rows up to and including 2020 train, later rows test.
train_df = df[df["Year"] <= 2020].copy()
test_df  = df[df["Year"] > 2020].copy()

# Mean of the target per state, averaged over every training row of that state.
state_mean = (
    train_df.groupby("State")[target]
    .mean()
    .reset_index()
    .rename(columns={target: "state_mean"})
)

# Attach the per-state mean to each test row. The left join keeps every test
# row and leaves NaN in "state_mean" when the state has no training rows.
test_pred_df = test_df.merge(state_mean, on="State", how="left")

# Fallback for states missing from the training split: the global training mean.
# The filled column is assigned back explicitly. Calling fillna(inplace=True) on
# the column selection is chained assignment: recent pandas warns about it, and
# under Copy-on-Write it leaves test_pred_df unchanged, so NaNs would reach the
# metrics below.
global_mean = train_df[target].mean()
test_pred_df["state_mean"] = test_pred_df["state_mean"].fillna(global_mean)

# Predictions
y_test = test_pred_df[target]
y_pred = test_pred_df["state_mean"]


mse  = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)  # RMSE is reported in the same units as the target
mae  = mean_absolute_error(y_test, y_pred)
r2   = r2_score(y_test, y_pred)

print("===== STATE-WISE MEAN BASELINE =====")
print(f"RMSE: {rmse:.4f}")
print(f"MAE : {mae:.4f}")
print(f"R²  : {r2:.4f}")

===== STATE-WISE MEAN BASELINE =====
RMSE: 6635.9308
MAE : 2930.7435
R²  : 0.1061
