# Evaluating Model Performance for Numerical Data

### Excercise: Comparing Model Performance
Load a dataset of choice, ameanible to linear regression analysis. You can use the data from the last workshop. Split the data into train - validation - and test. 

In [3]:
import os

# Change the current working directory
os.chdir('/Users/Sari/Documents/Code/Semester 4 workbooks')

In [6]:
import pandas as pd


df = pd.read_csv('FiveThirtyEight_president_polls_02.02.2024.csv')

# make the states into dummies - we will work with them as dummies from here on 
state_dummies = pd.get_dummies(df['state'], prefix='state', drop_first=True)
state_dummies = state_dummies.astype(float) 

# concatenate the dummy variables with the DataFrame
df = pd.concat([df, state_dummies], axis=1)
df = df.drop('state', axis=1)

### 1a: Split your dataset into three parts, namely training, validation and test. 

In [23]:
import numpy as np

train_portion = 0.8
validation_portion = 0.9


data_portions = [
    int(train_portion * len(df)),
    int(validation_portion * len(df)),
]

train_set, validate_set, test_set = np.split(df.sample(frac=1), data_portions)


  return bound(*args, **kwds)


In [26]:
import statsmodels.api as sm

# Create the design matrices - add a constant to the training and validation (and test, whilst we are at it) design matrices
train_X = train_set.drop('poll_id', axis=1)
train_y = train_set['poll_id']

train_X_full = sm.add_constant( train_X )



Ellipsis

### 1b:  Compare 3 models on their performance on the validation set

Define 3 models, one `unsaturated` which uses only the intercept; one `fully saturated` which represents the most complex model you can generate, and one in between. Fit each model to the training data, and generate predictions for each model on the test_set. 

In [None]:
from scipy import stats
from sklearn.metrics import mean_squared_error, mean_absolute_error

def fit_predict_evaluate(train_X, train_y, valid_X, valid_y, credibility_pct=95, n_sims = 10000):
    
    # Fit the model
    model = sm.OLS(train_y, train_X).fit()
    
    # Step 2: Predict new values
    new_predictions = ...
    
    # Step 3: Calculate standard errors of the predictions on the validation set
    cov_matrix = ...
    design_matrix = ...
    variances = np.diag(np.dot(np.dot(design_matrix, cov_matrix), design_matrix.T))
    std_errors_predictions = ...
    
    # Generate posterior samples
    new_predictions_array = np.array(new_predictions)[:, None]  
    std_errors_predictions_array = np.array(std_errors_predictions)[:, None]  
    
    predicted_samples = np.random.normal(loc=new_predictions_array,
                                         scale=std_errors_predictions_array,
                                         size=(..., n_sims))
    
    # Calculate estimates summaries
    monte_carlo_medians = ...
    prediction_intervals = ...
    
    # Calculate metrics
    bias = ...
    rmse = ...
    correlation = ...
    coverage = ...
    
    # Return results as a dictionary
    results = {
        'bias': bias,
        'rmse': rmse,
        'correlation': correlation,
        'coverage': coverage,
        'predictions': new_predictions,
        'prediction_intervals': prediction_intervals
    }

    return results

In [None]:
# calculate scores for each model - this is an example, you don't have to use the exact same variables
score_intercept_only = fit_predict_evaluate(train_X_full[['const']], train_y, valid_X_full[['const']], valid_y, credibility_pct=95) 
score_national = fit_predict_evaluate(train_X_full[['const','term2', 'real_gdp_pct_growth', 'net_approval']], train_y, valid_X_full[['const','term2', 'real_gdp_pct_growth', 'net_approval']], valid_y, credibility_pct=95) 
score_saturated = fit_predict_evaluate(train_X_full, train_y, valid_X_full, valid_y, credibility_pct=95) 

In [None]:
# generate a table with the scores 
scores = [
    {'name': 'Intercept Only', **score_intercept_only},
    {'name': 'National', **score_national},
    {'name': 'Saturated', **score_saturated},
]

# Convert the list of dictionaries to a DataFrame
scores_df_automated = pd.DataFrame(scores)

# Rename the 'name' column to 'Score Type' for clarity
scores_df_automated.rename(columns={'name': 'Score Type'}, inplace=True)

scores_df_automated

### 1c:  Comment on the scores

Can you identify a model to move forward with ? on what grounds ? 
What do the differences in performance over the different metrics tell you about each model's strengths and weaknesses ? 

In [None]:
train_X_full

### 1d: For your best performning model, estimate the generalisation error 

Generate point-estimates and a measure of uncertainty for each error metric. Hint: the most comprehensive way to do this is to generate error distributions. You can do this by calculating each error metric on a separate set of simulated predictions from the empirical predictive posterior distribution. 

In [None]:
# Fit the model (best performing was National on the RMSE metric)
model = sm.OLS(train_y, train_X_full[['const','term2', 'real_gdp_pct_growth', 'net_approval']]).fit()
    
# Step 2: Predict new values
new_predictions = ...
    
# Step 3: Calculate standard errors of the predictions
cov_matrix = ...
design_matrix = ...
variances = ...
std_errors_predictions = ...
    
# Generate posterior samples
new_predictions_array = ...
std_errors_predictions_array = ...
    
predicted_samples = np.random.normal(loc=...,
                                     scale=...,
                                     size=(..., ...)) 

In [None]:
# Calculate metrics
bias_array = ...
rmse_array = ...
correlation_array = ...
# Note: coverage doesn't have a distribution, because it is already a measure which is averaged over simulations. So it is omitted here - we know it from the table above. 
# Combine metrics into a single array where each metric is a column
metrics = {
    'Bias': bias_array,
    'RMSE': rmse_array,
    'Correlation': correlation_array
}


In [None]:
# plot the distributions of each error metric
import matplotlib.pyplot as plt

# Define metric names
metric_names = ['Bias', 'RMSE', 'Correlation']

# Define number of metrics to plot
n_metrics = len(metric_names)

# Adjust rows and columns calculation to avoid empty subplots
n_plots = n_metrics  # Total number of plots needed
n_rows = 1
n_cols = (n_plots + n_rows - 1) // n_rows  # Calculate columns needed, ensuring we have enough space

# Adjust the figsize if necessary
fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 5, n_rows * 5), squeeze=False)  # Make sure axes is always a 2D array

# Flatten axes array for easier iteration
axes_flat = axes.flatten()

for i, name in enumerate(metric_names):
    data = metrics[name]  # Access data directly from dictionary
    median_val = np.median(data)
    confidence_interval = np.percentile(data, [2.5, 97.5])
    
    ax = axes_flat[i]
    ax.hist(data, bins=50, color='skyblue', edgecolor='gray')
    ax.axvline(x=median_val, color='green', linestyle='-', label=f'Median: {median_val:.2f}')
    ax.axvline(x=confidence_interval[0], color='orange', linestyle='--', label=f'95% CI: [{confidence_interval[0]:.2f}, {confidence_interval[1]:.2f}]')
    ax.axvline(x=confidence_interval[1], color='orange', linestyle='--')
    ax.axvline(x=0, color='red', linestyle='--', label='Zero line')  # Highlight 0 with a line
    ax.set_title(f'Histogram of {name}')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    
    # Ensure plot encompasses 0 by adjusting xlim if necessary
    xlims = ax.get_xlim()
    ax.set_xlim(min(xlims[0], 0), max(xlims[1], 0))

# Hide any unused subplots
for j in range(i + 1, n_rows * n_cols):
    fig.delaxes(axes_flat[j])

# Adjust layout for better spacing and display the plot
plt.tight_layout()
plt.show()