---
title: "Practice Activity 7.1: Cross-Validation and Tuning"
format: 
  html:
    embed-resources: true
execute:
  echo: true
code-fold: true
author: James Compagno
jupyter: python3
---

In [16]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer

In [17]:
ames = pd.read_csv("AmesHousing.csv")

In [18]:
# Load data and prepare train/test spli
X = ames.drop("SalePrice", axis=1)
y = ames["SalePrice"]
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [19]:
# Model Library 
model_library = {}
records = []

# Practice Activity

Once again consider four modeling options for house price:

    -Using only the size and number of rooms.
    -Using size, number of rooms, and building type.
    -Using size and building type, and their interaction.
    -Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.
    
Use cross_val_score with the pipelines you made earlier to find the cross-validated root mean squared error for each model.

Which do you prefer? Does this agree with your conclusion from earlier?

# Using only the size and number of rooms.

In [20]:
preprocess = ColumnTransformer(
    [
        ("num", "passthrough", ["Gr Liv Area", "TotRms AbvGrd"]),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
).set_output(transform="pandas")

lr_pipeline = Pipeline([
    ("preprocess", preprocess),
    ("linear_regression", LinearRegression())
])

lr_fitted = lr_pipeline.fit(X_train, y_train)
model_library["Size_RoomNums"] = lr_fitted 

In [21]:
y_test_pred = model_library["Size_RoomNums"].predict(X_test)
mse = mean_squared_error(y_test, y_test_pred) 
cross_score = cross_val_score(lr_pipeline, X, y, cv=5, scoring='r2')
records.append({
    "Model": "Size_RoomNums", "Split": "Test",
    "RMSE": np.sqrt(mse), "MSE": mse, "R2": r2_score(y_test, y_test_pred), "Cross_Val_Mean": cross_score.mean()
})

cumulative_models = (pd.DataFrame(records))
cumulative_models

Unnamed: 0,Model,Split,RMSE,MSE,R2,Cross_Val_Mean
0,Size_RoomNums,Test,57246.982094,3277217000.0,0.447958,0.504209


## Using size, number of rooms, and building type.

In [22]:
preprocess = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
        ("num", "passthrough", ["Gr Liv Area", "TotRms AbvGrd"]),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
).set_output(transform="pandas")

lr_pipeline = Pipeline([
    ("preprocess", preprocess),
    ("linear_regression", LinearRegression())
])

lr_fitted = lr_pipeline.fit(X_train, y_train)
model_library["LivArea_Rooms_BlgdType"] = lr_fitted 

In [23]:
y_test_pred = model_library["LivArea_Rooms_BlgdType"].predict(X_test)
mse = mean_squared_error(y_test, y_test_pred)
cross_score = cross_val_score(lr_pipeline, X, y, cv=5, scoring='r2')
records.append({
    "Model": "LivArea_Rooms_BlgdType", "Split": "Test",
    "RMSE": np.sqrt(mse), "MSE": mse, "R2": r2_score(y_test, y_test_pred), "Cross_Val_Mean": cross_score.mean()  
})

cumulative_models = (pd.DataFrame(records))
cumulative_models

Unnamed: 0,Model,Split,RMSE,MSE,R2,Cross_Val_Mean
0,Size_RoomNums,Test,57246.982094,3277217000.0,0.447958,0.504209
1,LivArea_Rooms_BlgdType,Test,55847.506227,3118944000.0,0.474618,0.532882


## Using size and building type, and their interaction.

In [24]:
preprocess = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
        ("num", "passthrough", ["Gr Liv Area"]),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
).set_output(transform="pandas")

lr_pipeline = Pipeline([
    ("preprocess", preprocess),
    ("interaction", PolynomialFeatures(interaction_only=True, include_bias=False)),
    ("linear_regression", LinearRegression())
])

lr_fitted = lr_pipeline.fit(X_train, y_train)
model_library["Size_Type_IntST"] = lr_fitted

In [25]:
y_test_pred = model_library["Size_Type_IntST"].predict(X_test)
mse = mean_squared_error(y_test, y_test_pred)
cross_score = cross_val_score(lr_pipeline, X, y, cv=5, scoring='r2')
records.append({
    "Model": "Size_Type_IntST", "Split": "Test",
    "RMSE": np.sqrt(mse), "MSE": mse, "R2": r2_score(y_test, y_test_pred), "Cross_Val_Mean": cross_score.mean()
})

cumulative_models = pd.DataFrame(records)
cumulative_models

Unnamed: 0,Model,Split,RMSE,MSE,R2,Cross_Val_Mean
0,Size_RoomNums,Test,57246.982094,3277217000.0,0.447958,0.504209
1,LivArea_Rooms_BlgdType,Test,55847.506227,3118944000.0,0.474618,0.532882
2,Size_Type_IntST,Test,54824.234795,3005697000.0,0.493695,0.544867


## Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.

In [26]:
preprocess = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
        ("poly_size", PolynomialFeatures(degree=5, include_bias=False), ["Gr Liv Area"]),
        ("poly_rooms", PolynomialFeatures(degree=5, include_bias=False), ["TotRms AbvGrd"]),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
).set_output(transform="pandas")

lr_pipeline = Pipeline([
    ("preprocess", preprocess),
    ("linear_regression", LinearRegression())
])

lr_fitted = lr_pipeline.fit(X_train, y_train)
model_library["Poly5_Size_Rooms_BlgdType"] = lr_fitted

In [27]:
y_test_pred = model_library["Poly5_Size_Rooms_BlgdType"].predict(X_test)
mse = mean_squared_error(y_test, y_test_pred)
cross_score = cross_val_score(lr_pipeline, X, y, cv=5, scoring='r2')
records.append({
    "Model": "Poly5_Size_Rooms_BlgdType", "Split": "Test",
    "RMSE": np.sqrt(mse), "MSE": mse, "R2": r2_score(y_test, y_test_pred), "Cross_Val_Mean": cross_score.mean()
})

cumulative_models = pd.DataFrame(records)
cumulative_models

Unnamed: 0,Model,Split,RMSE,MSE,R2,Cross_Val_Mean
0,Size_RoomNums,Test,57246.982094,3277217000.0,0.447958,0.504209
1,LivArea_Rooms_BlgdType,Test,55847.506227,3118944000.0,0.474618,0.532882
2,Size_Type_IntST,Test,54824.234795,3005697000.0,0.493695,0.544867
3,Poly5_Size_Rooms_BlgdType,Test,57120.790622,3262785000.0,0.450389,0.49714


Size_Type_IntST is the best across all metrics but only just so

# 13.3.3 Practice Activity

Consider one hundred modeling options for house price:

    -House size, trying degrees 1 through 10
    -Number of rooms, trying degrees 1 through 10
    -Building Type
    
Hint: The dictionary of possible values that you make to give to GridSearchCV will have two elements instead of one.

Q1: Which model performed the best?

Q2: What downsides do you see of trying all possible model options? How might you go about choosing a smaller number of tuning values to try?

## House size, trying degrees 1 through 10

In [37]:
preprocess = ColumnTransformer(
    [
        ("num", "passthrough", ["Gr Liv Area", "TotRms AbvGrd"]),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
).set_output(transform="pandas")

Cross_Val_Test = Pipeline([
    ("preprocess", preprocess),
    ("linear_regression", LinearRegression())
])

model_library["Cross_Val_Test"] = Cross_Val_Test.fit(X_train, y_train)

In [None]:
mse = cross_val_score(Cross_Val_Test, X, y, cv=5, scoring='neg_mean_squared_error')
cross_score = cross_val_score(Cross_Val_Test, X, y, cv=5, scoring='r2')
r2 = cross_val_score(Cross_Val_Test, X, y, cv=5, scoring='r2')
rmse = cross_val_score(Cross_Val_Test, X, y, cv=5, scoring='neg_root_mean_squared_error')
records.append({
    "Model": "Cross_Val_Test", "Split": "CV-5",
    "RMSE": -1*rmse.mean(), "MSE": -1*mse.mean(), "R2": r2.mean()
})

cumulative_models = (pd.DataFrame(records))
cumulative_models

Unnamed: 0,Model,Split,RMSE,MSE,R2,Cross_Val_Mean,Cross_Val_R2_Mean
0,Size_RoomNums,Test,57246.982094,3277216958.833713,0.447958,0.504209,
1,LivArea_Rooms_BlgdType,Test,55847.506227,3118943951.739771,0.474618,0.532882,
2,Size_Type_IntST,Test,54824.234795,3005696720.835666,0.493695,0.544867,
3,Poly5_Size_Rooms_BlgdType,Test,57120.790622,3262784721.332296,0.450389,0.49714,
4,Cross_Val_Test,Test,"[nan, nan, nan, nan, nan]","[-3795549992.67126, -2930471185.770534, -34789...",0.504209,,0.504209
5,Cross_Val_Test,Test,"[nan, nan, nan, nan, nan]",-3136138908.170903,0.504209,,0.504209
6,Cross_Val_Test,Test,"[61608.03513074622, 54133.82663151141, 58982.4...",3136138908.170903,0.504209,,0.504209
7,Cross_Val_Test,Test,"[nan, nan, nan, nan, nan]",3136138908.170903,0.504209,,0.504209
8,Cross_Val_Test,Test,"[-61608.03513074622, -54133.82663151141, -5898...",3136138908.170903,0.504209,,0.504209
9,Cross_Val_Test,Test,55806.326349,3136138908.170903,0.504209,,0.504209


In [43]:
# Preprocessing
preprocess = ColumnTransformer(
    [("num", "passthrough", ["Gr Liv Area", "TotRms AbvGrd"])],
    remainder="drop",
    verbose_feature_names_out=False,
).set_output(transform="pandas")

# Cross Validation
Cross_Val_Test = Pipeline([
    ("preprocess", preprocess),
    ("linear_regression", LinearRegression())
])

# Add to Library
model_library["Cross_Val_Test"] = Cross_Val_Test.fit(X, y)

# Meterics Calculation 
rmse = cross_val_score(Cross_Val_Test, X, y, cv=5, scoring='neg_root_mean_squared_error')
mse = cross_val_score(Cross_Val_Test, X, y, cv=5, scoring='neg_mean_squared_error')
r2 = cross_val_score(Cross_Val_Test, X, y, cv=5, scoring='r2')

# Metrics Storage 
records.append({
    "Model": "Cross_Val_Test",
    "Split": "CV-5",
    "RMSE Mean": -rmse.mean(),
    "MSE Mean": -mse.mean(),
    "R2 Mean": r2.mean()
})

# Display
cumulative_models = (pd.DataFrame(records))
cumulative_models


Unnamed: 0,Model,Split,RMSE,MSE,R2,Cross_Val_Mean,Cross_Val_R2_Mean,RMSE Mean,MSE Mean,R2 Mean
0,Size_RoomNums,Test,57246.982094,3277216958.833713,0.447958,0.504209,,,,
1,LivArea_Rooms_BlgdType,Test,55847.506227,3118943951.739771,0.474618,0.532882,,,,
2,Size_Type_IntST,Test,54824.234795,3005696720.835666,0.493695,0.544867,,,,
3,Poly5_Size_Rooms_BlgdType,Test,57120.790622,3262784721.332296,0.450389,0.49714,,,,
4,Cross_Val_Test,Test,"[nan, nan, nan, nan, nan]","[-3795549992.67126, -2930471185.770534, -34789...",0.504209,,0.504209,,,
5,Cross_Val_Test,Test,"[nan, nan, nan, nan, nan]",-3136138908.170903,0.504209,,0.504209,,,
6,Cross_Val_Test,Test,"[61608.03513074622, 54133.82663151141, 58982.4...",3136138908.170903,0.504209,,0.504209,,,
7,Cross_Val_Test,Test,"[nan, nan, nan, nan, nan]",3136138908.170903,0.504209,,0.504209,,,
8,Cross_Val_Test,Test,"[-61608.03513074622, -54133.82663151141, -5898...",3136138908.170903,0.504209,,0.504209,,,
9,Cross_Val_Test,Test,55806.326349,3136138908.170903,0.504209,,0.504209,,,


## Number of rooms, trying degrees 1 through 10

## Building Type