In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [3]:
ames = pd.read_csv("/Users/richlegendary/Desktop/GSB-S544/AmesHousing.csv")

In [4]:
lr = LinearRegression()
X = ames[["Gr Liv Area", "TotRms AbvGrd"]]
y = ames["SalePrice"]



X_train, X_test, y_train, y_test = train_test_split(X, y)

X_train_s = (X_train - X_train.mean())/X_train.std()

lr_fitted = lr.fit(X_train_s, y_train)
lr_fitted.coef_


array([ 70312.81696396, -15726.68229575])

In [5]:
lr_pipeline = Pipeline(
  [StandardScaler(),
  LinearRegression()]
)

lr_pipeline

In [6]:
lr_pipeline = Pipeline(
  [("standardize", StandardScaler()),
  ("linear_regression", LinearRegression())]
)

lr_pipeline

In [7]:
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)


lr_pipeline = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
)

lr_pipeline

Practice Activity

Consider four possible models for predicting house prices:

Using only the size and number of rooms.
Using size, number of rooms, and building type.
Using size and building type, and their interaction.
Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.
Set up a pipeline for each of these four models.

Then, get predictions on the test set for each of your pipelines, and compute the root mean squared error. Which model performed best?

Note: You should only use the function train_test_split() one time in your code; that is, we should be predicting on the same test set for all three models.

1. Using only the size and number of rooms.

In [8]:
X = ames.drop("SalePrice", axis = 1)
y = ames["SalePrice"]



X_train, X_test, y_train, y_test = train_test_split(X, y)

In [9]:
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(
  [
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)


lr_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
)
lr_fitted = lr_pipeline_1.fit(X_train, y_train)

In [11]:
from sklearn.metrics import mean_squared_error

In [12]:
y_pred_1 = lr_fitted.predict(X_test)
mse1 = mean_squared_error(y_test, y_pred_1)
mse1

3412844753.55724

2. Using size, number of rooms, and building type.

In [13]:
ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)


lr_pipeline_2 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
)

lr_fitted = lr_pipeline_2.fit(X_train, y_train)

y_pred_2 = lr_fitted.predict(X_test)
mse2 = mean_squared_error(y_test, y_pred_2)
mse2

3201532997.1364255

3. Using size and building type, and their interaction.

In [18]:
X["Bldg Type"].unique()

array(['1Fam', 'TwnhsE', 'Twnhs', 'Duplex', '2fmCon'], dtype=object)

we just need to include four of the five building type in our model, because one of them will be the reference level.

In [19]:
ct_dummies = ColumnTransformer(
  [("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"])],
  remainder = "passthrough"
).set_output(transform = "pandas")

ct_inter = ColumnTransformer(
  [
    ("interaction", PolynomialFeatures(interaction_only = True), ["remainder__Gr Liv Area", "dummify__Bldg Type_1Fam","dummify__Bldg Type_TwnhsE","dummify__Bldg Type_Twnhs","dummify__Bldg Type_Duplex"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")

X_train_dummified = ct_dummies.fit_transform(X_train)
X_train_dummified
ct_inter.fit_transform(X_train_dummified)

Unnamed: 0,interaction__1,interaction__remainder__Gr Liv Area,interaction__dummify__Bldg Type_1Fam,interaction__dummify__Bldg Type_TwnhsE,interaction__dummify__Bldg Type_Twnhs,interaction__dummify__Bldg Type_Duplex,interaction__remainder__Gr Liv Area dummify__Bldg Type_1Fam,interaction__remainder__Gr Liv Area dummify__Bldg Type_TwnhsE,interaction__remainder__Gr Liv Area dummify__Bldg Type_Twnhs,interaction__remainder__Gr Liv Area dummify__Bldg Type_Duplex,interaction__dummify__Bldg Type_1Fam dummify__Bldg Type_TwnhsE,interaction__dummify__Bldg Type_1Fam dummify__Bldg Type_Twnhs,interaction__dummify__Bldg Type_1Fam dummify__Bldg Type_Duplex,interaction__dummify__Bldg Type_TwnhsE dummify__Bldg Type_Twnhs,interaction__dummify__Bldg Type_TwnhsE dummify__Bldg Type_Duplex,interaction__dummify__Bldg Type_Twnhs dummify__Bldg Type_Duplex
1161,1.0,1245.0,0.0,1.0,0.0,0.0,0.0,1245.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2773,1.0,1525.0,1.0,0.0,0.0,0.0,1525.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1834,1.0,1378.0,0.0,1.0,0.0,0.0,0.0,1378.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2100,1.0,1494.0,1.0,0.0,0.0,0.0,1494.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
552,1.0,841.0,1.0,0.0,0.0,0.0,841.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1665,1.0,1422.0,1.0,0.0,0.0,0.0,1422.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1648,1.0,1930.0,1.0,0.0,0.0,0.0,1930.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
700,1.0,1701.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
894,1.0,876.0,1.0,0.0,0.0,0.0,876.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
lr_pipeline_3 = Pipeline(
  [("dummies", ct_dummies),
   ("preprocessing", ct_inter),
  ("linear_regression", LinearRegression())]
)

lr_fitted = lr_pipeline_3.fit(X_train, y_train)


y_pred_3 = lr_fitted.predict(X_test)
mse3 = mean_squared_error(y_test, y_pred_3)
mse3

3078821771.794034

4. Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type. 

In [22]:
ct_dummies = ColumnTransformer(
  [("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"])],
  remainder = "passthrough"
).set_output(transform = "pandas")

In [23]:
ct_5degree = ColumnTransformer(
    [('5 degree', PolynomialFeatures(degree = 5, include_bias = False), ['remainder__Gr Liv Area','remainder__TotRms AbvGrd']),
    ],
     remainder = 'drop').set_output(transform = "pandas")

In [24]:
lr_pipeline_4 = Pipeline(
  [("dummies", ct_dummies),
   ("polynomial", ct_5degree),
  ("linear_regression", LinearRegression())]
)


lr_fitted = lr_pipeline_4.fit(X_train, y_train)


y_pred_4 = lr_fitted.predict(X_test)
mse4 = mean_squared_error(y_test, y_pred_4)
mse4

6695077577.554809

cv score

In [50]:
from sklearn.model_selection import cross_val_score

In [60]:
scores = cross_val_score(lr_pipeline_1, X, y, cv=5, scoring='neg_mean_squared_error')
mse1 = abs(scores.mean())

In [65]:
scores = cross_val_score(lr_pipeline_2, X, y, cv=5, scoring='neg_mean_squared_error')
mse2 = abs(scores.mean())
mse2

2950901126.9849825

In [62]:
scores = cross_val_score(lr_pipeline_3, X, y, cv=5, scoring='neg_mean_squared_error')
mse3 = abs(scores.mean())

In [63]:
scores = cross_val_score(lr_pipeline_4, X, y, cv=5, scoring='neg_mean_squared_error')
mse4 = abs(scores.mean())

In [64]:
min(mse1,mse2,mse3,mse4)

2950901126.9849825

The second model is the best, which is the one with size, number of rooms, and building type. This model has the smallest mse, indicating that it did the best job on prediciting Salesprice.

Consider one hundred modeling options for house price:

House size, trying degrees 1 through 10
Number of rooms, trying degrees 1 through 10
Building Type
Hint: The dictionary of possible values that you make to give to GridSearchCV will have two elements instead of one.

Q1: Which model performed the best?

In [36]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
import numpy as np

ct_poly = ColumnTransformer(
  transformers=[
    ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
    ("polynomial1", PolynomialFeatures(), ["Gr Liv Area"]),
    ("polynomial2", PolynomialFeatures(), ["TotRms AbvGrd"]),
  ],
  remainder="drop"
)

lr_pipeline_poly = Pipeline(
  [("preprocessing", ct_poly),
   ("linear_regression", LinearRegression())]
)

degrees = {
    'preprocessing__polynomial1__degree': np.arange(1, 11),  
    'preprocessing__polynomial2__degree': np.arange(1, 11)   
}

gscv = GridSearchCV(lr_pipeline_poly, degrees, cv=5, scoring='r2')
gscv_fitted = gscv.fit(X, y)

result_df = pd.DataFrame({
    "Gr Liv Area Degrees": gscv_fitted.cv_results_['param_preprocessing__polynomial1__degree'],
    "TotRms AbvGrd Degrees": gscv_fitted.cv_results_['param_preprocessing__polynomial2__degree'],
    "Mean Test Score": gscv_fitted.cv_results_['mean_test_score']
})
result_df

Unnamed: 0,Gr Liv Area Degrees,TotRms AbvGrd Degrees,Mean Test Score
0,1,1,0.532882
1,1,2,0.532383
2,1,3,0.535924
3,1,4,0.541529
4,1,5,0.541066
...,...,...,...
95,10,6,-16.187893
96,10,7,-16.187893
97,10,8,-16.187893
98,10,9,-16.187856


In [38]:
max_index = result_df['Mean Test Score'].idxmax()

row_with_max_value = result_df.loc[max_index]

print(row_with_max_value)

Gr Liv Area Degrees             3
TotRms AbvGrd Degrees           1
Mean Test Score          0.557641
Name: 20, dtype: object


With 3 degree of Size and 1 degree on rooms, it generates the highest R2 which is 55.77%. That will be our best model.

Q2: What downsides do you see of trying all possible model options? How might you go about choosing a smaller number of tuning values to try?

The biggest downside is probably overfitting. Like trying 10th degree on size to predict Saleprice just doesn't make sense. It would also be hard to explain the model in words. 
A smaller number of tuning values will reduce this issue, but then underfitting could happen as well.