## Chapter 13 Practice Activities (7.1)

In [59]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer

In [60]:
ames = pd.read_csv(r"C:\Users\broga\OneDrive\Desktop\MSBA\Computing_ML\In_class\Week_7\AmesHousing.csv")

housing = ames.rename(columns={'Gr Liv Area':'Size',
                               'TotRms AbvGrd':'Number of Rooms',
                               'Bldg Type': 'Building Type'}
                               )

## PA 1 

In [61]:
X = housing.drop("SalePrice", axis = 1)
y = housing["SalePrice"]
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [62]:
#Size and # of rooms
ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), []),
    ("standardize", StandardScaler(), ["Size", "Number of Rooms"])
  ],
  remainder = "drop"
)


lr_pipeline1 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

lr_pipeline1.fit(X_train, y_train)

predictions = lr_pipeline1.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, predictions))

print(f'Root Mean Squared Error: {rmse}')

Root Mean Squared Error: 53902.6433782437


In [63]:
#Size, # of rooms, Building Type
ct = ColumnTransformer(
  [("dummify", OneHotEncoder(sparse_output = False), ["Building Type"]),
  ("standardize", StandardScaler(), ["Size", "Number of Rooms"])
  ],
  remainder = "drop"
).set_output(transform = "pandas")


lr_pipeline2 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

lr_pipeline2.fit(X_train, y_train)

y_pred = lr_pipeline2.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f'Root Mean Squared Error: {rmse}')

Root Mean Squared Error: 51910.46539155018


In [64]:
#Size + Building + Interaction

ct = ColumnTransformer(
  [("dummify", OneHotEncoder(sparse_output = False), ["Building Type"]),
   ("standardize", StandardScaler(), ["Size"])
  ],
  remainder = "passthrough"
).set_output(transform = "pandas")

ct_inter = ColumnTransformer(
  [
    ("interaction", PolynomialFeatures(interaction_only = True), ["remainder__Number of Rooms", "dummify__Building Type_1Fam"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")


lr_pipelineInt = Pipeline(
  [("preprocessing", ct),
   ("interaction", ct_inter),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

lr_pipelineInt.fit(X_train, y_train)

y_pred = lr_pipelineInt.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f'Root Mean Squared Error: {rmse}')

Root Mean Squared Error: 63172.2458351562


In [65]:
#5 degree size, 5 degree # of rooms, Building Type

#Size, # of rooms, Building Type
ct = ColumnTransformer(
  [("dummify", OneHotEncoder(sparse_output = False), ["Building Type"]),
   ("polynomial", PolynomialFeatures(degree=5), ["Size","Number of Rooms"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")


lr_pipeline4 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

lr_pipeline4.fit(X_train, y_train)

y_pred = lr_pipeline4.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f'Root Mean Squared Error: {rmse}')

Root Mean Squared Error: 66626.42258267608


Model 2 performed the best with an RMSE of 53763.2785

## PA 2

In [66]:
X = housing.drop("SalePrice", axis = 1)
y = housing["SalePrice"]
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [67]:
from sklearn.model_selection import cross_val_score
#Size and # of rooms

ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), []),
    ("standardize", StandardScaler(), ["Size", "Number of Rooms"])
  ],
  remainder = "drop"
)

lr_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")


scores1 = cross_val_score(lr_pipeline1, X, y, cv=5, scoring='neg_mean_squared_error')
scores1 = np.sqrt(-scores1)
scores1.mean()

55806.32634926364

In [68]:
#Size, # of rooms, Building Type
ct = ColumnTransformer(
  [("dummify", OneHotEncoder(sparse_output = False), ["Building Type"]),
  ("standardize", StandardScaler(), ["Size", "Number of Rooms"])
  ],
  remainder = "drop"
).set_output(transform = "pandas")


lr_pipeline2 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

scores2 = cross_val_score(lr_pipeline2, X, y, cv=5, scoring='neg_mean_squared_error')
scores2 = np.sqrt(-scores2)
scores2.mean()

54168.081429193844

In [69]:
#Size + Building + Interaction

ct = ColumnTransformer(
  [("dummify", OneHotEncoder(sparse_output = False), ["Building Type"]),
   ("standardize", StandardScaler(), ["Size"])
  ],
  remainder = "passthrough"
).set_output(transform = "pandas")

ct_inter = ColumnTransformer(
  [
    ("interaction", PolynomialFeatures(interaction_only = True), ["remainder__Number of Rooms", "dummify__Building Type_1Fam"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")


lr_pipelineint = Pipeline(
  [("preprocessing", ct),
   ("interaction", ct_inter),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

scoresInt = cross_val_score(lr_pipelineInt, X, y, cv=5, scoring='neg_mean_squared_error')
scoresInt = np.sqrt(-scoresInt)
scoresInt.mean()

65218.89541567216

In [70]:
#5 degree size, 5 degree # of rooms, Building Type

#Size, # of rooms, Building Type
ct = ColumnTransformer(
  [("dummify", OneHotEncoder(sparse_output = False), ["Building Type"]),
   ("polynomial", PolynomialFeatures(degree=5), ["Size","Number of Rooms"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")


lr_pipeline4 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

scores4 = cross_val_score(lr_pipeline4, X, y, cv=5, scoring='neg_mean_squared_error')
scores4 = np.sqrt(-scores4)
scores4.mean()

59864.4399358628

Model 2 performed the best again with a average RMSE of 54168.0814, and I prefer the cross_val_score method/answer as it requires less steps and provides a more accurate assessment.

## PA 3

In [71]:


ct_poly = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Building Type"]),
    ("polynomial", PolynomialFeatures(), ["Size","Number of Rooms"])
  ],
  remainder = "drop"
)

lr_pipeline_poly = Pipeline(
  [("preprocessing", ct_poly),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

degrees = {'preprocessing__polynomial__degree': np.arange(1, 10)}

gscv = GridSearchCV(lr_pipeline_poly, degrees, cv = 5, scoring='r2')

In [72]:
from sklearn.model_selection import GridSearchCV

ct_poly = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output=False), ["Building Type"]),
        ("size_poly", PolynomialFeatures(), ["Size"]),
        ("rooms_poly", PolynomialFeatures(), ["Number of Rooms"])
    ],
    remainder="drop"
)

lr_pipeline_poly = Pipeline(
    [("preprocessing", ct_poly),
     ("standardize", StandardScaler()),
     ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

degrees = {
    'preprocessing__size_poly__degree': np.arange(1, 11),
    'preprocessing__rooms_poly__degree': np.arange(1, 11)
}

gscv = GridSearchCV(lr_pipeline_poly, degrees, cv=5, scoring='r2')

In [73]:
gscv_fitted = gscv.fit(X, y)
# Best parameters and best score
print("Best parameters:", gscv.best_params_)
print("Best R-squared score:", gscv.best_score_)

Best parameters: {'preprocessing__rooms_poly__degree': 1, 'preprocessing__size_poly__degree': 3}
Best R-squared score: 0.558011938596334


1. The model that performed the best (R^2: 0.5580)  was the model with the 'Size' parameter to degree 3 and the "Number of Rooms" parameter to degree 1.

2. Trying all possible model options with a wide range of parameters takes a lot of computational power and can take more time when searching for the best model. You can limit the number of degrees or models to make the process more efficient.