In [6]:
import os

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures

# Load the Ames housing CSV. The location defaults to the original local path
# but can be overridden with the AMES_CSV_PATH environment variable, so the
# notebook can run on another machine without editing this cell.
ames_csv_path = os.environ.get(
    "AMES_CSV_PATH",
    '/Users/rileysallander/Desktop/ML/ML554/Data/AmesHousing.csv',
)
ames = pd.read_csv(ames_csv_path)

# Model 1: two predictor columns fed to the regression with no preprocessing.
X = ames[["Gr Liv Area", "TotRms AbvGrd"]]
y = ames["SalePrice"]

# A fixed seed makes the 75/25 split (and everything fitted on it) reproducible
# under Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Single-step pipeline so this model has the same interface as the
# preprocessing pipelines built in later cells.
lr_pipeline = Pipeline([
  ("linear_regression", LinearRegression())]
)

# Pipeline.fit returns the pipeline itself, so mod1 and lr_pipeline are the
# same fitted object.
mod1 = lr_pipeline.fit(X_train, y_train)



In [13]:


# Model 2: the two numeric predictors plus the categorical building type.
X2 = ames[["Gr Liv Area", "TotRms AbvGrd", "Bldg Type"]]
y2 = ames["SalePrice"]

# A fixed seed makes the 75/25 split reproducible across kernel restarts.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.25, random_state=42
)

# One-hot encode "Bldg Type" and standardize the two numeric columns.
# All three input columns are consumed by a transformer, so the
# remainder="passthrough" setting has nothing left to pass through here.
ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "passthrough"
).set_output(transform = "pandas")

# NOTE(review): ct_inter is constructed here but is not a step of
# lr_pipeline2 below, so it has no effect on mod2. It keeps only the
# interaction-expanded numeric columns (remainder="drop").
ct_inter = ColumnTransformer(
  [
    ("interaction", PolynomialFeatures(interaction_only = True), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
).set_output(transform = "pandas")

# Preprocess with ct, then fit an ordinary least-squares regression.
lr_pipeline2 = Pipeline([("preprocessing", ct), ("linear_regression", LinearRegression())])

mod2 = lr_pipeline2.fit(X2_train, y2_train)




In [8]:
# Model 3: living-area column, building type, and their pairwise interactions.
X3 = ames[["Gr Liv Area", "Bldg Type"]]
y3 = ames["SalePrice"]

# A fixed seed makes the 75/25 split reproducible across kernel restarts.
X3_train, X3_test, y3_train, y3_test = train_test_split(
    X3, y3, test_size=0.25, random_state=42
)

# Stage 1: one-hot encode "Bldg Type" and standardize "Gr Liv Area".
# With pandas output the resulting columns are prefixed with the transformer
# name (e.g. "standardize__Gr Liv Area", "dummify__Bldg Type_1Fam"), which is
# what stage 2 selects on.
ct2 = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area"])
  ],
  remainder = "passthrough"
).set_output(transform = "pandas")

# Stage 2: one (area x building-type dummy) interaction per category.
# Each entry is (transformer name, stage-1 dummy column to interact with area).
# NOTE(review): the category names are hard-coded and assume the training data
# contains exactly these five "Bldg Type" levels - confirm against the data.
interaction_specs = [
    ("interaction 1fm", "dummify__Bldg Type_1Fam"),
    ("interaction 2fm", "dummify__Bldg Type_2fmCon"),
    ("interaction dup", "dummify__Bldg Type_Duplex"),
    ("interaction twnhE", "dummify__Bldg Type_TwnhsE"),
    ("interaction twnhs", "dummify__Bldg Type_Twnhs"),
]

# include_bias=False: the default would emit a constant column from every one
# of the five transformers, all redundant with LinearRegression's own
# intercept. Dropping them leaves the fitted predictions unchanged in exact
# arithmetic while removing five duplicated constant features.
ct_inter = ColumnTransformer(
  [
    (
      step_name,
      PolynomialFeatures(interaction_only = True, include_bias = False),
      ["standardize__Gr Liv Area", dummy_column],
    )
    for step_name, dummy_column in interaction_specs
  ],
  remainder = "passthrough"
).set_output(transform = "pandas")

# encode/scale -> build interactions -> ordinary least squares
lr_pipeline3 = Pipeline([
    ("preprocessing", ct2),
    ("interact", ct_inter),
    ("linear_regression", LinearRegression()),
])

mod3 = lr_pipeline3.fit(X3_train, y3_train)


In [9]:



# Model 4: degree-5 polynomial expansions of each numeric column plus the
# one-hot encoded building type. Uses the same columns as model 2.
X2 = ames[["Gr Liv Area", "TotRms AbvGrd", "Bldg Type"]]
y2 = ames["SalePrice"]

# A fixed seed makes the 75/25 split reproducible across kernel restarts.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.25, random_state=42
)

# Each numeric column is expanded separately (no cross terms between them);
# include_bias=False leaves the constant term to LinearRegression's intercept.
# NOTE(review): the columns are not standardized before being raised to the
# 5th power, which may make the design matrix poorly conditioned - consider
# scaling first.
# NOTE(review): this rebinds the name `ct` used for model 2's transformer;
# an already-built lr_pipeline2 keeps its own reference to the earlier object.
ct = ColumnTransformer(
    [
        ("poly_area", PolynomialFeatures(degree=5, include_bias=False), ["Gr Liv Area"]),
        ("poly_rooms", PolynomialFeatures(degree=5, include_bias=False), ["TotRms AbvGrd"]),
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"])
    ],
    remainder="passthrough"
).set_output(transform="pandas")

# Define the pipeline with preprocessing and linear regression
lr_pipeline4 = Pipeline([
    ("preprocessing", ct),
    ("linear_regression", LinearRegression())
])

# Fit the model. Stored as mod4 (previously assigned to mod2, which silently
# overwrote the fitted model 2 from the earlier cell).
mod4 = lr_pipeline4.fit(X2_train, y2_train)


In [14]:
# Compare the four pipelines by 5-fold cross-validated RMSE on the full data.
# Each entry is (label shown in the output, pipeline, features, target).
# lr_pipeline4 is scored on X2/y2 because it uses the same columns as model 2.
cv_candidates = [
    ("lr_pipeline", lr_pipeline, X, y),
    ("lr_pipeline2", lr_pipeline2, X2, y2),
    ("lr_pipeline3", lr_pipeline3, X3, y3),
    ("lr_pipeline4", lr_pipeline4, X2, y2),
]

for model_label, candidate, features, target in cv_candidates:
    # scikit-learn reports MSE negated (higher is better), so flip the sign
    # before taking the square root to get a per-fold RMSE.
    cv_scores = cross_val_score(candidate, features, target,
                                cv=5, scoring="neg_mean_squared_error")
    rmse_scores = np.sqrt(-cv_scores)

    # Mean of the per-fold RMSEs; labelled so each line can be matched to
    # its model.
    average_rmse = rmse_scores.mean()
    print(f"Cross-validated RMSE for {model_label}:", average_rmse)

Cross-validated RMSE for the model: 55806.32634926364
Cross-validated RMSE for the model: 54156.04847002992
Cross-validated RMSE for the model: 53438.15076429967
Cross-validated RMSE for the model: 55806.32634926364
Cross-validated RMSE for the model: 54156.04847002992
Cross-validated RMSE for the model: 53438.15076429967


Cross-validated RMSE for the model: 56303.4848404074
Cross-validated RMSE for the model: 56303.4848404074


In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import numpy as np

# Assuming 'ames' DataFrame is already loaded
X2 = ames[["Gr Liv Area", "TotRms AbvGrd", "Bldg Type"]]
y2 = ames["SalePrice"]

# A fixed seed makes the 75/25 split reproducible across kernel restarts.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.25, random_state=42
)

# One-hot encode the building type, standardize the two numeric columns, and
# additionally expand the same two (unscaled) numeric columns with
# PolynomialFeatures. All three input columns are consumed, so
# remainder="passthrough" has nothing left to pass through.
ct_inter = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
        ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
        ("interaction", PolynomialFeatures(interaction_only=True), ["Gr Liv Area", "TotRms AbvGrd"])
    ],
    remainder="passthrough"
).set_output(transform="pandas")

# Pipeline combining the column transformer and linear regression model
# NOTE(review): this rebinds lr_pipeline2, replacing the pipeline of the same
# name defined in an earlier cell; re-running earlier cells afterwards will
# pick up this version.
lr_pipeline2 = Pipeline([("preprocessing", ct_inter), ("linear_regression", LinearRegression())])

# Define the parameter grid for GridSearchCV.
# With interaction_only=True, no input column may appear more than once in a
# term, so two input columns can produce nothing beyond their single pairwise
# product. Degrees 3..10 therefore generate exactly the same features as
# degree 2, and searching them only repeats identical fits. Degree 1 (no
# interaction term) versus degree 2 (with it) are the only distinct candidates.
# To search genuine polynomial degrees, set interaction_only=False instead.
param_grid = {
    'preprocessing__interaction__degree': [1, 2],
}

# Set up GridSearchCV with the pipeline and parameter grid
grid_search = GridSearchCV(lr_pipeline2, param_grid, cv=5, scoring="neg_mean_squared_error", n_jobs=-1)
grid_search.fit(X2_train, y2_train)

# Get the best model and its performance. best_score_ is the mean negated MSE
# across folds, so negate before the square root.
best_model = grid_search.best_estimator_
best_rmse = np.sqrt(-grid_search.best_score_)

# Calculate and print R^2 values from a single set of predictions per split
# (previously the predictions were computed but unused, and .score() predicted
# a second time). r2_score is the same metric a regressor's .score() reports.
y2_train_pred = best_model.predict(X2_train)
y2_test_pred = best_model.predict(X2_test)
r2_train = r2_score(y2_train, y2_train_pred)
r2_test = r2_score(y2_test, y2_test_pred)

print("Best model:", best_model)
print("Best cross-validated RMSE:", best_rmse)
print("Best model R^2 on training set:", r2_train)
print("Best model R^2 on test set:", r2_test)



Best model: Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('dummify',
                                                  OneHotEncoder(sparse_output=False),
                                                  ['Bldg Type']),
                                                 ('standardize',
                                                  StandardScaler(),
                                                  ['Gr Liv Area',
                                                   'TotRms AbvGrd']),
                                                 ('interaction',
                                                  PolynomialFeatures(degree=1,
                                                                     interaction_only=True),
                                                  ['Gr Liv Area',
                                                   'TotRms AbvGrd'])])),
                ('linear_regression'