In [5]:
import pandas as pd
data = pd.read_csv("AmesHousing.csv")
data.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [15]:

X = data.drop("SalePrice", axis = 1)
y = data["SalePrice"]



In [18]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split


ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)

lr_pipeline = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")


scores = cross_val_score(lr_pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
abs(scores.mean())


X_train, X_test, y_train, y_test = train_test_split(X, y)

lr_fitted = lr_pipeline.fit(X_train, y_train)

In [22]:
pipeline1 = Pipeline([
    ("preprocessing", ColumnTransformer([("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])])),
    ("linear_regression", LinearRegression())
])

score = cross_val_score(pipeline1, X, y, cv=5, scoring='neg_mean_squared_error')
abs(score.mean())

3136138908.1709027

In [23]:
ct2 = ColumnTransformer(
    transformers=[
        ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
        ("dummy", OneHotEncoder(sparse_output=False), ["Bldg Type"])
    ],
    remainder="drop"
)

lr_pipeline2 = Pipeline([
    ("preprocessing", ct2),
    ("linear_regression", LinearRegression())
]).set_output(transform="pandas")

scores2 = cross_val_score(lr_pipeline2, X, y, cv=5, scoring="neg_mean_squared_error")
mse2 = abs(scores2.mean())
print("Model 2 Mean Squared Error:", mse2)


Model 2 Mean Squared Error: 2951993958.1007304


In [24]:
from sklearn.preprocessing import PolynomialFeatures

ct3 = ColumnTransformer(
    transformers=[
        ("standardize", StandardScaler(), ["Gr Liv Area"]),
        ("dummy", OneHotEncoder(sparse_output=False), ["Bldg Type"])
    ],
    remainder="drop"
)

lr_pipeline3 = Pipeline([
    ("preprocessing", ct3),
    ("polynomial_features", PolynomialFeatures(interaction_only=True)),
    ("linear_regression", LinearRegression())
]).set_output(transform="pandas")

scores3 = cross_val_score(lr_pipeline3, X, y, cv=5, scoring="neg_mean_squared_error")
mse3 = abs(scores3.mean())
print("Model 3 Mean Squared Error:", mse3)


Model 3 Mean Squared Error: 2871227973.4671288


In [25]:
ct4 = ColumnTransformer(
    transformers=[
        ("poly", PolynomialFeatures(degree=5), ["Gr Liv Area", "TotRms AbvGrd"]),
        ("dummy", OneHotEncoder(sparse_output=False), ["Bldg Type"])
    ],
    remainder="drop"
)

lr_pipeline4 = Pipeline([
    ("preprocessing", ct4),
    ("linear_regression", LinearRegression())
]).set_output(transform="pandas")

scores4 = cross_val_score(lr_pipeline4, X, y, cv=5, scoring="neg_mean_squared_error")
mse4 = abs(scores4.mean())
print("Model 4 Mean Squared Error:", mse4)


Model 4 Mean Squared Error: 3679510052.019084


In [26]:
from sklearn.model_selection import GridSearchCV

ct_poly = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("polynomial", PolynomialFeatures(), ["Gr Liv Area"])
  ],
  remainder = "drop"
)

lr_pipeline_poly = Pipeline(
  [("preprocessing", ct_poly),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

degrees = {'preprocessing__polynomial__degree': np.arange(1, 10)}

gscv = GridSearchCV(lr_pipeline_poly, degrees, cv = 5, scoring='r2')

In [27]:
from sklearn.model_selection import GridSearchCV

ct = ColumnTransformer(
    transformers=[
        ("size_poly", PolynomialFeatures(), ["Gr Liv Area"]),
        ("rooms_poly", PolynomialFeatures(), ["TotRms AbvGrd"]),
        ("building_type", OneHotEncoder(), ["Bldg Type"])
    ],
    remainder="drop"
)

pipeline = Pipeline([
    ("preprocessing", ct),
    ("linear_regression", LinearRegression())
])

param_grid = {
    "preprocessing__size_poly__degree": np.arange(1, 11),
    "preprocessing__rooms_poly__degree": np.arange(1, 11)
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring="neg_mean_squared_error")


In [29]:
grid_search.fit(X, y)

best_params = grid_search.best_params_
best_score = -grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Mean Squared Error:", best_score)


Best Parameters: {'preprocessing__rooms_poly__degree': 1, 'preprocessing__size_poly__degree': 3}
Best Mean Squared Error: 2798020957.7279315
