In [82]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score

Reference (course textbook, chapter 13, penalized regression): https://ds-ml-with-python.github.io/Course-Textbook/13-penalized_regression.html

In [83]:
# Load the Ames housing data from disk
ames = pd.read_csv("AmesHousing.csv")

# Flag the columns that have fewer than 100 missing values, then keep only those
good_cols = ames.isna().sum().lt(100)
ames = ames.loc[:, good_cols]

# Discard every row that still contains a missing value
ames = ames.dropna(axis=0, how="any")

In [84]:
# Preview the first five rows of the cleaned data
ames.head(n=5)

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Area,Street,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,Neighborhood,Condition 1,Condition 2,Bldg Type,House Style,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Roof Style,Roof Matl,Exterior 1st,Exterior 2nd,Mas Vnr Area,Exter Qual,Exter Cond,Foundation,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,BsmtFin SF 1,BsmtFin Type 2,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,Heating,Heating QC,Central Air,Electrical,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,Kitchen Qual,TotRms AbvGrd,Functional,Fireplaces,Garage Cars,Garage Area,Paved Drive,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,31770,Pave,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,5,1960,1960,Hip,CompShg,BrkFace,Plywood,112.0,TA,TA,CBlock,TA,Gd,Gd,BLQ,639.0,Unf,0.0,441.0,1080.0,GasA,Fa,Y,SBrkr,1656,0,0,1656,1.0,0.0,1,0,3,1,TA,7,Typ,2,2.0,528.0,P,210,62,0,0,0,0,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,11622,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1961,1961,Gable,CompShg,VinylSd,VinylSd,0.0,TA,TA,CBlock,TA,TA,No,Rec,468.0,LwQ,144.0,270.0,882.0,GasA,TA,Y,SBrkr,896,0,0,896,0.0,0.0,1,0,2,1,TA,5,Typ,0,1.0,730.0,Y,140,0,0,0,120,0,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,14267,Pave,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1958,1958,Hip,CompShg,Wd Sdng,Wd Sdng,108.0,TA,TA,CBlock,TA,TA,No,ALQ,923.0,Unf,0.0,406.0,1329.0,GasA,TA,Y,SBrkr,1329,0,0,1329,0.0,0.0,1,1,3,1,Gd,6,Typ,0,1.0,312.0,Y,393,36,0,0,0,0,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,11160,Pave,Reg,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,7,5,1968,1968,Hip,CompShg,BrkFace,BrkFace,0.0,Gd,TA,CBlock,TA,TA,No,ALQ,1065.0,Unf,0.0,1045.0,2110.0,GasA,Ex,Y,SBrkr,2110,0,0,2110,1.0,0.0,2,1,3,1,Ex,8,Typ,2,2.0,522.0,Y,0,0,0,0,0,0,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,13830,Pave,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1997,1998,Gable,CompShg,VinylSd,VinylSd,0.0,TA,TA,PConc,Gd,TA,No,GLQ,791.0,Unf,0.0,137.0,928.0,GasA,Gd,Y,SBrkr,928,701,0,1629,0.0,0.0,2,1,3,1,TA,6,Typ,1,2.0,482.0,Y,212,34,0,0,0,0,0,3,2010,WD,Normal,189900


In [85]:
# Model library: starts empty; later cells store a score under each model's name
model_library = dict()
# Empty list of records (not used by any of the visible cells yet)
records = list()

# Ordinary Linear Regression

In [None]:
# Predictors: everything except the target and the Order / PID columns.
# PID is an identifier, so it should not be used as a predictor.
X = ames.drop(columns=["SalePrice", "Order", "PID"])
# Target: the sale price
y = ames["SalePrice"]


# Shared preprocessing: one-hot encode the object-typed columns and
# standardize the numeric ones.
ct = ColumnTransformer(
    transformers=[
        (
            "dummify",
            # handle_unknown="ignore": a category that was not seen during fit
            # is skipped at transform time instead of raising an error.
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            make_column_selector(dtype_include=object),
        ),
        (
            "standardize",
            StandardScaler(),
            make_column_selector(dtype_include=np.number),
        ),
    ],
    # Columns matched by neither selector are passed through unchanged
    remainder="passthrough",
)

# Ordinary linear regression on top of the shared preprocessing step
lr_pipeline = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("linear_regression", LinearRegression()),
    ]
)


# 5-fold cross-validated R^2 for the linear-regression pipeline
lr_pipeline_score = cross_val_score(
    estimator=lr_pipeline, X=X, y=y, scoring="r2", cv=5
)

# Record the mean fold score in the library, then display the library
model_library["linear_regression"] = lr_pipeline_score.mean()
model_library

In [None]:
# TODO: add styling to the library of R^2 scores

# Ridge Practice

Make a pipeline that uses all the variables in the Ames dataset, and then fits Ridge Regression with $\lambda = 1$.

Cross-validate this pipeline and compare the results to the ordinary linear regression.

Then fit the model on the whole dataset and get the coefficients. Make a plot of these coefficients compared to the ones from ordinary linear regression.

In [None]:
# Alpha = 1
ridge1 = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("ridge_regression", Ridge(alpha=1.0)),
    ]
)

# Mean of the 5-fold cross-validated R^2 scores
ridge1_score = cross_val_score(estimator=ridge1, X=X, y=y, scoring="r2", cv=5)
ridge1_mean = ridge1_score.mean()
ridge1_mean

Using the same pipeline as previously, perform tuning on lambda.

You should always try lambda values on a log scale; that is, don't use [1, 2, 3, 4]; instead use something like [0.001, 0.01, 0.1, 1, 10].

In [None]:
# Alpha = 0.001
# The pipeline is named ridge0001 so that it matches ridge0001_score /
# ridge0001_mean and no longer shares a name with the alpha = 0.01
# pipeline (ridge001).
ridge0001 = Pipeline(
  [("preprocessing", ct),
   ("ridge_regression", Ridge(alpha=0.001))]
)

# Mean of the 5-fold cross-validated R^2 scores
ridge0001_score = cross_val_score(ridge0001, X, y, cv=5, scoring='r2')
ridge0001_mean = ridge0001_score.mean()
ridge0001_mean

In [None]:
# Ridge with alpha = 0.01
ridge001 = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("ridge_regression", Ridge(alpha=0.01)),
    ]
)

# Mean of the 5-fold cross-validated R^2 scores
ridge001_score = cross_val_score(estimator=ridge001, X=X, y=y, scoring="r2", cv=5)
ridge001_mean = ridge001_score.mean()
ridge001_mean

In [None]:
# Ridge with alpha = 0.1
ridge01 = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("ridge_regression", Ridge(alpha=0.1)),
    ]
)

# Mean of the 5-fold cross-validated R^2 scores
ridge01_score = cross_val_score(estimator=ridge01, X=X, y=y, scoring="r2", cv=5)
ridge01_mean = ridge01_score.mean()
ridge01_mean

In [None]:
# Ridge with alpha = 10
ridge10 = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("ridge_regression", Ridge(alpha=10)),
    ]
)

# Mean of the 5-fold cross-validated R^2 scores
ridge10_score = cross_val_score(estimator=ridge10, X=X, y=y, scoring="r2", cv=5)
ridge10_mean = ridge10_score.mean()
ridge10_mean

In [None]:
# NOTE(review): disabled residual-plot cell. It refers to `p9` and
# `df_residuals`, neither of which is imported or defined in the visible
# cells, and its labels mention "Charges" — presumably carried over from
# another analysis; confirm before re-enabling.
# residual_plot = (
#     p9.ggplot(df_residuals, p9.aes(x='y_pred', y='residuals'))
#     + p9.geom_point(shape='o', color='black', alpha=0.6, size=2)
#     + p9.geom_hline(yintercept=0, color='red', linetype='dashed', size=1)
#     + p9.labs(
#         title='Residual Plot: Final_Regression',
#         x='Fitted Values (Predicted Charges)',
#         y='Residuals'
#     )
#     + p9.theme_minimal()
# )

# residual_plot

**TODO:** This section is not done yet.

# LASSO Practice 

Create a LASSO pipeline, and tune lambda over the values [0.001, 0.01, 0.1, 1, 10].

Fit your best model on the full Ames data, and compare the coefficients to Ridge and OLS 

In [None]:
# LASSO with alpha = 0.001
lasso0001 = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("lasso_regression", Lasso(alpha=0.001)),
    ]
)

# Mean of the 5-fold cross-validated R^2 scores
lasso0001_score = cross_val_score(estimator=lasso0001, X=X, y=y, scoring="r2", cv=5)
lasso0001_mean = lasso0001_score.mean()
lasso0001_mean


In [None]:
# LASSO with alpha = 0.01
lasso001 = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("lasso_regression", Lasso(alpha=0.01)),
    ]
)

# Mean of the 5-fold cross-validated R^2 scores
lasso001_score = cross_val_score(estimator=lasso001, X=X, y=y, scoring="r2", cv=5)
lasso001_mean = lasso001_score.mean()
lasso001_mean

In [None]:
# LASSO with alpha = 0.1
lasso01 = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("lasso_regression", Lasso(alpha=0.1)),
    ]
)

# Mean of the 5-fold cross-validated R^2 scores
lasso01_score = cross_val_score(estimator=lasso01, X=X, y=y, scoring="r2", cv=5)
lasso01_mean = lasso01_score.mean()
lasso01_mean

In [None]:
# LASSO with alpha = 1
lasso1 = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("lasso_regression", Lasso(alpha=1)),
    ]
)

# Mean of the 5-fold cross-validated R^2 scores
lasso1_score = cross_val_score(estimator=lasso1, X=X, y=y, scoring="r2", cv=5)
lasso1_mean = lasso1_score.mean()
lasso1_mean

In [None]:
# LASSO with alpha = 10
lasso10 = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("lasso_regression", Lasso(alpha=10)),
    ]
)

# Mean of the 5-fold cross-validated R^2 scores
lasso10_score = cross_val_score(estimator=lasso10, X=X, y=y, scoring="r2", cv=5)
lasso10_mean = lasso10_score.mean()
lasso10_mean

In [None]:
# Fit the alpha = 10 LASSO pipeline on the full dataset.
# NOTE(review): alpha = 10 is hard-coded as the "best" model here — confirm
# against the cross-validated means before relying on it.
best_fitted = lasso10.fit(X=X, y=y)

# Elastic Net

Create an Elastic Net pipeline, and tune lambda and alpha.

Fit your best model on the full Ames data, and compare the coefficients to Ridge and OLS.

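Keep the same lambdas as above, [0.001, 0.01, 0.1, 1, 10], and choose values at random for alpha (the split between the two penalties). Note that in scikit-learn's `ElasticNet`, lambda corresponds to the `alpha` argument and the split corresponds to `l1_ratio`.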

In [None]:
# TODO: use a grid search for the tuning loop