In [126]:
#import libraries

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [127]:
# Load the competition training set into a DataFrame and preview the first rows.

train_csv = './data/Housing Prices Competition/train.csv'
df = pd.read_csv(train_csv)
df.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [135]:
# Separate the frame into the feature matrix (X) and the target column (y).

X = df.drop(columns='SalePrice')
y = df['SalePrice']

In [136]:
# Remove the columns that are excluded from modelling.
drop_cols = ["Alley", "PoolQC", "Fence", "MiscFeature", "Id"]
X = X.drop(drop_cols, axis="columns")

In [137]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [138]:
numeric = X_train.select_dtypes(include=[np.number]).columns
categoric = X_train.select_dtypes(include=[np.number]).columns

In [160]:
numeric_PL = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("scaler", StandardScaler()),
    ("select", SelectKBest(k=10))])
categoric_PL = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))])

preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_PL, numeric),
        ("categoric", categoric_PL, categoric)])

In [161]:
# End-to-end estimator: preprocessing followed by an L2-regularised linear
# regression, so both steps are fitted together on the training data.
model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", Ridge(alpha=1.0)),
])

In [162]:
# Fit the whole pipeline on the training split, then score both splits.
model.fit(X_train, y_train)

# Keep train and test predictions in separate variables so they cannot be
# confused; `y_pred` still ends up holding the test-set predictions.
y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)

train_mae = mean_absolute_error(y_train, y_pred_train)
test_mae = mean_absolute_error(y_test, y_pred)

# A test MAE far above the train MAE indicates overfitting.
print(f"Train_MAE: {train_mae:.2f}")
print(f"Test_MAE: {test_mae:.2f}")

#score = model.score(X_test, y_test)
#print(f"score: ", score)

Train_MAE:  2012.3890647840258
Test_MAE:  20585.711117267412
