In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder

In [2]:
iowa_file_path = 'train.csv'
home_data = pd.read_csv(iowa_file_path, index_col='Id')

In [3]:
numeric_cols = [col for col in home_data.columns if home_data[col].dtype in ['int64', 'float64']]
X_num = home_data[numeric_cols]

data_1 = X_num.isnull().sum()
data_1 = data_1[data_1 > 0]
print(data_1)
cols_1 = data_1.index
print(cols_1)

LotFrontage    259
MasVnrArea       8
GarageYrBlt     81
dtype: int64
Index(['LotFrontage', 'MasVnrArea', 'GarageYrBlt'], dtype='object')


In [4]:
X_num = X_num.drop(cols_1, axis=1)
X_num = X_num.drop('SalePrice', axis=1)

In [5]:
object_cols = np.array([col for col in home_data.columns if home_data[col].dtype == "object"])
X_cat = home_data[object_cols]
data_2 = X_cat.isnull().sum()
data_2 = data_2[data_2 > 0]
print(data_2)
cols_2 = data_2.index

Alley           1369
MasVnrType         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


In [6]:
X_cat = X_cat.drop(cols_2, axis=1)

In [7]:
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols = pd.DataFrame(OH_encoder.fit_transform(X_cat))
OH_cols.index = X_cat.index
X = pd.concat([X_num, OH_cols], axis=1)

In [8]:
y = home_data.SalePrice

In [9]:
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 1)

In [10]:
from sklearn.ensemble import RandomForestRegressor

forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(train_X, train_y)
forest_predictions = forest_model.predict(val_X)



In [11]:
forest_mae = mean_absolute_error(val_y, forest_predictions)
print(forest_mae)

16473.276584474886


In [12]:
import xgboost as xgb

xgb_model = xgb.XGBRegressor()
xgb_model.fit(train_X, train_y, 
             early_stopping_rounds=5, 
             eval_set=[(val_X, val_y)],
             verbose=False)
xgb_predictions = xgb_model.predict(val_X)



In [13]:
xgb_mae = mean_absolute_error(val_y, xgb_predictions)
print(xgb_mae)

# scores = -1*cross_val_score(xgb_model, X, y, cv=5, scoring='neg_mean_absolute_error')
# print(scores)
# print(np.mean(scores))

16058.633690068493


In [14]:
from sklearn.linear_model import Lasso

lasso_model = Lasso(alpha=0.1)
lasso_model.fit(train_X, train_y)
lasso_predictions = lasso_model.predict(val_X)

  model = cd_fast.enet_coordinate_descent(


In [15]:
lasso_mae = mean_absolute_error(val_y, lasso_predictions)
print(lasso_mae)

18474.320378030996


In [16]:
from sklearn.linear_model import Ridge

ridge_model = Ridge(alpha=0.1)
ridge_model.fit(train_X, train_y)
ridge_predictions = ridge_model.predict(val_X)



In [17]:
ridge_mae = mean_absolute_error(val_y, ridge_predictions)
print(ridge_mae)

18684.345079638726


In [18]:
from sklearn.linear_model import ElasticNet

elastic_model = ElasticNet(alpha=0.1)
elastic_model.fit(train_X, train_y)
elastic_predictions = elastic_model.predict(val_X)

  model = cd_fast.enet_coordinate_descent(


In [19]:
elastic_mae = mean_absolute_error(val_y, elastic_predictions)
print(elastic_mae)

18144.55231438312


In [20]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

bag_xgb_model = BaggingRegressor(base_estimator=xgb.XGBRegressor())
bag_xgb_model.fit(train_X, train_y)
bag_xgb_predictions = bag_xgb_model.predict(val_X)



In [21]:
bag_xgb_mae = mean_absolute_error(val_y, bag_xgb_predictions)
print(bag_xgb_mae)

14433.110177654109


In [22]:
bag_tree_model = BaggingRegressor(base_estimator=DecisionTreeRegressor())
bag_tree_model.fit(train_X, train_y)
bag_tree_predictions = bag_tree_model.predict(val_X)

bag_tree_mae = mean_absolute_error(val_y, bag_tree_predictions)
print(bag_tree_mae)



17823.9597260274




In [23]:
bag_forest_model = BaggingRegressor(base_estimator=RandomForestRegressor(random_state=1))
bag_forest_model.fit(train_X, train_y)
bag_forest_predictions = bag_forest_model.predict(val_X)

bag_forest_mae = mean_absolute_error(val_y, bag_forest_predictions)
print(bag_forest_mae)



16925.75950471842


