In [None]:
#Importing packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from scipy import stats
import xgboost as xgb

In [None]:
#Load the training set and preview its first rows, transposed so that every column is listed
train = pd.read_csv('../input/house-price-feature-eng/train.csv')
#Let pandas display as many rows as the training set has (plus one) before truncating output
pd.set_option('display.max_rows', len(train) + 1)
train.head().transpose()

In [None]:
#Columns with at least one missing value, and the share of missing rows in each (ascending)
missingvals = train.columns[train.isnull().any()]
mean_missing = train[missingvals].isna().mean().sort_values(ascending=True)
mean_missing

In [None]:
#To make sure no missing values remain (in the test set as well), every column is assigned to a group that determines its fill value.
#Columns where a value may be missing because the feature does not exist (e.g. no basement) are filled with 'None'.
#Other categorical columns are filled with 'Unknown'.
#Numerical columns are filled with either 0 or the column mean, whichever seems reasonable.

missing_none = ['Condition1','Condition2','CentralAir','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','FireplaceQu','GarageCond','GarageQual','GarageFinish','GarageType','MasVnrType','PoolQC','Fence','Exterior2nd','Heating','HeatingQC','KitchenQual','PavedDrive']
missing_zero = ['MasVnrArea','BsmtFinSF2','BsmtFinSF1','BsmtUnfSF','TotalBsmtSF','LowQualFinSF','Fireplaces','GarageCars','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal']
missing_mean = ['LotArea','1stFlrSF', '2ndFlrSF', 'GrLivArea','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','TotRmsAbvGrd','MoSold','YrSold','YearBuilt','YearRemodAdd','BedroomAbvGr','KitchenAbvGr']
#Catch-all group: every remaining column of the training set except the target.
#NOTE(review): this is not restricted to categorical columns, so numeric columns that appear in none of the lists above (e.g. 'MSSubClass', 'LotFrontage') also land here - confirm this is intended.
missing_unknown = [col for col in train.columns if col not in missing_none and col not in missing_zero and col not in missing_mean and col != 'SalePrice']

In [None]:
# Two specific columns with missing values in the training set will be filled with a specific value:
# For Garage year built, we decided to use the building's year built as a fill value
# For LotFrontage (road connected to the property), we used the mean of the neighborhood as a fill value

def replace_missingvals(df):
    """Fill missing values in ``df`` according to the column groups defined above.

    The frame is modified in place and also returned.

    Fill rules:
      * ``GarageYrBlt``  -> the row's ``YearBuilt``.
      * ``LotFrontage``  -> mean ``LotFrontage`` of the row's ``Neighborhood``;
        if that neighborhood has no observed value at all, the overall mean.
      * ``missing_none`` -> ``'None'``.
      * ``missing_unknown`` -> ``'Unknown'`` for text columns. Numeric columns in
        this catch-all group get the rounded column mean instead, so a numeric
        column is never turned into a mixed string/number column.
      * ``missing_zero`` -> ``0``.
      * ``missing_mean`` -> rounded column mean (computed on ``df`` itself).
    """
    df['GarageYrBlt'] = df['GarageYrBlt'].fillna(df['YearBuilt'])

    frontage_by_neighborhood = df.groupby('Neighborhood')['LotFrontage'].transform('mean')
    df['LotFrontage'] = (
        df['LotFrontage']
        .fillna(frontage_by_neighborhood)
        # Fallback for neighborhoods whose LotFrontage is missing on every row
        .fillna(df['LotFrontage'].mean())
    )

    df[missing_none] = df[missing_none].fillna('None')

    # Split the catch-all group by dtype. A column that is entirely empty carries
    # no numeric information, so it keeps the original 'Unknown' treatment.
    numeric_unknown = [
        col for col in missing_unknown
        if pd.api.types.is_numeric_dtype(df[col]) and df[col].notna().any()
    ]
    text_unknown = [col for col in missing_unknown if col not in numeric_unknown]
    if text_unknown:
        df[text_unknown] = df[text_unknown].fillna('Unknown')
    if numeric_unknown:
        df[numeric_unknown] = df[numeric_unknown].fillna(df[numeric_unknown].mean().round())

    df[missing_zero] = df[missing_zero].fillna(0)
    df[missing_mean] = df[missing_mean].fillna(df[missing_mean].mean().round())
    return df

In [None]:
#Fill the gaps in the train set, then list the columns that still contain missing values (none expected)
train = replace_missingvals(train)
train.columns[train.isna().any()]

In [None]:
#For ordinal columns, we wanted to make sure the meaning is preserved after encoding, so we saved all the categories in the intended ascending order.
#ordcols    - every ordinal column
#ratingcols - the subset that shares the Po/Fa/TA/Gd/Ex quality scale
#orddict    - the remaining ordinal columns, each with its own scale
#Each scale contains the placeholder that replace_missingvals writes for that column:
#'None' for columns listed in missing_none, 'Unknown' for columns that fall into missing_unknown.
ordcols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
       'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC','LotShape','LandSlope',
           'BsmtExposure','BsmtFinType1',
       'BsmtFinType2', 'Functional','GarageFinish',]
ratingcols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
       'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']
orddict = {
    #LotShape is not in missing_none, so its placeholder is 'Unknown' (it was listed as 'None',
    #which the encoder would never see and which would make a filled value fail to encode)
    'LotShape':['IR3','IR2','Unknown','IR1','Reg'],
    'LandSlope':['Sev','Mod','Unknown','Gtl'],
     'BsmtExposure': ['No','Mn','None','Av','Gd'],
    'BsmtFinType1': ['Unf','LwQ','Rec','None','BLQ','ALQ','GLQ'],
    'BsmtFinType2': ['Unf','LwQ','Rec','None','BLQ','ALQ','GLQ'],
    'Functional':['Sal','Sev','Maj2','Maj1','Unknown','Mod','Min2','Min1','Typ'],
    'GarageFinish':['Unf','RFn','None','Fin'],  
}

In [None]:
#Encoding ordinal columns
def ordinal_encode(df):
    """Replace the ordinal string categories in ``df`` with their position on the scale.

    Columns in ``ratingcols`` share the quality scale Po < Fa < None < TA < Gd < Ex;
    columns in ``orddict`` use the scale stored for them there. The frame is
    modified in place and also returned.

    The function expects the string categories as input, so it is meant to be
    applied once per frame (after missing values have been filled).
    """
    rating_scale = ['Po','Fa','None','TA','Gd','Ex']
    for col in ratingcols:
        # Rating columns that are not in missing_none are filled with 'Unknown'
        # rather than 'None'; map it onto the scale's placeholder so a filled
        # value does not make the encoder fail on an unseen category.
        values = df[col].replace('Unknown', 'None').to_numpy().reshape(-1, 1)
        enc = OrdinalEncoder(categories=[rating_scale])
        # fit_transform returns an (n, 1) array; flatten it before assigning to one column
        df[col] = enc.fit_transform(values).ravel()
    for col, scale in orddict.items():
        values = df[col].to_numpy().reshape(-1, 1)
        enc = OrdinalEncoder(categories=[scale])
        df[col] = enc.fit_transform(values).ravel()
    return df

In [None]:
#Encoding ordinal columns in the train set.
#ordinal_encode overwrites the listed columns of the frame it receives, so `train` holds the encoded values from here on.
train = ordinal_encode(train)
train.head()

In [None]:
#Making sure the columns are encoded correctly: preview only the ordinal columns after encoding
train[ordcols].head()

In [None]:
#Remember which columns are still text (object dtype); these are the ones one-hot encoded later
objectcols = train.select_dtypes(include='object').columns
objectcols

In [None]:
#Numeric columns = everything that is neither a remaining text column nor an ordinal column; plot their distributions
numcols = [col for col in train.columns if not (col in objectcols or col in ordcols)]
train[numcols].hist(bins=100, figsize=(15, 10), xlabelsize=0, ylabelsize=0)

It seems like some of the columns have skewed distributions, so we will apply a log transformation to relevant columns. 

In [None]:
#Function that transforms columns to log of columns
cols_to_log = ['MSSubClass','LotFrontage', 'LotArea','BsmtFinSF1','BsmtUnfSF','TotalBsmtSF','1stFlrSF','GrLivArea',
     'OpenPorchSF']

def log_cols(df):
    """Replace every column in ``cols_to_log`` with its natural logarithm.

    Values that are zero or negative have no logarithm and are set to 0 instead.
    The frame is modified in place and also returned. Each call applies the
    logarithm again, so the function should be applied once per frame.
    """
    selected = df[cols_to_log]
    # Hide non-positive values from np.log (they become NaN), then map those NaNs to 0
    logged = np.log(selected.mask(selected <= 0)).fillna(0)
    # Replace the columns outright instead of writing through .loc[:, cols]:
    # several of them are integer columns, and setting float values into an
    # integer column in place is deprecated in recent pandas.
    df[cols_to_log] = logged
    return df

In [None]:
#Apply the log function to the relevant columns and plot the result.
#The target column is transformed outside the function so the function can be used for the test set as well.
#Both statements take the log of values already stored in `train`, so re-running this cell would apply the log a second time.
train = log_cols(train)
train['SalePrice'] = np.log(train['SalePrice'])
train[numcols].hist(figsize = (15,10),xlabelsize=0,ylabelsize=0,bins=100)

After the transformation, the right skewed distributions are more similar to normal distributions. 

In [None]:
#Encoding categorical features with one hot encoder and dropping the original columns
def encode_categorical(df):
    """One-hot encode the text columns listed in ``objectcols``.

    Returns a new frame made of all non-object columns of ``df`` followed by one
    integer 0/1 column per (column, category) pair. The original text columns
    are not carried over.

    The encoder is fitted on the frame passed in, so the set of indicator
    columns depends on the categories present in that frame.
    """
    ohe = OneHotEncoder()
    transformed = ohe.fit_transform(df[objectcols]).toarray()
    # get_feature_names was replaced by get_feature_names_out in newer
    # scikit-learn releases; use whichever this installation provides.
    if hasattr(ohe, 'get_feature_names_out'):
        feature_names = ohe.get_feature_names_out(objectcols)
    else:
        feature_names = ohe.get_feature_names(objectcols)
    # Reuse df's index so the indicator rows line up with the numeric rows
    # even when df does not have a default 0..n-1 index.
    dummies = pd.DataFrame(transformed, columns=feature_names, index=df.index).astype(int)
    # (The former `df.drop(ordcols, axis=1)` statement discarded its result and
    # therefore had no effect; the ordinal columns are intentionally kept.)
    return pd.concat([df.select_dtypes(exclude='object'), dummies], axis=1)

In [None]:
#Encoding categorical columns in the train set; the result is stored separately so `train` keeps its text columns
train_enc = encode_categorical(train)

In [None]:
#Remove outliers by checking the z-scores of columns whose distribution might contain outliers:
#a row is kept only if its absolute z-score is below 3.5 in every one of these columns.
#('2ndFlrSF' used to be listed twice; the duplicate did not affect the result and was removed.)
statcols = ['MasVnrArea','LotArea','2ndFlrSF','GrLivArea','LotFrontage','BsmtFinSF1','BsmtUnfSF','TotalBsmtSF','GarageArea','OpenPorchSF','SalePrice']
abs_z_scores = np.abs(stats.zscore(train_enc[statcols]))
filtered_entries = (abs_z_scores < 3.5).all(axis=1)
train_enc = train_enc[filtered_entries]
#Use the house Id as the index so it is no longer treated as a feature
train_enc = train_enc.set_index('Id')
train_enc.shape

In [None]:
#Separate the target (y) from the features (X)
y_train = train_enc['SalePrice']
X_train = train_enc.drop(columns=['SalePrice'])

In [None]:
#Check results of using linear regression with cross validation in the train set.
#The model is fitted on the full train set and then scored with 5-fold cross validation; the cell shows the mean fold score.
reg = LinearRegression()
reg.fit(X_train, y_train)
scores = cross_val_score(reg, X_train, y_train, cv=5)
scores.mean()

In [None]:
#Check random forest results with cross validation in the train set (mean score over 5 folds).
#NOTE(review): no random_state is passed to the forest, so the score will presumably vary between runs - confirm whether a fixed seed is wanted.
forest = RandomForestRegressor()
scores = cross_val_score(forest, X_train, y_train, cv=5)
scores.mean()

It seems like linear regression shows better results than Random Forest. We will continue to explore other models after some feature selection and feature engineering. 

In [None]:
#Check feature importances when training initial random forest model.
#imp_df holds one row per feature, sorted from most to least important; later cells pick their top-N features from it.
forest.fit(X_train,y_train)
importances = {'features': X_train.columns, 'importance': forest.feature_importances_}
imp_df = pd.DataFrame(importances).sort_values(by='importance',ascending=False)
#Re-sorted ascending only for the plot, so the horizontal bars are ordered by importance
imp_df.sort_values(by="importance").plot(x='features', y='importance', kind="barh",figsize = (10,40))

We can see that a few of the top features contribute greatly to the prediction, while most of the features have very minimal importance. 

In [None]:
#Feature selection: once the features are ordered by importance, we check which cutoff works best for the number of selected features.
#For every candidate count (20, 30, ..., 190) a fresh random forest is cross-validated on the top-N features of imp_df
#and its mean 5-fold score is recorded.
mean_scores = []
feature_nums = np.arange(20,200,10)
for feature_num in feature_nums:
    top_features = list(imp_df['features'][:feature_num])
    X_train_mod = X_train[top_features]
    #Note: this rebinds the notebook-level name `reg` to a random forest
    reg = RandomForestRegressor()
    scores = cross_val_score(reg, X_train_mod, y_train, cv=5)
    mean_scores.append(scores.mean())
    
#Mean cross-validation score as a function of the number of features kept
plt.plot(feature_nums,mean_scores)

It seems like the number of selected features has only a small impact on the score. However, 80 features seem to show high performance, so this is the cutoff we selected.

In [None]:
#Restrict the training features to the 80 highest-ranked columns of imp_df
X_train_mod = X_train[imp_df['features'].iloc[:80].tolist()]

In [None]:
#Feature engineering: We used some of the most important features in various combinations and transformations to augment the data
def add_features(df):
    """Add engineered columns to ``df`` (modified in place and returned).

    New columns:
      * ``Qualcond``         - OverallQual * OverallCond
      * ``Neighborhoodqual`` - mean OverallQual of the row's Neighborhood,
        computed from ``df`` itself
      * ``Totalarea``        - GrLivArea + LotArea, on whatever scale those
        columns have when the function is called
      * ``HasGarage`` / ``HasBasement`` / ``HasFireplace`` - sign of GarageArea /
        TotalBsmtSF / Fireplaces (0 when the value is 0)
      * ``age``              - YrSold - YearBuilt

    ``df`` must still contain the ``Neighborhood`` column, i.e. the function has
    to run before the categorical columns are one-hot encoded.
    """
    df['Qualcond'] = df['OverallQual']*df['OverallCond']
    # Group the frame that was passed in. Grouping the global `train` instead
    # would attach training-row values to other frames by row position.
    df['Neighborhoodqual'] = df.groupby('Neighborhood')['OverallQual'].transform('mean')
    df['Totalarea'] = df['GrLivArea']+df['LotArea']
    df['HasGarage'] = np.sign(df['GarageArea'])
    df['HasBasement'] = np.sign(df['TotalBsmtSF'])
    df['HasFireplace'] = np.sign(df['Fireplaces'])
    df['age'] = df['YrSold']-df['YearBuilt']
    return df

In [None]:
#Adding columns based on the training set before encoding, so that we can group by neighborhood
#Then joining the added columns back to the current X_train_mod dataframe
Newcols = ['Id','Qualcond','Neighborhoodqual','Totalarea','HasGarage','HasBasement','HasFireplace','age']
#Take an explicit copy: the Id cast below would otherwise write into a slice of `train`
added_cols = add_features(train)[Newcols].copy()
added_colnames = added_cols.columns
#NOTE(review): float cast kept from the original cell, presumably to match the dtype of X_train_mod's Id index - confirm
added_cols['Id'] = added_cols['Id'].astype('float')
#Left join on Id so that only the rows kept in X_train_mod receive the new columns
X_train_mod = X_train_mod.join(added_cols.set_index('Id') ,how='left')
X_train_mod.columns

In [None]:
#Checking performance improvement after adding features (mean 5-fold score of a fresh random forest).
#The forest created in this cell is the one being scored; the cell previously passed `reg`,
#a name left over from an earlier cell.
forest = RandomForestRegressor()
scores = cross_val_score(forest, X_train_mod, y_train, cv=5)
scores.mean()

We can see an improvement in the Random Forest results.

In [None]:
#Checking updated feature importance: fit the forest on the modified feature set and rank its columns
forest.fit(X_train_mod,y_train)
mod_importances = {'features': X_train_mod.columns, 'importance': forest.feature_importances_}
mod_imp_df = pd.DataFrame(mod_importances).sort_values(by='importance',ascending=False)
#Re-sorted ascending only for the plot, so the horizontal bars are ordered by importance
mod_imp_df.sort_values(by="importance").plot(x='features', y='importance', kind="barh",figsize = (10,20))

It seems like some of the added features have high importance

In [None]:
#Checking XGBoost performance: mean 5-fold cross-validation score of a regressor with default parameters
Xgreg = xgb.XGBRegressor()
scores = cross_val_score(Xgreg, X_train_mod, y_train, cv=5)
scores.mean()

In [None]:
#Checking linear regression improvement after modifying dataset.
#`reg` is rebound to a new LinearRegression here, fitted on the modified features and scored with 5-fold cross validation.
reg = LinearRegression()
reg.fit(X_train_mod, y_train)
scores = cross_val_score(reg, X_train_mod, y_train, cv=5)
scores.mean()

Linear regression still shows better results than Random Forest and even XGBoost.

In [None]:
#Finding best random forest parameters with gridsearch.
#Grid: n_estimators in 100, 200, ..., 900 and max_depth in 10, 20, 30, 40 (36 combinations).
#Note: `reg` is rebound to the GridSearchCV object in this cell.
parameters = {'n_estimators':np.arange(100,1000,100), 'max_depth':np.arange(10,50,10)}
forest = RandomForestRegressor()
reg = GridSearchCV(forest, parameters)
reg.fit(X_train_mod,y_train)

In [None]:
#Checking score improvement for the best estimator found by the grid search:
#print its configuration, then report its mean 5-fold cross-validation score
param_forest = reg.best_estimator_
print(param_forest)
scores = cross_val_score(param_forest, X_train_mod, y_train, cv=5)
scores.mean()

It seems like hyperparameter tuning showed some improvement for Random Forest, but not significantly. 
For XGBoost, we tried improving some hyperparameters individually, but it seemed like the best result was achieved by the default parameters, and running grid search would take a very long time, so we kept the default for our stacked model.
We decided to use a stacked model for our final prediction, which consists of the tuned Random Forest, Linear regression and XGBoost. As the final estimator for the stacked model, we used XGBoost. We also tried using Linear regression as the final model, which showed great cross validation results but ran into memory issues during prediction. 

In [None]:
#Stacked model: random forest, linear regression and XGBoost as base estimators, XGBoost as the final estimator.
#NOTE(review): the random forest parameters are written out by hand here rather than read from the grid-search result - confirm they match the best estimator printed above.
estimators = [('random_forest', RandomForestRegressor(n_estimators=500,max_depth=40)),('linear', LinearRegression()), ('xgb',xgb.XGBRegressor())]
stack = StackingRegressor(estimators=estimators,final_estimator=xgb.XGBRegressor())
#Mean 5-fold cross-validation score of the stacked model
scores = cross_val_score(stack, X_train_mod, y_train, cv=5)
scores.mean()

In [None]:
#Function that wraps all the preprocessing steps together
def preprocess(df):
    """Run the full preprocessing pipeline on a raw frame and return the result.

    Steps, in the same order in which they were applied to the training set:
      1. fill missing values
      2. encode ordinal columns
      3. log-transform the skewed columns
      4. add engineered features
      5. one-hot encode the remaining text columns

    The target column is not touched here.
    """
    df = replace_missingvals(df)
    df = ordinal_encode(df)
    # The log transform has to come BEFORE feature engineering: the training
    # features were built by calling add_features on the already log-transformed
    # `train`, so columns such as Totalarea must be computed on the same scale.
    df = log_cols(df)
    # add_features runs before one-hot encoding so the text columns are still present
    df = add_features(df)
    df = encode_categorical(df)
    return df

In [None]:
#Load the test set and run it through the same preprocessing function
test = pd.read_csv('../input/house-price-feature-eng/test.csv')
test = preprocess(test)

In [None]:
#Restrict train and test to the columns they have in common (differences are caused by ordinal encoding as well as feature selection).
#The two frames are stacked with an inner join on columns and then split again by position.
combined = pd.concat([X_train_mod, test], join="inner")
X_train_matched = combined.iloc[:len(X_train_mod)]
test_matched = combined.iloc[len(X_train_mod):]
(X_train_matched.shape, test_matched.shape)

In [None]:
#Fit the stack regressor on the train set with only the relevant columns, then create the prediction on the test set
stack.fit(X_train_matched,y_train)
#The model was trained on log(SalePrice), so every prediction is mapped back with exp.
#(No special case for 0: a log-price of exactly 0 corresponds to a price of 1, not 0.)
pred = np.exp(stack.predict(test_matched))
pred_df = pd.DataFrame({'Id': test.Id, 'Predicted': pred})
pred_df.head()