# <font color='	#2E45B8'> 1. Data Preparation</font>
## <font color='	#2E45B8'>  1.1 Import libraries and data</font>

In [None]:
# Package for data science
import pandas as pd
import numpy as np

# Package for visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Packages for machine learning
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor

# Others
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the training CSV into a DataFrame and preview the first rows
data = pd.read_csv(
    filepath_or_buffer='../input/house-prices-advanced-regression-techniques/train.csv'
)
data.head(5)

## <font color='	#2E45B8'> 1.2 Data inspection</font>
### <font color='	#2E45B8'> 1.2.1 Data types</font>

In [None]:
# Split the column names by dtype: object columns are treated as categorical,
# every other dtype as numerical.
cat_cols = list(data.select_dtypes(include='object').columns)
num_cols = list(data.select_dtypes(exclude='object').columns)

print('The dataset contains:', data.dtypes.unique())
print(f'There are {len(num_cols)} numerical columns')
print(f'There are {len(cat_cols)} categorical features')

### <font color='	#2E45B8'> 1.2.2 Explore missing values</font>

In [None]:
# Visualise where values are missing: one row per record, one column per feature
fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(data.isnull(), cbar=False, cmap="gray", ax=ax)

In [None]:
#pd.set_option('display.max_rows', None)
# Count missing values per column (largest first) and express them as a
# percentage of all rows; only columns with at least one missing value are shown.
missing_counts = data.isnull().sum().sort_values(ascending=False)
missing = pd.DataFrame({'No.of missing values': missing_counts})
# Derive the percentage from the Series so a single column is assigned explicitly
missing['% missing'] = (missing_counts / len(data)).round(3) * 100
missing[missing['No.of missing values'] > 0]

#### Comment
- Out of 81 columns, 19 columns have missing values.
- In particular, more than 45% of the values are missing in the `PoolQC`, `MiscFeature`, `Alley`, `Fence` and `FireplaceQu` columns.
- Therefore, we will drop these features.

In [None]:
# Drop the features that are mostly missing.
# Rebinding `data` (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: columns that are already gone are skipped
# instead of raising KeyError.
cols_to_drop = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
data = data.drop(columns=cols_to_drop, errors='ignore')

# <font color='	#2E45B8'> 2. EDA</font>
## <font color='	#2E45B8'>  2.1 Checking target variable</font>

In [None]:
# Select the target by name rather than by position so the cell does not
# depend on `SalePrice` being the last column of the frame.
target = data['SalePrice']
target.describe()

In [None]:
# Checking the target variable: histogram with KDE (left) and box plot (right)
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
hist_ax, box_ax = ax

# Same light-grey background for the figure and both panels
for patch_owner in (fig, hist_ax, box_ax):
    patch_owner.patch.set_facecolor('#F2F2F2')

sns.histplot(target, bins=30, color='#8abbd0', kde=True, ax=hist_ax)
# The KDE curve is the first line drawn on the histogram axes; recolour it
hist_ax.lines[0].set_color('#F97A1F')

sns.boxplot(x=target, ax=box_ax, color='#8abbd0')

plt.tight_layout()

## <font color='	#2E45B8'>  2.2 Checking numerical features</font>

In [None]:
# Statistical summary (count, mean, std, min, quartiles, max) of the numerical columns
data.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Checking correlation between different numerical featuers
fig = plt.figure(figsize=(15,12)) 
corr = data.corr()
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(data.corr(), cmap='Blues', mask=mask, linewidth=0.05, square=True)

In [None]:
# Identify feature pairs that have correlation > 0.8 or < -0.8
# Numeric columns only: DataFrame.corr() raises on object columns in pandas >= 2.0
corr = data.select_dtypes(include='number').corr()
# Keep only the strict upper triangle so each unordered pair appears once and
# the trivial self-correlations (1.0 on the diagonal) are excluded. This avoids
# de-duplicating on the correlation *value*, which could drop distinct pairs.
upper_only = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
correlation_table = upper_only.stack().dropna().to_frame('correlation')
correlation_table[correlation_table['correlation'].abs() > 0.8].sort_values(by='correlation', ascending=False)

#### Comment
- The following pairs show strong correlation
- `GarageArea` & `GarageCars`:	~0.882475
- `YearBuilt` &	`GarageYrBlt`:	~0.825667
- `TotRmsAbvGrd` &	`GrLivArea`:	~0.825489
- `TotalBsmtSF` &	`1stFlrSF`:	~0.819530
- We will remove following columns [`1stFlrSF`, `GrLivArea`, `GarageYrBlt`, `GarageCars`]
- We will also drop the `Id` column
- We also see that some features have strong correlation with the target variable, we will drop features that have less impact on `SalePrice` after analysing numerical features in the next section.

In [None]:
# Drop one feature from each strongly correlated pair, plus the `Id` column.
# Rebinding `data` with errors='ignore' keeps the cell safe to re-run:
# columns that were already removed are skipped instead of raising KeyError.
cols_to_drop = ['1stFlrSF', 'GrLivArea', 'GarageYrBlt', 'GarageCars','Id']
data = data.drop(columns=cols_to_drop, errors='ignore')

In [None]:
# Plots on numerical features to check data quality and data distribution.
# For every numerical feature draw four panels: histogram + KDE, filled KDE,
# box plot, and a regression scatter against `SalePrice` annotated with the
# Pearson correlation.
# Exclude the target by name rather than by its position in the frame.
num_cols = [col for col in data.select_dtypes(exclude='object').columns if col != 'SalePrice']
color ='#8abbd0'
for feature in num_cols:
    fig, ax = plt.subplots(1,4, figsize=(15,3))
    fig.patch.set_facecolor( '#F2F2F2')
    sns.histplot(data[feature], bins=20, ax=ax[0],  color=color, kde=True)
    # The KDE curve is the first line on the histogram axes; recolour it
    ax[0].lines[0].set_color('#F97A1F')
    # `fill` replaces the deprecated `shade` argument of kdeplot
    sns.kdeplot(x=feature, data=data, ax=ax[1], fill=True, alpha=0.3)
    sns.boxplot(x=feature, data=data,ax=ax[2], color=color)

    sns.regplot(x=feature, y='SalePrice', data=data, ax=ax[3], color=color, scatter_kws={"s": 8}, line_kws={'color':'#F97A1F'})
    # Series.corr ignores rows where either value is missing, whereas
    # np.corrcoef returns NaN as soon as the feature contains a single NaN.
    correlation = data['SalePrice'].corr(data[feature])
    # Pass a single formatted string; a list would be rendered with its brackets
    ax[3].text(x=0.95, y=0.9,
               s=f'correlation={correlation:.2f}',
               ha='right', transform=ax[3].transAxes)

    fig.suptitle(feature, fontfamily='serif', fontsize=16, color='#173b56', fontweight='bold')
    fig.tight_layout()
    # Render this figure now so that figures do not pile up while the loop runs
    plt.show()

#### Comment:
We have plotted charts to show each feature's distribution (the histogram and KDE plot), its statistics (the box plot) and its relationship with `SalePrice` (the scatter plot). We will examine each numerical feature one by one, with the aim of eliminating features that are less likely to impact `SalePrice`.

|Feature   |Drop/Keep   |Reason   |
|:-----|:----- |:----- |
|`MSSubClass`| Drop | low correlation with`SalePrice`|
|`LotFrontage`| Keep | has 'na' in the dataset. impute na values and check again|
|`LotArea`| Keep | remove outliers and check again|            
|`OverallQual` | Keep | strong correlation|
|`OverallCond` |Drop | weak correlation|
|`YearBuilt` | Keep | good correlation|
|`YearRemodAdd` |Keep | good correlation|
|`MasVnrArea`  | Drop | more than half of dataset is 0|
|`BsmtFinSF1`  | Drop | more than half of dataset is 0|
|`BsmtFinSF2` | Drop | majority value is zero|
|`BsmtUnfSF` | Drop | low correlation|
|`TotalBsmtSF` |Keep | good correlation|
|`2ndFlrSF` | Drop | more than half of dataset is 0|
|`LowQualFinSF` | Drop | more than half of dataset is 0|
|`BsmtFullBath` | Drop | weak correlation|
|`BsmtHalfBath` | Drop | weak correlation|
|`FullBath` | Keep | good correlation|
|`HalfBath` | Drop | low correlation|
|`BedroomAbvGr` | Drop | low correlation|
|`KitchenAbvGr` | Drop | low correlation|
|`TotRmsAbvGrd` | Keep | good correlation|
|`Fireplaces` | Drop | low correlation|
|`GarageArea` | Keep | good correlation|
|`WoodDeckSF` | Drop | more than half of dataset is 0|
|`OpenPorchSF` | Drop | more than half of dataset is 0|
|`EnclosedPorch` | Drop | more than half of dataset is 0|
|`3SsnPorch` | Drop | more than half of dataset is 0|
|`ScreenPorch` | Drop | more than half of dataset is 0|
|`PoolArea` | Drop | more than half of dataset is 0|
|`MiscVal` | Drop | more than half of dataset is 0|
|`MoSold` | Drop | low correlation|
|`YrSold` | Drop | low correlation|

Since two of the 'kept' features are years, we will convert them to ages instead. The maximum value of both `YearBuilt` and `YearRemodAdd` is 2010, so we assume that this is the year the data was released. We will replace `YearBuilt` and `YearRemodAdd` with `BuiltAge` and `RemodAge` by subtracting each of them from 2010.

After the above analysis, we will only keep the following numerical features: `LotFrontage`, `LotArea`, `OverallQual` ,`BuiltAge` , `RemodAge`, `TotalBsmtSF`,`FullBath`,`TotRmsAbvGrd`,`GarageArea`

In [None]:
# Derive age features: years elapsed between each year column and 2010
for age_col, year_col in (('BuiltAge', 'YearBuilt'), ('RemodAge', 'YearRemodAdd')):
    data[age_col] = data[year_col].rsub(2010)

In [None]:
# Numerical features retained after the analysis above
num_cols_to_keep = [
    'LotFrontage',
    'LotArea',
    'OverallQual',
    'BuiltAge',
    'RemodAge',
    'TotalBsmtSF',
    'FullBath',
    'TotRmsAbvGrd',
    'GarageArea',
]

## <font color='	#2E45B8'>  2.3 Checking categorical features</font>

In [None]:
# Plots on categorical features to check data quality and data distribution.
# Four panels per feature: level counts, then violin, box and point plots of
# `SalePrice` by level.
cat_cols = list(data.select_dtypes(include='object').columns)

for feature in cat_cols:
    fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(15, 3))
    fig.patch.set_facecolor('#F2F2F2')

    sns.countplot(data=data, y=feature, alpha=0.8, ax=ax[0])
    # The three price plots share the same x/y mapping, one per remaining panel
    for plot_fn, panel in zip((sns.violinplot, sns.boxplot, sns.pointplot), ax[1:]):
        plot_fn(data=data, x=feature, y='SalePrice', ax=panel)

    plt.suptitle(feature, fontfamily='serif', fontsize=16, color='#173b56', fontweight='bold')
    plt.tight_layout()

#### Comment
Similar to the numerical feature analysis, we will examine the categorical features one by one and remove the less relevant ones. The count plot shows the number of rows for each level of a categorical feature. The violin plot and box plot show the distribution of `SalePrice` at each level. The point plot shows the mean `SalePrice` of each level, so the averages can be compared easily.

|Feature   |Drop/Keep   |Reason   |
|:-----|:----- |:----- |
|`MSZoning` | Drop | majority is `RL`|
|`Street`| Drop | majority is `Pave`|
|`LotShape` | Keep | `Reg` is the cheapest|
|`LandContour`| Drop | majority is `Lvl`|
|`Utilities` | Drop | majority is `AllPub`|
|`LotConfig`  | Drop | majority is `Inside`|
|`LandSlope`| Drop | majority is `Gtl`|
|`Neighborhood` | Keep | some areas are significantly more expensive than others|
|`Condition1`| Drop | majority is `Norm`|
|`Condition2`| Drop | majority is `Norm`|
|`BldgType`| Drop | majority is `1Fam`|
|`HouseStyle` | Drop | majority is `2Story` and `1Story`|
|`RoofStyle` | Drop | majority is `Gable` and `Hip`|
|`RoofMatl` | Drop | majority is `CompShg`|
|`Exterior1st`| Drop | mean for different items does not vary much|
|`Exterior2nd`| Drop | mean for different items does not vary much|
|`MasVnrType` | Keep | `Stone` significantly more expensive than `None`|
|`ExterQual`| Keep | `Ex` is significantly more expensive than `TA`|
|`ExterCond`| Drop | majority is `TA`|
|`Foundation`| Keep | `PConc` is significantly more expensive than `CBlock`|
|`BsmtQual`| Keep | `Ex` is more expensive than others|
|`BsmtCond`| Drop | majority is `TA`|
|`BsmtExposure` | Keep | `No` is significantly cheaper than others|
|`BsmtFinType1`| Keep | `GLQ` is significantly more expensive than others|
|`BsmtFinType2`| Drop | majority is `Unf`|
|`Heating`| Drop | majority is `GasA`|
|`HeatingQC`| Keep | `Ex` is more expensive than others|
|`CentralAir`| Keep | `Y` is more expensive than `N`|
|`Electrical`| Keep | `SBrkr` is more expensive than `N`|
|`KitchenQual`| Keep | significant difference in mean|
|`Functional`| Drop | majority is `Typ`|
|`GarageType`| Keep | `Attchd` is more expensive than `Detchd`|
|`GarageFinish`| Keep | significant difference in mean|
|`GarageQual`| Drop | majority is `TA`|

We will keep the following categorical features: `LotShape`,`Neighborhood`,`MasVnrType`,`ExterQual`,`Foundation`,`BsmtQual`,`BsmtExposure`,`BsmtFinType1`,`HeatingQC`,`CentralAir`,`Electrical`,
`KitchenQual`,`GarageType`,`GarageFinish`,`PavedDrive`

In [None]:
# Categorical features retained after the analysis above
cat_cols_to_keep = [
    'LotShape',
    'Neighborhood',
    'MasVnrType',
    'ExterQual',
    'Foundation',
    'BsmtQual',
    'BsmtExposure',
    'BsmtFinType1',
    'HeatingQC',
    'CentralAir',
    'Electrical',
    'KitchenQual',
    'GarageType',
    'GarageFinish',
    'PavedDrive',
]

In [None]:
# Create a new dataframe that includes the filtered features only.
# Column order: numerical features, categorical features, then the target last
# (later cells split features/target by position).
cols_to_keep = [*num_cols_to_keep, *cat_cols_to_keep, 'SalePrice']
# .copy() makes data1 an independent frame, so the column assignments done
# during preprocessing modify data1 itself and never a view of `data`.
data1 = data[cols_to_keep].copy()

In [None]:
# Report how many columns survived the feature selection
print(f'The new dataframe has {len(data1.columns)} columns')

# <font color='	#2E45B8'> 3. Machine Learning</font>
## <font color='	#2E45B8'> 3.1 Data preprocessing</font>

- Scikit-learn estimators only work with numerical inputs, so we need to convert the categorical features to numerical values
    - I will use LabelEncoder here
- There are some missing values
    - I will impute missing categorical values with the mode
    - I will impute missing numerical values with the median

In [None]:
# Data preprocessing

# Extract new numerical and categorical columns
num_cols = data1.select_dtypes(exclude='object').columns.tolist()
cat_cols = data1.select_dtypes(include='object').columns.tolist()

# Impute missing values for categorical features.
# This must happen BEFORE encoding: astype('str') turns NaN into the literal
# string 'nan', which LabelEncoder then treats as an ordinary category, leaving
# nothing to impute afterwards.
# mode() returns a DataFrame (one row per tied mode); .iloc[0] takes the first
# mode of each column as a Series so fillna matches it by column name.
mode_values = data1[cat_cols].mode().iloc[0]
data1[cat_cols] = data1[cat_cols].fillna(value=mode_values)

# Convert categorical values to numerical values
# (apply refits the encoder on each column in turn)
le = LabelEncoder()
data1[cat_cols] = data1[cat_cols].astype('str').apply(le.fit_transform)

# Impute missing values for numerical features with the column median
median_values = data1[num_cols].median()
data1[num_cols] = data1[num_cols].fillna(value=median_values)

## <font color='	#2E45B8'> 3.2 Model prediction and evaluation</font>

We have selected a list of regressors, all with default hyperparameter settings. We will see which regressor has the best performance. 

In [None]:
# Features are every column except the last; the target is the last column
X, y = data1.iloc[:, :-1], data1.iloc[:, -1]

# Hold out 30% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.3,
)

In [None]:
# Function to determine feature importance for tree-based models
def feature_importance(name, reg):
    """Return one table row: the model name followed by its feature importances.

    Parameters
    ----------
    name : str
        Label identifying the model.
    reg : object
        Fitted estimator exposing a ``feature_importances_`` sequence.

    Returns
    -------
    list
        ``[name, importance_0, importance_1, ...]``.
    """
    # Build the flat row directly. Wrapping ``[name]`` and the importances in
    # np.array creates a ragged array, which NumPy >= 1.24 rejects with ValueError.
    return [name, *reg.feature_importances_]

In [None]:
# Function to determine feature coefficient 
def feature_coefficient(name, reg):
    """Return one table row: the model name followed by its coefficients.

    Parameters
    ----------
    name : str
        Label identifying the model.
    reg : object
        Fitted estimator exposing a ``coef_`` array.

    Returns
    -------
    list
        ``[name, coef_0, coef_1, ...]``.
    """
    # np.ravel flattens coef_ so the row stays flat even if it is not 1-D.
    # Building the list directly avoids the ragged np.array([[name], coef_]),
    # which NumPy >= 1.24 rejects with ValueError.
    return [name, *np.ravel(reg.coef_)]

In [None]:
# Create and instantiate regressors.
# The tree-based models get a fixed random_state so that scores and feature
# importances are reproducible across runs (same seed as the train/test split).
regs = [
            ('LinearReg', LinearRegression()),
            ('Ridge', Ridge()),
            ('Lasso', Lasso()),
#            ('SGD',SGDRegressor()),
            ('BayesianRidge', BayesianRidge()),
            ('RF', RandomForestRegressor(random_state=1)),
            ('DT', DecisionTreeRegressor(random_state=1)),
            ('GradientBoost', GradientBoostingRegressor(random_state=1)),
           ]

# Create empty lists to store model scores and feature coefficient/importance
scores = []
importance_tables = []

# Fitting and evaluating models
for name, reg in regs:
    # Train the model
    reg.fit(X_train, y_train)

    # Summary of how different models perform (R2 on the held-out test set)
    score = reg.score(X_test, y_test)
    scores.append([name, score])

    # Result to show importance of different features.
    # Dispatch on what the fitted estimator exposes rather than on hard-coded
    # names, so every tree-based model (including GradientBoost) is reported.
    if hasattr(reg, 'feature_importances_'):
        importance_tables.append(feature_importance(name, reg))
    elif hasattr(reg, 'coef_'):
        importance_tables.append(feature_coefficient(name, reg))

In [None]:
# Compare model performance: one row per model, best R2 first
pd.options.display.float_format = '{:.2f}'.format
scores = pd.DataFrame(scores)
scores.columns = ['Model', 'R2']
scores = scores.sort_values(ascending=False, by='R2')
scores

In [None]:
# Plot feature importance / coefficients for the different models:
# one horizontal bar chart per model, bars sorted in ascending order.
cols_name = ['Model', *X.columns]
importance_tables = pd.DataFrame(importance_tables, columns=cols_name)
importance_tables1 = importance_tables.set_index('Model')

fig = plt.figure(figsize=(15,18))
fig.patch.set_facecolor('#F2F2F2')

for i, (col_name, row) in enumerate(importance_tables1.iterrows()):
    # Sort on the raw values and index positionally. Indexing the string-labelled
    # Series with integer positions (row[indices]) relies on a deprecated
    # positional fallback in pandas.
    values = row.to_numpy(dtype=float)
    order = np.argsort(values)
    positions = range(len(values))

    ax = fig.add_subplot(3, 4, i+1)
    ax.barh(positions, values[order], color='#8abbd0')
    ax.set_yticks(positions)
    ax.set_yticklabels(row.index[order])
    ax.set_title(col_name)

fig.suptitle('Feature Importance/coefficent for different models',
             y=0.99, va = 'top',
             fontfamily='serif', fontsize=20, color='#173b56', fontweight='bold')
fig.tight_layout()

#### Comment
- Random forest is the best performing model
- `OverallQual` is the most important factor in determining house prices


### <font color='	#00c7c7'> Thanks for reading. Please upvote if you like this notebook. Thanks.</font>
