### Objective :
*Predict sale prices, practice more detailed pre-processing, and train a stacking model!*


<p style="background-color:skyblue; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 10px 100px;"><b>Importing Libraries & Data</b></p>


In [None]:
# essentials
import numpy as np
import pandas as pd

#plots
import matplotlib.pyplot as plt
import seaborn as sns

#stat
from scipy import stats
from scipy.stats import norm, skew #for some statistics

#warnings
import warnings
warnings.filterwarnings('ignore')

# split
from sklearn.model_selection import train_test_split, KFold

# model
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing  import  RobustScaler, StandardScaler
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso
from sklearn.ensemble import GradientBoostingRegressor 
from mlxtend.regressor import StackingCVRegressor
from lightgbm import LGBMRegressor

# evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error

# Max displayed rows/columns. Full option names are used on purpose: pandas
# resolves partial names by pattern matching, which breaks as soon as another
# option with a similar name exists.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 1460)

In [None]:
# Kaggle input locations of the competition files
train_path = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
test_path = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'
sub_path = '/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv'

# train and test frames, both indexed by their Id column
train = pd.read_csv(train_path, index_col='Id')
test = pd.read_csv(test_path, index_col='Id')

# sample submission, read with the default integer index
sub = pd.read_csv(sub_path)

<a id="head-1-2"></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 15px 50px;"><b>EDA  🔍 🔍</b></p>

- **Target** : SalePrice
- **Number of rows and columns** : (1460, 80)
- **Type of variables**:

     Qualitative : 43  
     Quantitative : 37
     
     
- **NaN values** : more than 80% missing in the following columns :
PoolQC, MiscFeature, Alley and Fence


In [None]:
# Preview the first rows of the training frame
train.head()

In [None]:
# (rows, columns) of the training frame
train.shape

In [None]:
# Number of columns per dtype
train.dtypes.value_counts()

In [None]:
# Print pandas' per-column summary of the training frame
train.info()

In [None]:
# Descriptive statistics of the training frame
train.describe()

In [None]:
# Percentage of missing entries in every training column
missing_perc = train.isna().sum() / len(train) * 100

# Keep only the columns that actually have gaps, worst first
has_gaps = missing_perc > 0
with_miss = missing_perc.loc[has_gaps].sort_values(ascending=False)
with_miss

In [None]:
# Bar chart of the missing-value percentage per column (columns with gaps only)
plt.figure(figsize=(12,6))
# Tilt the column names so the labels on the x axis are easier to read
plt.xticks(rotation=45)
sns.barplot(x=with_miss.index, y=with_miss)

#### Target visualization :

In [None]:
# Distribution of the sale price as currently stored in train
plt.figure(figsize=(12,6))
# NOTE(review): sns.distplot is deprecated in recent seaborn releases - confirm
# the installed version still provides it, or switch to histplot/displot.
sns.distplot(train.SalePrice)

In [None]:
# Sample skewness of the target, as computed by pandas
sale_price_skew = train['SalePrice'].skew()
print("Skewness: %f" % sale_price_skew)

Our target variable does not have a normal distribution!

In [None]:
# Helper that plots a distribution next to its Q-Q plot for comparison
def plotHistProb(series=None):
    """Plot a histogram with a fitted normal curve beside a probability plot.

    Parameters
    ----------
    series : pandas.Series, optional
        Values to plot. When omitted, the current ``train['SalePrice']`` is
        used, so existing zero-argument calls behave as before.
    """
    if series is None:
        series = train['SalePrice']

    plt.figure(figsize=(15, 5))

    # Left panel: histogram with the fitted normal density overlaid
    plt.subplot(1, 2, 1)
    sns.distplot(series, fit=norm)
    mu, sigma = norm.fit(series)
    # Raw string: "\m" and "\s" are invalid escape sequences in a plain literal
    plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
               loc='best')
    plt.ylabel('Frequency')
    plt.title('SalePrice distribution')

    # Right panel: probability plot (Q-Q plot)
    plt.subplot(1, 2, 2)
    stats.probplot(series, plot=plt)


plotHistProb()

So yeah our target is right-skewed! And we are going to correct it now.

In [None]:
# Replace the target by log(1 + SalePrice) to reduce its right skew
train['SalePrice'] = np.log1p(train['SalePrice'])

In [None]:
# Same two plots again, now on the log1p-transformed SalePrice
plotHistProb()

#### Understand variables :

In [None]:
# List the distinct values of every object-typed column
for col in train.select_dtypes('O').columns:
    uniques = train[col].unique()
    print('We have {} unique values in {} column : {}'.format(len(uniques), col, uniques))
    print('__'*30)

Now let's examine the correlation between the dependent and independent variables.

In [None]:
# Correlation is only defined for numeric columns. Selecting them explicitly
# keeps this cell working on pandas versions where DataFrame.corr() raises on
# object columns instead of dropping them silently.
corr_matrix = train.select_dtypes(include='number').corr()

plt.figure(figsize=(30, 15))

# Mask the upper triangle so each pair of features is shown once
sns.heatmap(corr_matrix,
            square=True,
            cmap='PiYG',
            mask=np.triu(corr_matrix))

What do we really need from this heatmap? The features most strongly correlated with SalePrice, so let's extract them.

In [None]:
# Restrict to numeric columns: DataFrame.corr() raises on object columns in
# newer pandas versions.
corr = train.select_dtypes(include='number').corr()

# Features whose absolute correlation with the target exceeds 0.5
# (SalePrice itself is included, since it correlates perfectly with itself)
highest_corr_features = corr.index[corr["SalePrice"].abs() > 0.5]

plt.figure(figsize=(10, 10))
g = sns.heatmap(train[highest_corr_features].corr(), annot=True, cmap='RdYlGn')

Let's take a closer look at the most highly correlated features:

In [None]:
# Target plus a hand-picked set of features to compare pairwise
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
# Scatter-plot matrix of the selected columns
sns.pairplot(train[cols])

<a id="head-1-2"></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 15px 50px;"><b>Pre-processing</b></p>

- Imputation
- Feature Engineering
- Encoding
- Skewness / Normalization 
- Outliers / Columns full with zeros
- Train-Test split


In [None]:
# Stack train and test so that both receive identical pre-processing.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([train, test]).reset_index(drop=True)

### Imputation :

In [None]:
# NaN count per column, limited to the columns that have at least one NaN
df.isna().sum()[df.isna().sum()>0]

In [None]:
# Categorical gaps with a known default (taken from the data description file)
known_defaults = {
    'Functional': 'Typ',
    'Electrical': "SBrkr",  # Standard Circuit
    'KitchenQual': "TA",
}
for col, default in known_defaults.items():
    df[col] = df[col].fillna(default)

# Categorical gaps filled with the most frequent value seen in the training data
for col in ('Exterior1st', 'Exterior2nd', 'SaleType'):
    df[col] = df[col].fillna(train[col].mode()[0])

# Categorical gaps filled with the literal label "None"
none_columns = ["PoolQC", "Alley", 'FireplaceQu', 'Fence', 'MiscFeature',
                'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                'BsmtFinType2', "MasVnrType"]
for col in none_columns:
    df[col] = df[col].fillna("None")

# Numeric gaps filled with 0 (no basement or no garage)
zero_columns = ('GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2',
                'BsmtUnfSF', 'BsmtFullBath', 'BsmtHalfBath', 'TotalBsmtSF',
                "MasVnrArea")
for col in zero_columns:
    df[col] = df[col].fillna(0)

# LotFrontage: median of the same Neighborhood
df['LotFrontage'] = (
    df.groupby('Neighborhood')['LotFrontage']
      .transform(lambda x: x.fillna(x.median()))
)
# MSZoning: most frequent value of the same MSSubClass
df['MSZoning'] = (
    df.groupby('MSSubClass')['MSZoning']
      .transform(lambda x: x.fillna(x.mode()[0]))
)

In [None]:
# Columns removed from the combined frame before modelling
to_drop = ['GarageYrBlt', 'YearRemodAdd', 'Utilities']
df = df.drop(columns=to_drop)

Let's check again whether any NaN values remain:

In [None]:
# Re-check after imputation: columns that still contain NaN, with their counts
df.isna().sum()[df.isna().sum()>0]

### Feature Engineering :

It seems that adding total square footage and total home quality features will improve the result.

In [None]:
# Two derived features:
#   TotalSF            - basement + first floor + second floor area
#   Total_Home_Quality - overall quality score + overall condition score
df = df.assign(
    TotalSF=lambda d: d['TotalBsmtSF'] + d['1stFlrSF'] + d['2ndFlrSF'],
    Total_Home_Quality=lambda d: d['OverallQual'] + d['OverallCond'],
)

Let's convert non-numeric predictors stored as numbers into strings:

In [None]:
# These columns hold category codes, not quantities, so store them as strings
for code_col in ('MSSubClass', 'OverallCond', 'YrSold', 'MoSold'):
    df[code_col] = df[code_col].astype(str)

### Encoding :

In [None]:
# One-hot encode the categorical columns; pd.get_dummies is called with its
# defaults, so numeric columns are passed through unchanged
df = pd.get_dummies(df)

In [None]:
# Preview the encoded frame
df.head()

### Skewness / Normalization :

Now let's fix the skewness in the other features and normalize them with a log transformation.

In [None]:
# Candidates are the genuine numeric measurements only. This cell runs after
# get_dummies, so a plain "dtype != object" test would select every column,
# including the one-hot indicators (stored by pandas as uint8 or bool depending
# on the version), and rare indicators would get log-transformed too.
# The target is excluded explicitly: it has already been log1p-transformed.
numeric_features = (
    df.select_dtypes(include=['int64', 'float64'])
      .columns
      .drop('SalePrice', errors='ignore')
)
skewed_features = df[numeric_features].apply(skew).sort_values(ascending=False)
high_skew = skewed_features[skewed_features > 0.5]
skew_index = high_skew.index

# Normalize the positively skewed features with a log1p transformation
df[skew_index] = np.log1p(df[skew_index])

### Outliers / Columns full with zeros:

In [None]:
# Row positions treated as outliers and removed from the combined frame
outliers = [30, 88, 462, 631, 1322]
df = df.drop(df.index[outliers])

# Columns whose most frequent value covers more than 99.94% of the rows carry
# almost no information, so they are dropped as well
overfit = [
    name for name in df.columns
    if df[name].value_counts().iloc[0] / len(df) * 100 > 99.94
]
df = df.drop(columns=overfit)

### Train-test split :

In [None]:
# Split the combined frame back by target presence rather than by a row count.
# A positional split needs a hard-coded offset equal to the number of dropped
# outliers, and it shifts whenever this cell is re-run, because `train` itself
# is reassigned here.
# Assumes test.csv has no SalePrice column, so its rows are NaN after the
# frames were combined - TODO confirm.
has_target = df['SalePrice'].notna()
train = df[has_target]
test = df[~has_target]

In [None]:
# Separate the target from the predictors
y = train['SalePrice']
X = train.drop(columns='SalePrice')

# Hold out 20% of the labelled rows for evaluation (fixed seed)
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

<a id="head-1-2"></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 15px 50px;"><b>Modeling and Evaluation</b></p>



In [None]:
# First-level regressors of the stack
kernel_ridge = KernelRidge()
lasso_pipe = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
enet_pipe = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=0.9))

gbr_params = dict(
    learning_rate=0.005,
    loss='huber',
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    n_estimators=3000,
    random_state=1,
)
gbr_pipe = make_pipeline(StandardScaler(), GradientBoostingRegressor(**gbr_params))

base_models = (kernel_ridge, lasso_pipe, enet_pipe, gbr_pipe)

In [None]:
# Second-level regressor: LightGBM trained on the base models' predictions
lgbm_params = {
    'objective': 'regression',
    'num_leaves': 5,
    'learning_rate': 0.05,
    'n_estimators': 720,
    'max_bin': 55,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'bagging_seed': 9,
    'feature_fraction': 0.2319,
    'feature_fraction_seed': 9,
    'min_data_in_leaf': 6,
    'min_sum_hessian_in_leaf': 11,
}
meta_model = LGBMRegressor(**lgbm_params)

In [None]:
# Build the stacking model on 10 cross-validation folds
kfold = KFold(n_splits=10)

stack_params = {
    'regressors': base_models,
    'meta_regressor': meta_model,
    'use_features_in_secondary': True,
    'store_train_meta_features': True,
    'shuffle': False,
    'cv': kfold,
    'random_state': 1,
}
stack = StackingCVRegressor(**stack_params)

In [None]:
# Fit the stack on the training split and score it on the hold-out split
stack.fit(x_train, y_train)
predictions = stack.predict(x_test)

# Root mean squared error on the log1p-transformed target
MSE = mean_squared_error(y_true=y_test, y_pred=predictions)
RMSE = np.sqrt(MSE)
RMSE

In [None]:
# Predict on the competition test rows; the SalePrice column is not a feature
test_features = test.drop(columns='SalePrice')
pred_sub = stack.predict(test_features)

# Undo the log1p applied to the target so prices are on the original scale
sub['SalePrice'] = np.expm1(pred_sub)

In [None]:
# Write the submission file, leaving out the DataFrame index
sub.to_csv('submission.csv' , index=False)

#### If you find this notebook useful, please don't forget to upvote it!