<div class="alert alert-block alert-info">  
    <h1><strong>👨‍💻 Getting Started with House Price Predictions</strong></h1>
    <i></i>
</div>

# <img src="https://www.mashvisor.com/blog/wp-content/uploads/2019/01/bigstock-Paper-House-Model-On-Coins-Sta-279182236-e1547969753106.jpg">

# Importing Python Libraries 📕 📗 📘 📙

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso, Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import matplotlib.gridspec as gridspec
import missingno as msno
import scipy.stats as stats 
from scipy.special import boxcox1p
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

<div class="alert alert-block alert-danger">  
    <h1><strong>Loading training data</strong></h1>
    <i></i>
</div>

In [None]:
train_data = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")

# Exploratory data analysis of train data

# Five top records of data

In [None]:
train_data.head()

# Five last records of data

In [None]:
train_data.tail()

# Columns/features in data

In [None]:
train_data.columns

# Length of data

In [None]:
print('lenght of data is', len(train_data))

# Shape of data

In [None]:
train_data.shape

# Data information

In [None]:
train_data.info()

# Data types of all columns

In [None]:
train_data.dtypes

# Checking missing Values

In [None]:
train_data[train_data.isnull().any(axis=1)].head()

# Count of missing values

In [None]:
np.sum(train_data.isnull().any(axis=1))

# Are there any missing values?

In [None]:
train_data.isnull().values.any()

# Counts of missing values in each column

In [None]:
train_data.isnull().sum()

<div class="alert alert-block alert-danger">  
    <h1><strong>Loading testing data</strong></h1>
    <i></i>
</div>

In [None]:
test_data = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")
ids_test_data = test_data['Id'].values

# Exploratory data analysis of test data

# Five top records of data

In [None]:
test_data.head()

# Five last records of data

In [None]:
test_data.tail()

# Columns/features in data

In [None]:
test_data.columns

# Length of data

In [None]:
print('lenght of data is', len(test_data))

# Shape of data

In [None]:
test_data.shape

# Data information

In [None]:
test_data.info()

# Data types of all columns

In [None]:
test_data.dtypes

# Checking missing Values

In [None]:
test_data[test_data.isnull().any(axis=1)].head()

# Count of missing values

In [None]:
np.sum(test_data.isnull().any(axis=1))

# Are there any missing values?

In [None]:
test_data.isnull().values.any()

# Counts of missing values in each column

In [None]:
test_data.isnull().sum()

# Looking at the train data missing values.

In [None]:
# Print every training column that has at least one missing value, together
# with its missing count, and remember those column names in NANColumns.
NANColumns = []
train_missing_counts = train_data.isnull().sum()
for column_name, n_missing in train_missing_counts.items():
    if n_missing == 0:
        continue
    print(column_name, n_missing)
    NANColumns.append(column_name)

# Looking at the test data missing values.

In [None]:
# Print every test column that has at least one missing value, together with
# its missing count. NANColumns is rebuilt here, so after this cell it holds
# the test-set columns only.
NANColumns = []
test_missing_counts = test_data.isnull().sum()
for column_name, n_missing in test_missing_counts.items():
    if n_missing == 0:
        continue
    print(column_name, n_missing)
    NANColumns.append(column_name)

# Histogram of all columns, to check how the values of each column are distributed and how often they occur

In [None]:
# Draw one histogram per column that DataFrame.hist can plot, in a single large grid.
train_data.hist(figsize=(50,50),bins = 20, color="#107009AA")
# plt.title() would only label the last subplot of the grid; suptitle() puts
# the caption on the figure as a whole.
plt.suptitle("Features/Columns Distribution with values counts")
plt.show()

# Looking at the temporal data (features that relate to time, such as the year a house was built, remodelled or sold)

In [None]:
# Temporal features are the columns whose name mentions "Year" or "Yr".
temporal_features = [col for col in train_data.columns if "Year" in col or "Yr" in col]
print(temporal_features)

# One scatter plot per temporal feature against the target.
for temporal_col in temporal_features:
    sns.scatterplot(data=train_data, x=temporal_col, y="SalePrice")
    plt.title(temporal_col)
    plt.show()

# Looking at the Discrete and Continuous features

In [None]:
# A non-temporal column counts as "discrete" when it has fewer than 6 distinct
# (non-null) values; every other non-temporal column counts as "continuous".
unique_counts = train_data.nunique()
discrete_features = [
    col for col in train_data.columns
    if unique_counts[col] < 6 and col not in temporal_features
]
continuous_features = [
    col for col in train_data.columns
    if col not in discrete_features and col not in temporal_features
]

print("Discrete_Features:\n",discrete_features)
print("Continuous_Features:\n",continuous_features)

# Scatter plot of each feature against Sale price on discrete features

In [None]:
def scatterplot(df,feature,target_feature):
    """Show a scatter plot of ``df[feature]`` (x) against ``df[target_feature]`` (y).

    Parameters
    ----------
    df : DataFrame holding both columns.
    feature : name of the column plotted on the x axis; also used as the title.
    target_feature : name of the column plotted on the y axis.
    """
    plt.figure(constrained_layout=True)
    # x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
    sns.scatterplot(x=df[feature],y=df[target_feature])
    plt.title(feature)
    plt.show()
for feat in discrete_features:
    scatterplot(train_data,feat,"SalePrice")


<div class="alert alert-block alert-danger">  
<h2><center><strong>As we can see from the graphs, OverallQual, OverallCond, FullBath, TotRmsAbvGrd and GarageCars have a strong correlation with SalePrice</strong></center></h2>
        
</div>

# Scatter plot of each feature against Sale price on continuous features

In [None]:
for feat in continuous_features:
    scatterplot(train_data,feat,"SalePrice")

As we can see from the graphs, **TotalBsmtSF**, **1stFlrSF**, **GrLivArea** and **GarageArea** have a strong correlation with **SalePrice**

<div class="alert alert-block alert-danger">  
<h2><center><strong>As we can see from the graphs, TotalBsmtSF, 1stFlrSF, GrLivArea and GarageArea have a strong correlation with SalePrice</strong></center></h2>
        
</div>

# Looking at the top 10 most correlated features with SalePrice 

In [None]:
# Correlation is only defined for numeric columns. pandas >= 2.0 raises if
# DataFrame.corr() is called on a frame that still contains object columns,
# so restrict to the numeric ones first.
numeric_train = train_data.select_dtypes(include="number")

# Names of the 10 columns most correlated with SalePrice (SalePrice itself included).
corr_feat = numeric_train.corr().nlargest(10,"SalePrice")["SalePrice"].index

# Correlation matrix of those columns (kept under the original name `cmap`).
cmap = np.corrcoef(train_data[corr_feat].values.T)

# Hide the upper triangle and the diagonal, which only repeat the lower triangle.
mask = np.zeros_like(cmap,dtype=bool)
mask[np.triu_indices_from(mask)] = True

plt.figure(figsize=(20,10))
sns.heatmap(cmap,
            annot=True,
            fmt=".3f",
            annot_kws = {"size":10},
            cmap=sns.cubehelix_palette(),
            xticklabels = corr_feat.values,
            yticklabels = corr_feat.values,
            mask=mask)
plt.show()

<div class="alert alert-block alert-info">  
<h2><center><strong>Data Processing</strong></center></h2>
        
</div>

## Extract the SalePrice out

In [None]:
y = train_data["SalePrice"]

## Combining the train and test dataset

In [None]:
all_data = pd.concat([train_data,test_data],axis=0).reset_index(drop=True)

## Drop the SalePrice & Id columns

In [None]:
all_data = all_data.drop(["SalePrice","Id"],axis=1)

## A function for checking the missing values

In [None]:
def missing_value(df):
    """Summarise the columns of ``df`` that contain missing values.

    Returns a DataFrame indexed by column name with two columns:
    "Total" (number of nulls) and "Percentage" (nulls as a percentage of the
    row count), both sorted in descending order. Columns without any null
    are left out.
    """
    null_counts = df.isnull().sum()

    total = null_counts.sort_values(ascending=False)
    total = total[total > 0]

    share = null_counts * 100 / df.shape[0]
    share = share[share > 0].sort_values(ascending=False)

    return pd.concat([total, share], keys=["Total", "Percentage"], axis=1)
missing_value(all_data)

## Imputing the Missing Values of all data

In [None]:
# Categorical columns whose missing entries are filled with the literal
# string "None".
missing_col = [
    "Alley", "PoolQC", "MiscFeature", "Fence",
    "FireplaceQu", "GarageType", "GarageFinish",
    "GarageQual", "GarageCond", 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'MasVnrType',
]

# Fill all of them in one vectorised assignment.
all_data[missing_col] = all_data[missing_col].fillna("None")

In [None]:
# LotFrontage: houses in the same neighborhood are assumed to have a similar
# lot frontage, so each gap is filled with the median of its own neighborhood.
lot_frontage_by_neighborhood = all_data.groupby("Neighborhood")["LotFrontage"]
all_data["LotFrontage"] = lot_frontage_by_neighborhood.transform(
    lambda grp: grp.fillna(grp.median())
)

In [None]:
# MasVnrArea: same approach as LotFrontage - each gap is filled with the
# median of the house's neighborhood.
mas_vnr_area_by_neighborhood = all_data.groupby("Neighborhood")["MasVnrArea"]
all_data["MasVnrArea"] = mas_vnr_area_by_neighborhood.transform(
    lambda grp: grp.fillna(grp.median())
)

In [None]:
## MSZoning (grouped by MSSubClass)
## Each gap is filled with the mode, i.e. the most frequent MSZoning value
## among the rows that share the same MSSubClass.
zoning_by_subclass = all_data.groupby("MSSubClass")["MSZoning"]
all_data["MSZoning"] = zoning_by_subclass.transform(
    lambda grp: grp.fillna(grp.mode()[0])
)

In [None]:
## GarageYrBlt
## Rows whose GarageFinish is "None" get the house's YearBuilt as their
## GarageYrBlt (values are matched by row index).
no_garage_finish = all_data["GarageFinish"] == "None"
all_data.loc[no_garage_finish, "GarageYrBlt"] = all_data.loc[no_garage_finish, "YearBuilt"]

In [None]:
## Check on the missing value
missing_value(all_data)

In [None]:
### for the rest of the missing value
## categorical feature are replaced with the mode value
## numerical feature are replaced with the median value
missing_feat = missing_value(all_data).index

In [None]:
## getting categorical feature
## The `np.object` alias was removed in NumPy 1.24 (AttributeError), so the
## check uses pandas' dtype helper instead: every column that still has
## missing values and is not numeric is treated as categorical.
missing_cat = [feat for feat in missing_feat if not pd.api.types.is_numeric_dtype(all_data[feat])]

In [None]:
## Fill each remaining categorical column with its mode (most frequent value).
for cat_col in missing_cat:
    most_frequent = all_data[cat_col].mode()[0]
    all_data[cat_col] = all_data[cat_col].fillna(most_frequent)

## numerical feature: whatever still has gaps and is not categorical
missing_num = [col for col in missing_feat if col not in missing_cat]

In [None]:
## Fill each remaining numerical column with its median.
for num_col in missing_num:
    column_median = all_data[num_col].median()
    all_data[num_col] = all_data[num_col].fillna(column_median)
### Check on the missing value
missing_value(all_data)

In [None]:
### Month sold, year sold and year built are treated as categorical features,
### so they are converted to strings.
all_data = all_data.astype({"MoSold": str, "YrSold": str, "YearBuilt": str})

### Normalizing the Dependent Variable SalePrice

In [None]:
## Visualization of the raw SalePrice: histogram, QQ plot and box plot on one figure.
fig = plt.figure(constrained_layout=True, figsize=(12,8))
grid = gridspec.GridSpec(ncols=3, nrows=4, figure=fig)

# Histogram with a KDE overlay. sns.distplot is deprecated; histplot with
# kde=True and a density scale is its documented replacement.
ax1 = fig.add_subplot(grid[0,:])
sns.histplot(y, kde=True, stat="density", ax=ax1)
ax1.set_title("Histrogram of SalePrice")

# QQ plot against the normal distribution.
ax2 = fig.add_subplot(grid[2:,:2])
stats.probplot(y,plot=ax2)
ax2.set_title("QQplot of SalePrice")

# Vertical box plot. The data is passed as y= because recent seaborn versions
# no longer accept x/y positionally.
ax3 = fig.add_subplot(grid[2:,2])
sns.boxplot(y=y,ax=ax3)
ax3.set_title("Boxplot of SalePrice")
plt.show()

<div class="alert alert-block alert-danger">  
<h2><center><strong>The above graphs show the following points</strong></center></h2>
    <li>The SalePrice is not drawn from a normal distribution</li>
<li>The SalePrice is right skewed / positively skewed, which indicates that most people are able to afford lower-priced houses.</li>
<li>There are multiple outliers in SalePrice</li>
        
</div>

In [None]:
##Check on the kurtosis & the skewness of SalePrice
print("Kurtosis: {}".format(y.kurt()))
print("Skewness: {}".format(y.skew()))

<div class="alert alert-block alert-danger">  
<h2><center><strong>As indicated in the three charts above, SalePrice is positively skewed. SalePrice is drawn from a leptokurtic distribution (a distribution with wider tails and a greater profusion of outliers).</strong></center></h2>
    <li>Skewness: Defined as the degree of distortion from the symmetrical bell curve or the normal curve.</li>
<li>Kurtosis: Defined as the measure of the extreme values (also known as outliers) present in the distribution.</li>
        
</div>

In [None]:
## Normalize the dependent variable (SalePrice) with log(1 + y).
## NOTE: this reassigns `y`, so re-running the cell without re-running the cell
## that defines `y` applies the log twice. Run the notebook top to bottom.
y = np.log1p(y)

## Visualization of SalePrice after the normalization
fig,(ax1,ax2) = plt.subplots(2,1,constrained_layout=True,figsize=(12,9))

# Histogram with a KDE overlay (histplot replaces the deprecated sns.distplot).
sns.histplot(y, kde=True, stat="density", ax=ax1)
ax1.set_title("Histrogram of SalePrice")
# QQ plot against the normal distribution.
stats.probplot(y,plot=ax2)
ax2.set_title("QQplot of SalePrice")

plt.show()

<div class="alert alert-block alert-danger">  
<h2><center><strong>It's better now!</strong></center></h2>
    <li>Now let's check on the kurtosis and skewness value of SalePrice</li>
        
</div>

In [None]:
## Kurtosis and skewness of SalePrice
print("Kurtosis: {}".format(y.kurt()))
print("Skewness: {}".format(y.skew()))

## Normalizing the Independent Variables

In [None]:
## Check the skewness and the kurtosis of the numeric columns only.
## The `np.object` alias was removed in NumPy 1.24, so the numeric columns are
## selected with select_dtypes instead of comparing dtypes against it.
numerical_feats = all_data.select_dtypes(include="number").columns.tolist()
skewness = all_data[numerical_feats].skew().sort_values(ascending=False)
kurtosis = all_data[numerical_feats].kurt().sort_values(ascending=False)

# One row per numeric feature, with its skewness and kurtosis side by side.
df_norm = pd.concat([skewness,kurtosis],axis=1,keys=["Skewness","Kurtosis"])

df_norm

In [None]:
### Features with skewness greater than 0.5 or lower than -0.5 are considered highly skewed
high_skew = skewness[abs(skewness) > 0.5].sort_values(ascending=False)

## Visualization of TotalBsmtSF (histplot replaces the deprecated sns.distplot)
plt.figure(figsize=(8,6))
sns.histplot(all_data["TotalBsmtSF"], kde=True, stat="density")
plt.show()

In [None]:
## Look at its kurtosis and skewness value
print("Kurtosis: {}".format(all_data["TotalBsmtSF"].kurt()))
print("Skewness: {}".format(all_data["TotalBsmtSF"].skew()))

In [None]:
## Normalization of the highly skewed independent variables with a Box-Cox
## transform of 1 + x.
## `boxcox1p` and the `stats` module (scipy.stats) come from the notebook's
## import cell, so no imports are repeated here.
for feat in high_skew.index:
    # The lambda is estimated on the shifted column (x + 1), the same quantity
    # boxcox1p transforms.
    all_data[feat] = boxcox1p(all_data[feat], stats.boxcox_normmax(all_data[feat] + 1))
## Visualization of TotalBsmtSF after normalization (histplot replaces the deprecated sns.distplot)
plt.figure(figsize=(8,6))
sns.histplot(all_data["TotalBsmtSF"], kde=True, stat="density")
plt.show()


In [None]:
## Look at its kurtosis and skewness value after the normalization
print("Kurtosis: {}".format(all_data["TotalBsmtSF"].kurt()))
print("Skewness: {}".format(all_data["TotalBsmtSF"].skew()))

### Now the data is normalized well !!

<div class="alert alert-block alert-success">  
<h2><center><strong> Adding new features derived from the existing features, to help the model distinguish between houses when predicting prices!</strong></center></h2>
        
</div>

In [None]:
## TotalHouseSF: The total Square Foot of the house
all_data["TotalHouseSF"] = all_data["TotalBsmtSF"] + all_data["1stFlrSF"] + all_data["2ndFlrSF"]

In [None]:
## TotalBath: The total number of bathrooms in the house.
## Full baths count as 1 and half baths as 0.5, for both the basement and the
## above-ground floors. (The half-bath term previously reused BsmtFullBath,
## which counted basement full baths 1.5 times and ignored BsmtHalfBath.)
all_data["TotalBath"] = (
    all_data["BsmtFullBath"]
    + all_data["BsmtHalfBath"]*0.5
    + all_data["FullBath"]
    + all_data["HalfBath"]*0.5
)

In [None]:
## TotalPorchSF: The total square foot of porch area of the house
all_data["TotalPorchSF"] = all_data["WoodDeckSF"] + all_data["OpenPorchSF"] + all_data["EnclosedPorch"] + all_data["3SsnPorch"] + all_data["ScreenPorch"] 

In [None]:
## HouseRemodAge: number of years between the remodel and the sale.
## YrSold was converted to str earlier, hence the cast back to int.
## Negative ages are floored at 0.
years_since_remodel = all_data["YrSold"].astype(int) - all_data["YearRemodAdd"]
all_data["HouseRemodAge"] = years_since_remodel.clip(lower=0)

In [None]:
## Indicator helper used to build the Has* flags.
def presence(x):
    """Return 1 when ``x`` is strictly greater than 0, otherwise 0."""
    if x > 0:
        return 1
    return 0

In [None]:
## HasPool: Presence of pool
all_data["HasPool"] = all_data["PoolArea"].transform(presence)

In [None]:
## Has2ndFlr: Presence of second floor
all_data["Has2ndFlr"] = all_data["2ndFlrSF"].transform(presence)

In [None]:
## HasGarage: Presence of garage
all_data["HasGarage"] = all_data["GarageArea"].transform(presence)

In [None]:
## HasBsmt: Presence of basement
all_data["HasBsmt"] = all_data["TotalBsmtSF"].transform(presence)

In [None]:
## HasFirePlace: Presence of fireplace
all_data["HasFirePlace"] = all_data["Fireplaces"].transform(presence)

## Deleting the Biased Features


In [None]:
## Bias feature reducer: a column is "biased" when its most frequent value
## makes up more than 99.94 % of all rows.
n_rows = len(all_data)
bias_feat = [
    col for col in all_data.columns
    if all_data[col].value_counts().iloc[0] / n_rows * 100 > 99.94
]

bias_feat

In [None]:
## Remove the bias feature from the dataset
all_data = all_data.drop(bias_feat,axis=1)

## Converting the categorical features into numeric form by applying the get_dummies function

In [None]:
all_data = pd.get_dummies(all_data).reset_index(drop=True)

# Now splitting the data for training and testing with same index ID's

In [None]:
# The combined frame was built as [train rows, test rows], so the first
# len(y) rows are the training set and the rest are the test set.
n = len(y)
train_data, test_data = all_data.iloc[:n], all_data.iloc[n:]

# Splitting the train data into 67% for training and 33% for testing

In [None]:
X_train, X_test, y_train, y_test =  train_test_split(train_data,y,test_size=0.33,random_state=42)
print("Shapes of data: ", X_train.shape, X_test.shape, y_train.shape, y_test.shape)

<div class="alert alert-block alert-info">  
<h2><center><strong> Building the models for training and testing</strong></center></h2>
        
</div>

In [None]:
# Candidate regressors. Every estimator that accepts a random_state gets the
# same seed so the cross-validation scores are reproducible across runs
# (GradientBoostingRegressor and DecisionTreeRegressor were previously unseeded).
seed = 42
models = [Ridge(tol=10,random_state=seed),
          Lasso(tol=1,random_state=seed),
          RandomForestRegressor(random_state=seed),
          ExtraTreesRegressor(random_state=seed),
          GradientBoostingRegressor(random_state=seed),
          DecisionTreeRegressor(random_state=seed),
          KNeighborsRegressor()]

model_names = ["Ridge","Lasso","RFR","ETR","GBoost_Reg","DT_Reg","KNN_Reg"]

## Wrap each model in a pipeline that standardises the features first.
## Each entry is a ("Scaled_<name>", Pipeline) tuple.
pipeline_models = [
    ("Scaled_" + name,
     Pipeline([("Scaler", StandardScaler()),
               (name, model)]))
    for name, model in zip(model_names, models)
]

<div class="alert alert-block alert-info">  
<h2><center><strong> Training the models</strong></center></h2>
        
</div>

In [None]:
## Cross-validate every pipeline and collect the scores.
## The rows are gathered in a list and turned into a DataFrame once, instead of
## growing and re-sorting the frame inside the loop; this also leaves "cv" as a
## float column rather than object.
kfold = KFold(n_splits=7,shuffle=True,random_state=42)

cv_records = []
for name,model in pipeline_models:
    cv = cross_val_score(model, X_train, y_train, cv=kfold, n_jobs=-1, scoring="r2")
    cv_records.append({
        "model": name,
        "cv": round(cv.mean(),3),
        "std": "+/- {}".format(round(cv.std(),4)),
    })

## One row per model (mean R^2 and its spread), best model first.
evaluate = pd.DataFrame(cv_records, columns=["model","cv","std"])
evaluate = evaluate.sort_values("cv",ascending=False)

In [None]:
## Visualization: one bar per model showing its mean cross-validation R^2.
fig, ax = plt.subplots(1,1,sharey=False,figsize=(16,9))

## x/y are passed by keyword (seaborn >= 0.12 rejects positional data) and the
## scores are cast to float in case the column is stored as object dtype.
bar = sns.barplot(x=evaluate["model"], y=evaluate["cv"].astype(float),
                  ax=ax, palette=sns.cubehelix_palette(evaluate.shape[0]))

# Write each bar's value just above it.
for rec in bar.patches:
    height = rec.get_height()
    ax.text(rec.get_x() + rec.get_width()/2, height*1.02,height,ha="center")
ax.set_title("Cross Validate Score")
# Rotate the model names that barplot already placed on the x axis.
ax.tick_params(axis="x", labelrotation=50)
plt.show()

<div class="alert alert-block alert-danger">  
<h2><center><strong> Best Model is Gradient Boosting Regressor</strong></center></h2>
        
</div>

In [None]:
# Fit the selected model on the training split. A fixed random_state makes the
# fitted model, and therefore the submission, reproducible.
final_model = GradientBoostingRegressor(random_state=42)
final_model = final_model.fit(X_train,y_train)

# Report R^2 on the hold-out split, which was created earlier but never evaluated.
print("Hold-out R^2:", round(final_model.score(X_test, y_test), 3))

<div class="alert alert-block alert-success">  
<h1><center><strong> Submitting the predicted prices of house on test data</strong></center></h1>
        
</div>

In [None]:
# The model was trained on log1p(SalePrice), so expm1 maps the predictions back
# to prices before they are rounded down.
predicted_prices = np.floor(np.expm1(final_model.predict(test_data)))

# Pair each prediction with the Id captured when the test file was loaded,
# rather than relying on the sample submission having the same row order.
submission_results = pd.DataFrame({"Id": ids_test_data, "SalePrice": predicted_prices})

# Write the file with a .csv extension (it was previously written without one).
submission_results.to_csv('submission_results.csv', index=False)

# <img src="https://thumbs.dreamstime.com/t/bright-colorful-thank-you-banner-vector-overlapping-letters-118244535.jpg">