# Read Data

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.figure import Figure as fig
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from matplotlib.ticker import MaxNLocator
import seaborn as sns

import scipy
from scipy.stats import norm, skew
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
%matplotlib inline

In [None]:
# Directory holding the competition CSV files; change this one constant to run elsewhere.
DATA_DIR = '/kaggle/input/house-prices-advanced-regression-techniques'

# Raw training and test tables, read with pandas' default settings.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

# Exploratory Data Analysis (EDA)

In [None]:
# Explore on a copy so that edits to `df` do not modify `train`.
df=train.copy()
df.shape

In [None]:
# Preview the first rows of the exploration frame.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Remove the `Id` column from the exploration frame and from the test frame.
df = df.drop(columns=['Id'])
test = test.drop(columns=['Id'])

Now let's take a look at how the sale price is distributed.

In [None]:
# Summary statistics followed by a density histogram with a KDE overlay.
# `sns.histplot(..., kde=True, stat='density')` replaces the deprecated
# `sns.distplot`, which seaborn has scheduled for removal.
print(df['SalePrice'].describe())
plt.figure(figsize=(9, 8))
sns.histplot(df['SalePrice'], color='g', bins=100, kde=True, stat='density', alpha=0.4);

* **With this information we can see that the prices are right-skewed and that some outliers lie above ~500,000. We will eventually want to get rid of them to get a more normal distribution of the target (dependent) variable (`SalePrice`) for machine learning.**

## Numerical data distribution

To do so, let's first list all the data types in our dataset and keep only the numerical ones:

In [None]:
# Distinct column dtypes present in the frame (set order is arbitrary).
list(set(df.dtypes.tolist()))

In [None]:
# Keep only the float64 / int64 columns for the numeric analysis below.
numeric_dtypes = ['float64', 'int64']
df_num = df.select_dtypes(include=numeric_dtypes)
df_num.head()

In [None]:
# One small panel per numeric column: histogram plus a fitted normal curve,
# which makes the skewness of each variable visible.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases; it is
# kept here because `histplot` has no direct equivalent of `fit=`.
def _dist_with_norm_fit(values, **kwargs):
    # FacetGrid.map passes extra keyword arguments; they are intentionally ignored.
    return sns.distplot(values, fit=norm)

f = df_num.melt()  # long format: one (variable, value) row per cell
g = sns.FacetGrid(f, col="variable", col_wrap=4, sharex=False, sharey=False)
g.map(_dist_with_norm_fit, 'value')

* **Features such as `1stFlrSF`, `TotalBsmtSF`, `LotFrontage`, `GrLivArea`... seem to share a distribution similar to the one we see for `SalePrice`.**

## CATEGORICAL FEATURES

In [None]:
# Object-dtype columns only, i.e. the categorical features.
df_cat=df.select_dtypes(include='object')

In [None]:
# (rows, number of categorical columns)
df_cat.shape

In [None]:
# Names of the categorical columns.
df_cat.columns

In [None]:
# Summary statistics per categorical column, transposed so each feature is a row.
df_cat.describe(include='all').T

In [None]:
def srt_box(y, df, n_cols=3):
    """Draw one box plot of `y` for every object-dtype column of `df`.

    Within each panel the categories are ordered by descending median of `y`.

    Parameters
    ----------
    y : str
        Name of the numeric column plotted on the y axis.
    df : pandas.DataFrame
        Frame containing `y` and the categorical columns to plot.
    n_cols : int, default 3
        Number of panels per row in the figure grid.

    Returns
    -------
    None
        The figure is drawn as a side effect.
    """
    # Take the categorical columns from the frame that was passed in, not from
    # a notebook-level global, so the function works for any frame.
    cat_cols = df.select_dtypes(include='object').columns
    if len(cat_cols) == 0:
        return

    # Size the grid from the number of columns so that no feature is silently
    # dropped when there are more columns than panels.
    n_rows = -(-len(cat_cols) // n_cols)  # ceiling division
    figure, axes = plt.subplots(n_rows, n_cols,
                                figsize=(25, 80 * n_rows / 14),
                                squeeze=False)
    axes = axes.flatten()

    for col, ax in zip(cat_cols, axes):
        order = df.groupby(col)[y].median().sort_values(ascending=False).index
        sns.boxplot(x=col,
                    y=y,
                    data=df,
                    palette='plasma',
                    order=order,
                    ax=ax)
        ax.tick_params(labelrotation=45)
        ax.yaxis.set_major_locator(MaxNLocator(nbins=18))

    # Hide panels of the last row that received no feature.
    for ax in axes[len(cat_cols):]:
        ax.set_visible(False)

    # Lay out once, after all panels exist, instead of on every iteration.
    figure.tight_layout()

srt_box('SalePrice', df)

In [None]:
# Violin plot (with embedded box) of sale price for each zoning class.
mszoning_violin = go.Violin(x=df["MSZoning"], y=df["SalePrice"],
                            box_visible=True, name="MSZoning")
mszoning_fig = go.Figure(data=[mszoning_violin])
mszoning_fig.update_layout(title="MSZoning", xaxis_title="MSZoning", yaxis_title="Prices")
mszoning_fig.show()

MSZoning:

* Floating village houses have the highest median price.
* Residential low density houses come second with some outliers.
* Residential high and medium density are similar, while commercial has the lowest prices.

In [None]:
# Number of houses per land-contour category.
df["LandContour"].value_counts()

In [None]:
# Median sale price per land-contour category, drawn as horizontal bars.
gb = df.groupby("LandContour")["SalePrice"].median()

landcontour_fig = go.Figure(data=[go.Bar(x=gb, y=gb.index, orientation='h')])
landcontour_fig.update_layout(title="LandContour", xaxis_title="Prices", yaxis_title="LandContour")
landcontour_fig.show()

LandContour:

* Hillside houses are more expensive than others.
* Banked houses are the least expensive.

In [None]:
# Box plot of sale price for each neighborhood.
neighborhood_box = go.Box(x=df["Neighborhood"], y=df["SalePrice"], name="Neighborhood")
neighborhood_fig = go.Figure(data=[neighborhood_box])
neighborhood_fig.update_layout(title="Neighborhood", xaxis_title="Neighborhood", yaxis_title="Prices")
neighborhood_fig.show()

Neighborhood:

* Top 3 expensive areas: Northridge Heights, Northridge, Timberland.
* Above average: Somerset, Veenker, Crawford, Clear Creek, College Creek, Bloomington Heights.
* Sawyer West has a wide price range.
* Below average: Old Town, Edwards (with some outliers).
* Cheapest areas: Briardale, Iowa DOT & Rail Road, Meadow Village.

In [None]:
# Overlay sale price against both proximity-condition columns, one trace per column.
condition_traces = [
    go.Scatter(x=df[column], y=df["SalePrice"], mode='markers', name=trace_name)
    for column, trace_name in (("Condition1", "condition1"), ("Condition2", "condition2"))
]
conditions_fig = go.Figure(data=condition_traces)
conditions_fig.update_layout(title="Conditions", xaxis_title="Conditions", yaxis_title="Prices")
conditions_fig.show()

Conditions:

* Proximity to North-South Railroad positively affects prices.
* Being near positive off-site features (parks, greenbelt) increases prices.

In [None]:
# Number of houses per masonry-veneer type.
df["MasVnrType"].value_counts()

In [None]:
# Median sale price per masonry-veneer type, shown as pie slices
# (the first two slices are pulled slightly out of the pie).
gb = df.groupby("MasVnrType")["SalePrice"].median()

masvnr_pie = go.Pie(labels=gb.index, values=gb, pull=[0.05, 0.01, 0])
masvnr_fig = go.Figure(data=[masvnr_pie])
masvnr_fig.update_layout(title="MasVnrType")
masvnr_fig.show()

MasVnrType:

* Stone masonry veneer is priced higher than brick veneer.

In [None]:
# Number of houses with / without central air.
df["CentralAir"].value_counts()

In [None]:
# Median sale price by central-air flag, shown as pie slices
# (the first slice is pulled slightly out of the pie).
gb = df.groupby("CentralAir")["SalePrice"].median()

centralair_pie = go.Pie(labels=gb.index, values=gb, pull=[0.05, 0.0])
centralair_fig = go.Figure(data=[centralair_pie])
centralair_fig.update_layout(title="CentralAir")
centralair_fig.show()

CentralAir:

* Central air system has a positive impact on sale prices.

In [None]:
# Number of houses per garage type.
df["GarageType"].value_counts()

In [None]:
# Median sale price per garage type as vertical bars of width 0.5.
gb = df.groupby("GarageType")["SalePrice"].median()

garage_bar = go.Bar(x=gb.index, y=gb, width=[.5] * 6)
garage_fig = go.Figure(data=[garage_bar])
garage_fig.update_layout(title="GarageType", xaxis_title="GarageType", yaxis_title="Prices")
garage_fig.show()

GarageType:

* Built-in garages are the most expensive.
* Attached garages follow in price.
* Car ports are the least expensive.

# Data Preprocessing

## Handling Missing Values

In [None]:
# Stack the train rows on top of the test rows so the preprocessing below is applied to both.
all_data=pd.concat([train,test])

In [None]:
print(all_data.shape)
# concat kept each frame's own row labels; replace them with a fresh 0..n-1 index.
all_data = all_data.reset_index(drop=True)

In [None]:
# The combined frame holds features only, so the target column is removed.
all_data = all_data.drop(columns=['SalePrice'])

In [None]:
# Percentage of missing values per column, largest first; complete columns are left out.
missing_pct = all_data.isnull().sum() * 100 / all_data.shape[0]
nan_all_data = missing_pct[missing_pct != 0].sort_values(ascending=False)
miss_df = nan_all_data.to_frame(name='Missing Ratio')
miss_df


In [None]:
# Delete the features with a very high share of missing values.
sparse_features = ['PoolQC', 'Alley', 'Fence', 'MiscFeature', 'FireplaceQu']

# The combined frame additionally loses its `Id` column here.
all_data.drop(columns=sparse_features + ['Id'], inplace=True)

for frame in (test, df):
    frame.drop(columns=sparse_features, inplace=True)

In [None]:
#Lot Frontage
# Print the dtype, scatter log1p(LotFrontage) against SalePrice, then print summary statistics.
print(df['LotFrontage'].dtype)
plt.scatter(x=np.log1p(df['LotFrontage']),y=df['SalePrice'])
print(df['LotFrontage'].describe())


* **There is some relation between `LotFrontage` and `SalePrice`, both in the scatter plot and in the correlation value. Therefore, instead of deleting the feature, I will impute its missing values with the mean.**

In [None]:
# Impute missing LotFrontage with the column mean.
# The result is assigned back to the column: calling `fillna(..., inplace=True)`
# on `all_data['LotFrontage']` acts on a temporary Series and does not update
# the frame once pandas Copy-on-Write is active.
all_data['LotFrontage'] = all_data['LotFrontage'].fillna(all_data['LotFrontage'].mean())

In [None]:
# Garage features (quality, condition, finish, type, ...) look relevant to the price.
# A missing value here is taken to mean "no garage": numeric columns get 0 and
# categorical columns get the string 'None'.
garage_fill_values = {
    'GarageYrBlt': 0,
    'GarageArea': 0,
    'GarageCars': 0,
    'GarageQual': 'None',
    'GarageFinish': 'None',
    'GarageCond': 'None',
    'GarageType': 'None',
}

# Assign the filled column back instead of using `inplace=True` on a column
# selection, which does not modify `all_data` under pandas Copy-on-Write.
for col, fill_value in garage_fill_values.items():
    all_data[col] = all_data[col].fillna(fill_value)

In [None]:
# Basement-related features.
# Missing values are treated as "no basement": 0 for the numeric columns and
# 'None' for the categorical ones.
basement_numeric = ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath')
basement_categorical = ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2')

# Assign back rather than `fillna(..., inplace=True)` on a column selection,
# which is a no-op on the frame under pandas Copy-on-Write.
for col in basement_numeric:
    all_data[col] = all_data[col].fillna(0)

for col in basement_categorical:
    all_data[col] = all_data[col].fillna('None')



In [None]:
# Missing masonry veneer: area becomes 0 and type becomes 'None'.
# Assigned back explicitly; an inplace fillna on a column selection does not
# update `all_data` under pandas Copy-on-Write.
all_data['MasVnrArea'] = all_data['MasVnrArea'].fillna(0)
all_data['MasVnrType'] = all_data['MasVnrType'].fillna('None')

In [None]:
# The remaining incomplete columns are all categorical (kitchen quality etc.),
# so each is filled with its most frequent value.
mode_filled_columns = ['MSZoning', 'Functional', 'SaleType', 'KitchenQual',
                       'Exterior2nd', 'Exterior1st', 'Electrical', 'Utilities']

# Assign back instead of `fillna(..., inplace=True)` on a column selection,
# which leaves `all_data` unchanged under pandas Copy-on-Write.
for col in mode_filled_columns:
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])

In [None]:
# Recompute the missing-value percentages to confirm the imputation covered every column.
missing_pct = all_data.isnull().sum() * 100 / all_data.shape[0]
nan_all_data = missing_pct[missing_pct != 0].sort_values(ascending=False)
miss_df = nan_all_data.to_frame(name='Missing Ratio')
miss_df



**Finally, no null values remain.**

## Encode the Categorical Features

In [None]:
# One-hot encode every object-dtype column with scikit-learn's OneHotEncoder.
from sklearn.preprocessing import OneHotEncoder

categorical_columns = all_data.select_dtypes(include=['object']).columns.tolist()

# Guard so that re-running the cell after encoding (no object columns left)
# is a no-op instead of fitting the encoder on an empty selection.
if categorical_columns:
    encoder = OneHotEncoder(sparse_output=False)
    one_hot_encoded = encoder.fit_transform(all_data[categorical_columns])

    # Reuse all_data's index so the column-wise concat pairs rows by the same
    # labels, whatever index all_data carries at this point.
    one_hot_df = pd.DataFrame(
        one_hot_encoded,
        columns=encoder.get_feature_names_out(categorical_columns),
        index=all_data.index,
    )

    df_encoded = pd.concat([all_data, one_hot_df], axis=1)

    # Replace the original string columns with their indicator columns.
    all_data = df_encoded.drop(categorical_columns, axis=1)

# Show a preview rather than the whole frame.
all_data.head()


In [None]:
# for col in all_data.columns:
#     if(all_data[col].dtype == 'object'):
#         le=LabelEncoder()
#         all_data[col]=le.fit_transform(all_data[col])

In [None]:
# Split the combined frame back into its train and test parts.
# `df` has one row per original training example, so its length is the boundary.
# Positional `.iloc` slicing is used because `.loc` slices are label-based and
# end-inclusive: an offset boundary would put test rows into `train` and drop
# rows from `test`.
n_train = df.shape[0]
train = all_data.iloc[:n_train].copy()   # .copy(): later column assignments act on an independent frame
test = all_data.iloc[n_train:].copy()

In [None]:
# Attach the target, aligned on the row index. `assign` returns a new frame,
# so nothing is written into a slice of `all_data`.
train = train.assign(SalePrice=df['SalePrice'])

# A row without a real sale price cannot be used for supervised training.
# Such rows are dropped rather than filled with the mean price, which would
# invent labels.
train = train.dropna(subset=['SalePrice'])

# Expected to print 0.
print(train['SalePrice'].isnull().sum())

In [None]:
# Sanity check: row and column counts of the two parts after the split.
print(train.shape)
print(test.shape)

## Handling Skewness

In [None]:
# #log transform skewed numeric features:

# train['SalePrice'] = np.log1p(train['SalePrice'])

In [None]:
# Preview the encoded training frame.
train.head()

## Handling Outliers

In [None]:
def remove_outliers(df, column):
    """Return the rows of `df` whose `column` value lies within the IQR fences.

    The fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR, both inclusive. Rows
    whose value is NaN fail the range check and are therefore left out.
    The input frame is not modified.
    """
    values = df[column]
    q1, q3 = values.quantile([0.25, 0.75])
    whisker = 1.5 * (q3 - q1)
    inside_fences = values.between(q1 - whisker, q3 + whisker)
    return df[inside_fences]

# Apply the IQR filter column by column; each pass works on the rows that
# survived the previous one, so the order of the columns matters.
outlier_columns = ['SalePrice', 'LotFrontage', 'TotalBsmtSF', 'LotArea', 'GrLivArea', 'GarageArea']

print(f"Original shape: {train.shape}")
for outlier_column in outlier_columns:
    train = remove_outliers(train, outlier_column)
print(f"Shape after outlier removal: {train.shape}")


## Correlation

We'll try to find which features are strongly correlated with `SalePrice`. We'll store them in a variable called `golden_features`.

In [None]:
# Pearson correlation of every column with the target; keep those with |r| > 0.4.
cat_corr = train.corrwith(train['SalePrice'], method='pearson')
is_strong = cat_corr.abs() > 0.4
golden_features = (
    cat_corr[is_strong]
    .sort_values(ascending=False)
    .drop('SalePrice')   # the target trivially correlates with itself
)
print("There is {} strongly correlated values with SalePrice:\n{}".format(len(golden_features), golden_features))

In [None]:
# Spearman (rank) correlation of every column with the target, as a
# complement to the Pearson values: it captures monotonic, not only linear,
# relationships.
corr_spearman = train.corrwith(train['SalePrice'], method='spearman')

# Filter the Spearman series itself, keeping |rho| > 0.4.
golden_features_sp = corr_spearman[corr_spearman.abs() > 0.4].sort_values(ascending=False)
golden_features_sp = golden_features_sp.drop('SalePrice')
print("There is {} strongly correlated values with SalePrice:\n{}".format(len(golden_features_sp), golden_features_sp))

In [None]:
# Pairwise correlations among the strongly correlated features and the target.
heatmap_columns = golden_features.index.tolist() + ['SalePrice']
cor_mat = train[heatmap_columns].corr()

fig, ax = plt.subplots(figsize=(30, 12))
sns.heatmap(cor_mat, annot=True, fmt='.2f', cmap='Blues', cbar=True, ax=ax)

* **we now have a list of strongly correlated values**

In [None]:
from sklearn.model_selection import train_test_split

# Target vector and feature matrix (every column except the target).
y = train['SalePrice']
X = train.drop(columns='SalePrice')

# Hold out 20% of the rows for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train.shape, X_test.shape

## Scaling Features

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# Learn the min/max of each feature on the training split only, then apply the
# same transformation to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
# Display the scaled training matrix.
X_train

# Regression Models

In [None]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

In [None]:
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
import math

#### LINEAR REGRESSION

In [None]:
# Ordinary least squares on the scaled training features.
lr = LinearRegression().fit(X_train, y_train)
lr

In [None]:
# Score the linear model on the data it was fitted on.
print('LinearRegressio Train : ')
y_pred = lr.predict(X_train)
mae, mse, r2 = (
    metric(y_train, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
rmse = math.sqrt(mse)
for template, score in (
    ('MAE is {}', mae),
    ('MSE is {}', mse),
    ('R2 score is {}', r2),
    ('RMSE score is {}', rmse),
):
    print(template.format(score))

In [None]:
# Score the linear model on the held-out split.
print('LinearRegressio Test : ')
y_pred = lr.predict(X_test)
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
rmse = math.sqrt(mse)

for template, score in (
    ('MAE is {}', mae),
    ('MSE is {}', mse),
    ('R2 score is {}', r2),
    ('RMSE score is {}', rmse),
):
    print(template.format(score))

#### LASSO

In [None]:
# L1-regularised linear regression with scikit-learn's default settings.
reg_lasso = Lasso().fit(X_train, y_train)
reg_lasso

In [None]:
# Score the Lasso model on the data it was fitted on.
print('Lasso Train : ')
y_pred = reg_lasso.predict(X_train)
mae, mse, r2 = (
    metric(y_train, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
rmse = math.sqrt(mse)
for template, score in (
    ('MAE is {}', mae),
    ('MSE is {}', mse),
    ('R2 score is {}', r2),
    ('RMSE score is {}', rmse),
):
    print(template.format(score))

In [None]:
# Score the Lasso model on the held-out split.
print('Lasso Test : ')
y_pred = reg_lasso.predict(X_test)
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
rmse = math.sqrt(mse)

for template, score in (
    ('MAE is {}', mae),
    ('MSE is {}', mse),
    ('R2 score is {}', r2),
    ('RMSE score is {}', rmse),
):
    print(template.format(score))

#### RIDGE

In [None]:
# L2-regularised linear regression with scikit-learn's default settings.
reg_ridge = Ridge().fit(X_train, y_train)
reg_ridge

In [None]:
# Score the Ridge model on the data it was fitted on.
print('Ridge Train : ')
y_pred = reg_ridge.predict(X_train)
mae, mse, r2 = (
    metric(y_train, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
rmse = math.sqrt(mse)
for template, score in (
    ('MAE is {}', mae),
    ('MSE is {}', mse),
    ('R2 score is {}', r2),
    ('RMSE score is {}', rmse),
):
    print(template.format(score))

In [None]:
# Score the Ridge model on the held-out split.
print('Ridge Test : ')
y_pred = reg_ridge.predict(X_test)
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
rmse = math.sqrt(mse)

for template, score in (
    ('MAE is {}', mae),
    ('MSE is {}', mse),
    ('R2 score is {}', r2),
    ('RMSE score is {}', rmse),
):
    print(template.format(score))