In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')
np.random.seed(42)

In [None]:
# Read the competition's training and test CSV files into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/house-prices-advanced-regression-techniques/train.csv',
        '../input/house-prices-advanced-regression-techniques/test.csv',
    )
)

In [None]:
# Display the (rows, columns) shape of the loaded training frame.
train_df.shape


# Data Inspection

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Print the column-level summary (dtypes and non-null counts) of the training data.
train_df.info()

In [None]:
# Drop the 'Id' column from the training frame.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this cell
# re-runnable: executing it a second time is a no-op rather than a KeyError.
train_df = train_df.drop(columns=['Id'], errors='ignore')
train_df.shape

In [None]:
# Descriptive statistics of the training data.
train_df.describe()

In [None]:
# Build two views of the training data: non-object columns are treated as numeric
# features, object columns as categorical features.
numerical = train_df.select_dtypes(exclude='object')
categorical = train_df.select_dtypes(include='object')
numerical.columns

In [None]:
# Correlation analysis: annotated heatmap of the pairwise correlations of the numeric columns.
corr_matrix = numerical.corr()
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(corr_matrix, annot=True, ax=ax)

Some features are highly correlated with each other (correlation above 0.6):

GarageYrBlt with YearBuilt 

GarageYrBlt with YearRemodAdd 

1stFlrSF with TotalBsmtSF 

GarageArea with GarageCars

TotRmsAbvGrd with GrLivArea 

1stFlrSF with GrLivArea 

BedroomAbvGr with TotRmsAbvGrd 

TotRmsAbvGrd with 2ndFlrSF


So it is better to drop one feature from each of these pairs.

We can also see that some columns have a correlation above 0.5 with SalePrice, which makes them good candidate predictors. Let's list these columns and plot them against SalePrice:

In [None]:
# Correlation of every numeric column with SalePrice, largest value first.
saleprice_corr = numerical.corr()["SalePrice"]
saleprice_corr.sort_values(ascending=False)

In [None]:
# Columns whose absolute correlation with SalePrice exceeds 0.5.
# The correlation matrix is restricted to non-object columns: calling DataFrame.corr() on a
# frame that still holds string columns raises on pandas >= 2.0 (numeric_only defaults to
# False there). The matrix is also computed once instead of twice.
numeric_corr = train_df.select_dtypes(exclude=['object']).corr()
highest_corr_features = numeric_corr.index[numeric_corr["SalePrice"].abs() > 0.5]
highest_corr_features

In [None]:
# Print the column-level summary of the training data again (dtypes and non-null counts).
train_df.info()

In [None]:
 #Scatter Plot of Saleprice with columns:  OverallQual,  GrLivArea, GarageArea, TotalBsmtSF, FullBath, YearBuilt, YearRemodAdd,,TotRmsAbvGrd,1stFlrSF,GarageCars     
var_ind = [16,45,60, 61,37,48,18,19,42,53,]
plot = plt.figure(figsize = (12, 12))
plot.subplots_adjust(hspace = 0.9, wspace = 0.6)
for i in range(1,11):
    a = plot.add_subplot(4,3 , i)
    a.scatter(x = train_df.iloc[: , var_ind[i - 1]], y = train_df.iloc[: , 79], alpha = 0.5)
    a.title.set_text('Saleprice vs. ' + train_df.columns[var_ind[i - 1]])

In [None]:
# Box plots of SalePrice and of the features most correlated with it.
# Columns are named explicitly rather than addressed by hard-coded position.
box_features = ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF',
                'FullBath', 'YearBuilt', 'YearRemodAdd', '1stFlrSF', 'TotRmsAbvGrd',
                'SalePrice']
# `l` stays a list of positional indices because the outlier-percentage cell below
# selects these columns with train_df.iloc[:, l].
l = [train_df.columns.get_loc(name) for name in box_features]

plot = plt.figure(figsize=(12, 12))
plot.subplots_adjust(hspace=0.5, wspace=0.5)
# 4x3 grid; eleven of the twelve slots are used.
for position, feature in enumerate(box_features, start=1):
    a = plot.add_subplot(4, 3, position)
    a.boxplot(train_df[feature])
    a.title.set_text(feature)

As we can see, there are some outliers in these features.

# Drop Outliers

In [None]:
# Percentage of outliers in each of the columns selected above, using the 1.5 * IQR rule.
selected_columns = train_df.iloc[:, l]
n_rows = selected_columns.shape[0]
for col_name, values in selected_columns.items():
    lower_q = values.quantile(0.25)
    upper_q = values.quantile(0.75)
    spread = upper_q - lower_q
    # A value counts as an outlier when it lies on or beyond either fence.
    is_outlier = (values <= lower_q - 1.5 * spread) | (values >= upper_q + 1.5 * spread)
    outlier_pct = is_outlier.sum() * 100.0 / n_rows
    print("Column %s outliers = %.2f%%" % (col_name, outlier_pct))

In [None]:
# Outliers between SalePrice and GrLivArea.
# The guide lines mark the cut-offs applied by the filter in the next cell
# (GrLivArea > 4500 and SalePrice < 200000). The vertical line used to be drawn at 4600,
# which did not match that filter.
sns.scatterplot(data=train_df, x='GrLivArea', y='SalePrice')
plt.axhline(y=200000, color='black')
plt.axvline(x=4500, color='red')


In [None]:
# Keep every row except those with GrLivArea > 4500 and SalePrice < 200000.
large_but_cheap = (train_df['GrLivArea'] > 4500) & (train_df['SalePrice'] < 200000)
data1 = train_df[~large_but_cheap]
print(data1.shape)

In [None]:
# Outliers between SalePrice and TotalBsmtSF, with guide lines at the filter cut-offs.
bsmt_ax = sns.scatterplot(x='TotalBsmtSF', y='SalePrice', data=data1)
bsmt_ax.axhline(y=500000, color='black')
bsmt_ax.axvline(x=3000, color='red')

In [None]:
# Keep every row except those with TotalBsmtSF > 3000 and SalePrice <= 500000.
big_basement_cheap = (data1['TotalBsmtSF'] > 3000) & (data1['SalePrice'] <= 500000)
data1 = data1[~big_basement_cheap]
print(data1.shape)

In [None]:
# Outliers between SalePrice and GarageArea, with guide lines at the filter cut-offs.
garage_ax = sns.scatterplot(x='GarageArea', y='SalePrice', data=data1)
garage_ax.axhline(y=260000, color='black')
garage_ax.axvline(x=1240, color='red')

In [None]:
# Keep every row except those with GarageArea > 1240 and SalePrice < 260000.
big_garage_cheap = (data1['GarageArea'] > 1240) & (data1['SalePrice'] < 260000)
# Later cells modify data1 (column drops and fillna assignments). Taking an explicit copy
# of the filtered frame makes those writes unambiguous instead of triggering pandas'
# SettingWithCopy checks on a boolean-indexed result.
data1 = data1[~big_garage_cheap].copy()
print(data1.shape)

# Feature Selection and Missing Data

In [None]:
# Number of missing values per column, largest first.
data1.isna().sum().sort_values(ascending=False)

In [None]:
# Percentage of null values per column, as a two-column frame (column_name, percentage).
null_df = (
    data1.isna()
    .mean()
    .mul(100)
    .rename_axis('column_name')
    .reset_index(name='percentage')
)
null_df


In [None]:
# Columns with more than 40% missing values, ordered by missing percentage.
null_df.query('percentage > 40').sort_values(by='percentage')

We drop these features: each of them is missing in more than 40% of the rows, so they carry too little information to be useful predictors.

In [None]:
# Drop the mostly-empty columns (presumably the ones flagged by the >40% missing filter
# above; the list is hard-coded, so verify it against that output).
# Reassigning with errors='ignore' instead of inplace=True keeps the cell re-runnable:
# a second execution is a no-op rather than a KeyError.
sparse_columns = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
data1 = data1.drop(columns=sparse_columns, errors='ignore')
data1.shape

In [None]:
# Remove one column from each highly correlated pair identified with the heatmap.
# Reassigning with errors='ignore' instead of inplace=True keeps the cell re-runnable:
# a second execution is a no-op rather than a KeyError.
collinear_columns = ['1stFlrSF', 'TotRmsAbvGrd', 'GarageCars', 'GarageYrBlt']
data1 = data1.drop(columns=collinear_columns, errors='ignore')
data1.shape

In [None]:
# Re-split the cleaned frame by dtype: non-object columns vs. object columns.
numerical1 = data1.select_dtypes(exclude='object')
categorical1 = data1.select_dtypes(include='object')
categorical1.columns

In [None]:
# Fill missing values in each numeric column with that column's mean.
numeric_fill_values = {col: data1[col].mean() for col in numerical1.columns}
data1 = data1.fillna(value=numeric_fill_values)
    


In [None]:
# Fill missing values in each categorical column with that column's most frequent value.
categorical_fill_values = {col: data1[col].mode()[0] for col in categorical1.columns}
data1 = data1.fillna(value=categorical_fill_values)
    

In [None]:
# Re-check the number of missing values per column after imputation, largest first.
data1.isna().sum().sort_values(ascending=False)

# One-Hot Encoding of Categorical Features

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first level of each
# encoded column.
data1 = pd.get_dummies(data=data1, drop_first=True)
data1.head()

In [None]:
# Print the column-level summary of the encoded frame.
data1.info()

# Building the Prediction Models

### Model 1: Linear Regression

In [None]:
from sklearn import metrics
from sklearn.linear_model import Lasso, LassoCV, LinearRegression, Ridge, RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [None]:
# Separate the predictors (every column except the target) from the target column.
target_column = 'SalePrice'
X = data1.drop(columns=[target_column])
y = data1[target_column]

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Fit an ordinary least-squares model on the training split and display the fitted estimator.
regressor = LinearRegression().fit(X_train, y_train)
regressor


In [None]:
# Predict on the hold-out split and report R^2 on both splits.
# Previously only the training R^2 was displayed. That figure cannot reveal over-fitting and
# is not comparable with the hold-out score shown for the Lasso model later in the notebook.
y_pred_lin = regressor.predict(X_test)
pd.Series(
    {
        'train R2': regressor.score(X_train, y_train),
        'test R2': regressor.score(X_test, y_test),
    },
    name='lin_reg',
)

In [None]:
# Absolute prediction error of the linear model on the hold-out split.
abs_err_lin_reg = (y_test - y_pred_lin).abs()

In [None]:
# Summary statistics of the absolute errors (mean, median, SD, IQR, min, max).
# This starts the comparison table with the linear model as its first row.
from scipy.stats import iqr

lin_reg_summary = {
    'Mean of AbsErrors': abs_err_lin_reg.mean(),
    'Median of AbsErrors': abs_err_lin_reg.median(),
    'SD of AbsErrors': abs_err_lin_reg.std(),
    'IQR of AbsErrors': iqr(abs_err_lin_reg),
    'Min of AbsErrors': abs_err_lin_reg.min(),
    'Max of AbsErrors': abs_err_lin_reg.max(),
}
model_comp = pd.DataFrame(lin_reg_summary, index=['lin_reg'])
model_comp

In [None]:
#Actual vs. Prediction for the linear model on the hold-out split
plt.scatter(x=y_test, y=y_pred_lin)
plt.xlabel('Actual')
plt.ylabel('Prediction')
plt.title('Actual vs. Prediction')

#Add 45 degree line (perfect predictions would fall on it)
xp = np.linspace(y_test.min(), y_test.max(), 100)
# The 'k' (black) format string was dropped: it contradicted color='red', and matplotlib
# only resolves that conflict by warning and letting the keyword win.
plt.plot(xp, xp, alpha=0.9, linewidth=2, color='red')

# Model 2: Lasso Regression

In [None]:
# Candidate regularisation strengths: 100 log-spaced values from 10^1 down to 10^-3.
lambda_grid = np.logspace(1, -3, num=100)

In [None]:
#K-fold Cross Validation to Choose the Best Model
lassocv = LassoCV(alphas = lambda_grid, cv = 10, normalize = True)
lassocv.fit(X_train, y_train)
lassocv.alpha_

In [None]:
#Best Model Coefs:
lassocv.coef_


In [None]:
lassoreg = Lasso(alpha = lassocv.alpha_, normalize = True)
lassoreg.fit(X_train, y_train)
pred_lasso = lassoreg.predict(X_test)


In [None]:
# Absolute prediction error of the Lasso model on the hold-out split, and its mean.
abs_err_lasso = (y_test - pred_lasso).abs()
abs_err_lasso.mean()

In [None]:
#Absolute error mean, median, sd, IQR, max, min
# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat instead.
# Any existing 'Lasso Reg' row is dropped first so that re-running this cell replaces the
# row rather than duplicating it.
lasso_row = pd.DataFrame({'Mean of AbsErrors':    abs_err_lasso.mean(),
                          'Median of AbsErrors' : abs_err_lasso.median(),
                          'SD of AbsErrors' :     abs_err_lasso.std(),
                          'IQR of AbsErrors':     iqr(abs_err_lasso),
                          'Min of AbsErrors':     abs_err_lasso.min(),
                          'Max of AbsErrors':     abs_err_lasso.max()}, index = ['Lasso Reg'])
model_comp = pd.concat([model_comp.drop(index=lasso_row.index, errors='ignore'), lasso_row])

model_comp

In [None]:
# Actual vs. predicted SalePrice for the Lasso model on the hold-out split.
fig, ax = plt.subplots()
ax.scatter(x=y_test, y=pred_lasso)
ax.set_xlabel('Actual')
ax.set_ylabel('Prediction')
ax.set_title('Actual vs. Prediction')

# 45 degree reference line: perfect predictions would fall on it.
xp = np.linspace(y_test.min(), y_test.max(), 100)
ax.plot(xp, xp, alpha=0.9, linewidth=2, color='red')

In [None]:
# Score of the refitted Lasso model on the hold-out split.
lassoreg.score(X_test, y_test)

# Model 3: Ridge Regression

In [None]:
# Candidate regularisation strengths: 100 log-spaced values from 10^5 down to 10^-2.
lambda_grid = np.logspace(5, -2, num=100)

In [None]:
#K-fold Cross Validation to Choose the Best Model
ridgecv = RidgeCV(alphas = lambda_grid, cv = 10, normalize = True)
ridgecv.fit(X_train, y_train)
ridgecv.alpha_

In [None]:
#Best Model Coefs:
#ridgecv.coef_

In [None]:
ridgereg = Ridge(alpha = ridgecv.alpha_, normalize = True)
ridgereg.fit(X_train, y_train)
pred_ridge = ridgereg.predict(X_test)


In [None]:
# Absolute prediction error of the Ridge model on the hold-out split.
abs_err_ridge = (y_test - pred_ridge).abs()

In [None]:
#Absolute error mean, median, sd, IQR, max, min
# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat instead.
# Any existing 'Ridge Reg' row is dropped first so that re-running this cell replaces the
# row rather than duplicating it.
ridge_row = pd.DataFrame({'Mean of AbsErrors':    abs_err_ridge.mean(),
                          'Median of AbsErrors' : abs_err_ridge.median(),
                          'SD of AbsErrors' :     abs_err_ridge.std(),
                          'IQR of AbsErrors':     iqr(abs_err_ridge),
                          'Min of AbsErrors':     abs_err_ridge.min(),
                          'Max of AbsErrors':     abs_err_ridge.max()}, index = ['Ridge Reg'])
model_comp = pd.concat([model_comp.drop(index=ridge_row.index, errors='ignore'), ridge_row])

model_comp

In [None]:
#Actual vs. Prediction for the Ridge model on the hold-out split
plt.scatter(x=y_test, y=pred_ridge)
plt.xlabel('Actual')
plt.ylabel('Prediction')
plt.title('Actual vs. Prediction')

#Add 45 degree line (perfect predictions would fall on it)
xp = np.linspace(y_test.min(), y_test.max(), 100)
# The 'k' (black) format string was dropped: it contradicted color='red', and matplotlib
# only resolves that conflict by warning and letting the keyword win.
plt.plot(xp, xp, alpha=0.9, linewidth=2, color='red')

#### If you liked this notebook, please upvote.

#### If you have any questions, feel free to comment!

 #### Thank you.