#### Install Required Library Packages

In [None]:
!pip install pytest-warnings
!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install seaborn
!pip install scikit-learn
!pip freeze

In [2]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation / FutureWarning messages raised by the
# libraries used below.
import warnings
warnings.filterwarnings("ignore")

In [3]:
import numpy as np
import pandas as pd
# Remove pandas' display limits: every column and every row of a frame is rendered
# in cell outputs.
pd.set_option("display.max_columns",None)
pd.set_option("display.max_rows",None)

### Step 1 : Import and Inspect Dataset

In [None]:
# Load the dataset from "train.csv" (path relative to the notebook's working
# directory) and preview the first rows.
housing=pd.read_csv("train.csv")
housing.head()

In [5]:
# (rows, columns) of the freshly loaded frame
housing.shape

(1460, 81)

In [None]:
# Summary statistics of the frame
housing.describe()

In [None]:
# Column dtypes and non-null counts before any cleaning
housing.info()

In [None]:
# Percentage of missing values in each column
housing.isna().mean()*100

### Step 2 : Data Cleaning

In [9]:
# Categorical columns whose missing values are replaced by the literal label "None".
# NOTE(review): presumably NaN in these columns means "feature not present" —
# confirm against the data description.
cols=["Alley","BsmtQual","BsmtExposure","BsmtFinType1","BsmtFinType2","FireplaceQu",
      "GarageType","GarageFinish","GarageQual","GarageCond","PoolQC","Fence","MiscFeature"]
# Assign the filled columns back instead of calling fillna(inplace=True) on
# housing[i]: the in-place call operates on an intermediate object, is deprecated,
# and leaves `housing` unchanged when pandas Copy-on-Write is active.
housing[cols]=housing[cols].fillna("None")

In [None]:
# Re-check non-null counts after filling the "None" categories
housing.info()

In [11]:
#import Visualisation libs
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
#checking target variable "SalePrice" is normal distributed
# displot is a figure-level function: it creates its own figure, so a preceding
# plt.figure() would only emit an extra empty figure. Size it via height/aspect
# (height=6, aspect=1 gives the intended 6x6 inch plot).
sns.displot(housing["SalePrice"],height=6,aspect=1)
plt.show()

##### Target Variable SalePrice is Right-Skewed

In [None]:
# Skewness and kurtosis of the untransformed target (label typo fixed)
print("Skewness: ",housing['SalePrice'].skew())
print("Kurtosis: ",housing['SalePrice'].kurt())

##### We can observe that target variable has skewness greater than 1 and has high density around saleprice of 160000.
##### Hence We can do Data Transformation for this variable

In [14]:
#log transformation
# Replaces SalePrice with its natural log.
# NOTE(review): the column is overwritten, so re-running this cell applies the log
# a second time.
housing['SalePrice']=np.log(housing['SalePrice'])

In [None]:
#checking target variable "SalePrice" after log transformation
# displot is a figure-level function: it creates its own figure, so a preceding
# plt.figure() would only emit an extra empty figure. Size it via height/aspect
# (height=6, aspect=1 gives the intended 6x6 inch plot).
sns.displot(housing["SalePrice"],height=6,aspect=1)
plt.show()

In [None]:
# Skewness and kurtosis of the log-transformed target (label typo fixed)
print("Skewness: ",housing['SalePrice'].skew())
print("Kurtosis: ",housing['SalePrice'].kurt())

##### We can see an approximately normal distribution, and the skewness and kurtosis are reduced

- Drop Id column
- Convert 'MSSubClass','OverallQual','OverallCond' to object   datatype
- Convert 'LotFrontage','MasVnrArea' to numeric datatype

In [17]:
# The Id column is dropped from the frame in place
housing.drop(columns=["Id"],inplace=True)

In [18]:
# Store these three integer-coded columns with object dtype
coded_cols=['MSSubClass','OverallQual','OverallCond']
housing[coded_cols]=housing[coded_cols].astype('object')

In [19]:
# Force both columns to a numeric dtype; values that cannot be parsed become NaN
for numeric_col in ('LotFrontage','MasVnrArea'):
    housing[numeric_col]=pd.to_numeric(housing[numeric_col],errors='coerce')

In [None]:
# Confirm the dtype changes made above
housing.info()

In [None]:
# Names of the columns that still contain at least one missing value
has_missing = housing.isna().any()
null_cols = housing.columns[has_missing]
null_cols

In [22]:
# Impute the remaining gaps: column mean for int64/float64 columns, most frequent
# value (first mode) for everything else.
# The filled column is assigned back rather than using fillna(inplace=True) on
# housing[i]: the in-place call works on an intermediate object, is deprecated,
# and leaves `housing` unchanged when pandas Copy-on-Write is active.
# NOTE(review): the fill values are computed on the full dataset, i.e. before the
# train/test split further down.
for i in null_cols:
    if housing[i].dtype==np.float64 or housing[i].dtype==np.int64:
        fill_value=housing[i].mean()
    else:
        fill_value=housing[i].mode()[0]
    housing[i]=housing[i].fillna(fill_value)

In [None]:
# Missing-value count per column after imputation
housing.isna().sum()

##### We have handled Null Values

### Step 3 : Exploratory Data Analysis

In [None]:
# list of categorical columns
# Every object-dtype column; this includes MSSubClass, OverallQual and OverallCond,
# which were cast to object earlier.
cat_cols=housing.select_dtypes(include="object").columns
cat_cols

In [None]:
# list of numerical columns
# Only int64 and float64 columns are selected.
num_cols=housing.select_dtypes(include=["int64","float64"]).columns
num_cols

#### Univariate Analysis

In [None]:
# Numerical Columns
# Plotting box plots to visualise the distribution and check for any outliers
for i in num_cols:
    plt.figure(figsize=[8,5])
    # Pass the values by keyword: the meaning of the first positional argument of
    # sns.boxplot differs between seaborn releases (x vs. data), so x= keeps the
    # horizontal layout regardless of the installed version.
    sns.boxplot(x=housing[i])
    # Title each figure so it can be identified when skimming the long output.
    plt.title(i)
    plt.show()
    print("------------------------------------------------------------------------------------------------------------------")

##### We can see outliers in LotFrontage, LotArea, YearBuilt, MasVnrArea, BsmtFinSF1, etc.

In [None]:
# Categorical Columns
# For each categorical column, print the share of every category and draw the same
# shares as a pie chart
for i in cat_cols:
    shares=housing[i].value_counts(normalize=True)
    print(shares)
    plt.figure(figsize=[5,5])
    shares.plot.pie(labeldistance=None,autopct='%1.2f%%')
    plt.legend()
    plt.show()
    print("-------------------------------------------------------------------------------------------------------------")

##### We can look Percentage values in category of columns and infer that, 'MSZoning','Street','LandContour','Utilities','LotConfig',etc.. Columns are having more than 70% of a distribution in a single category

#### BiVariate/MultiVariate Analysis

In [None]:
# Bar chart of LotFrontage for each MSZoning category
sns.barplot(data=housing,x='MSZoning',y='LotFrontage')
plt.show()

In [None]:
# Bar chart of LotFrontage for each MSSubClass category
sns.barplot(data=housing,x='MSSubClass',y='LotFrontage')
plt.show()

In [None]:
# Bar chart of SalePrice for each HouseStyle, with bars split by Street
sns.barplot(data=housing,x='HouseStyle',y='SalePrice',hue='Street')
plt.show()

In [None]:
# Bar chart of SalePrice for each BldgType category
sns.barplot(data=housing,x='BldgType',y='SalePrice')
plt.show()

In [None]:
# Bar chart of SalePrice for each BsmtQual category
sns.barplot(data=housing,x='BsmtQual',y='SalePrice')
plt.show()

##### Conclusion
- We can see that RL has the highest lot frontage and RM has the least
- We can see that "2-Story 1946 & Newer" has the highest lot frontage and "PUD - Multilevel - incl. split lev/foyer" has the least
- The SalePrice is not showing much variance with respect to Style of dwelling
- The SalePrice is almost same for all the Building types and the basement quality. So there is no significant pattern

In [None]:
# Age of the property = YrSold minus YearBuilt
housing['Age']=housing['YrSold'].sub(housing['YearBuilt'])
housing['Age'].head()

In [34]:
# Remove the two year columns that were used to derive Age
year_cols=["YrSold","YearBuilt"]
housing.drop(columns=year_cols,inplace=True)

In [None]:
# Preview the frame after adding Age and dropping the year columns
housing.head()

##### Correlation between Numerical Columns

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns
corr_matrix=housing.corr(numeric_only=True)
plt.figure(figsize=[25,25])
sns.heatmap(corr_matrix,annot=True,cmap='BuPu')
plt.title("Correlation of Numerical Values")
plt.show()

##### Get top 10 correlated columns

In [None]:
# Number of columns to keep (the k rows with the largest correlation to SalePrice)
k=10
plt.figure(figsize=[15,15])
# Row labels of the correlation matrix with the k largest values in its "SalePrice" column
cols=housing.corr(numeric_only=True).nlargest(k,"SalePrice").index
# Correlation matrix restricted to those columns (transpose so each row is one variable)
cm=np.corrcoef(housing[cols].values.T)
sns.heatmap(cm,annot=True,square=True,fmt='.2f',cbar=True,annot_kws={'size':10},
            yticklabels=cols.values,xticklabels=cols.values)
plt.show()

##### We Can see that
- GarageArea and GarageCars are highly correlated with coeff of 0.88
- GrLivArea and TotRmsAbvGrd are highly correlated with coeff of 0.83
- TotalBsmtSF and 1stFlrSF are highly correlated with coeff of 0.82

#### Pairplot for Numeric Columns

In [None]:
# Pairwise scatter plots for the target and a handful of selected features
cols=['SalePrice','OverallQual','GrLivArea','FullBath','GarageCars','TotalBsmtSF',"Age"]
# pairplot is a figure-level function that builds its own figure, so no
# plt.figure() call is made here (it would only produce an extra empty figure).
sns.pairplot(housing[cols])
plt.show()

We can see that Age has negative correlation with target variable SalePrice and TotalBsmtSF,GrLivArea have positive correlation with SalePrice

### Step 4 : Data Preparation

##### Dummy Encoding

In [39]:
# Split the frame by dtype: int64/float64 columns vs. object columns
housing_num=housing.select_dtypes(include=['int64','float64'])
housing_cat=housing.select_dtypes(include=['object'])

In [None]:
# Preview only the first rows: display.max_rows is set to None above, so showing
# the bare frame would render every row.
housing_cat.head()

In [None]:
# One-hot encode the categorical columns as integer 0/1 columns, dropping the first
# level of each feature.
housing_cat_dm=pd.get_dummies(housing_cat,drop_first=True,dtype=int)
# Preview only the first rows: display.max_rows is set to None above, so showing
# the bare frame would render every row.
housing_cat_dm.head()

In [None]:
# Place the numeric columns and the dummy columns side by side
frames=[housing_num,housing_cat_dm]
house=pd.concat(frames,axis=1)
house.head()

In [None]:
# (rows, columns) of the combined modelling frame
house.shape

In [None]:
# Separate the target (SalePrice) from the predictors; both are independent copies
y=house['SalePrice'].copy()
x=house.drop(columns=['SalePrice']).copy()
x.head()

In [None]:
# First values of the (log-transformed) target
y.head()

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [47]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the split is repeatable
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)

In [None]:
# (rows, columns) of the training features
x_train.shape

In [None]:
# Number of training targets
y_train.shape

#### Scaling the dataset with StandardScaler

In [50]:
# int64/float64 columns of x_train; this is the list handed to the scaler.
# NOTE(review): the dummy columns were created with dtype=int, so they are presumably
# selected here as well — confirm that scaling them is intended.
nums_cols=list(x_train.select_dtypes(include=['int64','float64']).columns)

In [51]:
# Standardise the selected columns. The scaler learns its mean and standard
# deviation from the training set only.
scaler=StandardScaler()
x_train[nums_cols]=scaler.fit_transform(x_train[nums_cols])
# Apply the training-set statistics to the test set with transform(). Calling
# fit_transform() here would re-fit the scaler on the test data, so the test
# features would be scaled differently from the data the model was trained on.
x_test[nums_cols]=scaler.transform(x_test[nums_cols])

Building a function to calculate evaluation metrics

In [52]:
def eval_metrics(y_train,y_train_pred,y_test,y_pred):
    """Print the R^2 score and the RMSE for the train and the test predictions.

    Parameters
    ----------
    y_train, y_train_pred : true and predicted targets of the training set.
    y_test, y_pred : true and predicted targets of the test set.

    Returns nothing; the four figures are printed with two decimals.
    """
    def _rmse(actual,predicted):
        # root of the mean squared error
        return mean_squared_error(actual,predicted) ** 0.5

    # goodness of fit on both splits
    r2_train=r2_score(y_train,y_train_pred)
    r2_test=r2_score(y_test,y_pred)
    print("r2 score (train) =","%.2f" %r2_train)
    print("r2 score (test) =","%.2f" %r2_test)

    # error magnitude on both splits
    rmse_train=_rmse(y_train,y_train_pred)
    rmse_test=_rmse(y_test,y_pred)
    print("RMSE(train) = ",'%.2f' %rmse_train)
    print("RMSE(test) = ",'%.2f' %rmse_test)

### Step 5 : Building ML Model

In [53]:
# import ML Libs
import sklearn
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.linear_model import Ridge,Lasso
from sklearn.model_selection import GridSearchCV

In [None]:
# Grid-search the Ridge regularisation strength alpha ('lambda') with 5-fold
# cross-validation, scored by negative mean absolute error
alpha_grid=[0.0001,0.001,0.01,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0,
            2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10,20,50,100,500,1000]
params={'alpha':alpha_grid}
ridge=Ridge()
ridgeCV=GridSearchCV(
    estimator=ridge,
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=5,
    return_train_score=True,
    verbose=1,
    n_jobs=-1,
)
ridgeCV.fit(x_train,y_train)

In [None]:
# Alpha with the best mean cross-validated score
ridgeCV.best_params_

In [None]:
# Raw cross-validation results (a dict of arrays, one entry per alpha)
ridgeCV.cv_results_

In [57]:
# Build the final Ridge model with the alpha selected by the grid search rather than
# a hand-copied constant, so the model stays in sync with the search whenever the
# data or preprocessing changes.
ridge=Ridge(alpha=ridgeCV.best_params_['alpha'])

In [None]:
# Fit the final Ridge model on the full training set
ridge.fit(x_train,y_train)

In [None]:
# Fitted Ridge coefficients, one per feature column
ridge.coef_

In [60]:
# Ridge predictions for the training and the test features
y_train_pred = ridge.predict(x_train)
y_pred = ridge.predict(x_test)

In [None]:
# R^2 and RMSE of the Ridge model on both splits
eval_metrics(y_train,y_train_pred,y_test,y_pred)

In [None]:
# Cross-validation results as a DataFrame (one row per alpha) for plotting
ridgeCV_res=pd.DataFrame(ridgeCV.cv_results_)
ridgeCV_res.head()

In [None]:
# Mean train / validation score of the grid search for every alpha (log-scaled x axis).
# The search was scored with 'neg_mean_absolute_error', so the y axis shows negative
# MAE (closer to 0 is better), not R2.
plt.plot(ridgeCV_res['param_alpha'],ridgeCV_res['mean_train_score'],label='train')
plt.plot(ridgeCV_res['param_alpha'],ridgeCV_res['mean_test_score'],label='test')
plt.xlabel('alpha')
plt.ylabel('Negative mean absolute error')
plt.xscale('log')
plt.legend()
plt.show()

In [None]:
# Grid-search the Lasso regularisation strength alpha ('lambda') with 5-fold
# cross-validation, scored by negative mean absolute error
alpha_grid=[0.0001,0.001,0.01,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0,
            2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10,20,50,100,500,1000]
params={'alpha':alpha_grid}
lasso=Lasso()
lassoCV=GridSearchCV(
    estimator=lasso,
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=5,
    return_train_score=True,
    verbose=1,
    n_jobs=-1,
)
lassoCV.fit(x_train,y_train)

In [None]:
# Alpha with the best mean cross-validated score
lassoCV.best_params_

In [66]:
# Build the final Lasso model with the alpha selected by the grid search. The
# previous hard-coded 0.0001 disagreed with the 0.001 reported in the notebook's
# closing summary; reading the value from the search removes that ambiguity.
lasso=Lasso(alpha=lassoCV.best_params_['alpha'])

In [None]:
# Fit the final Lasso model on the full training set
lasso.fit(x_train,y_train)

In [None]:
# Fitted Lasso coefficients, one per feature column
lasso.coef_

In [None]:
# Lasso predictions for both splits, followed by their R^2 and RMSE
y_train_pred1 = lasso.predict(x_train)
y_pred1 = lasso.predict(x_test)
eval_metrics(y_train,y_train_pred1,y_test,y_pred1)

In [None]:
# Cross-validation results as a DataFrame (one row per alpha) for plotting
lassoCV_res=pd.DataFrame(lassoCV.cv_results_)
lassoCV_res.head()

In [None]:
# Mean train / validation score of the grid search for every alpha (log-scaled x axis).
# The search was scored with 'neg_mean_absolute_error', so the y axis shows negative
# MAE (closer to 0 is better), not R2.
plt.plot(lassoCV_res['param_alpha'],lassoCV_res['mean_train_score'],label='train')
plt.plot(lassoCV_res['param_alpha'],lassoCV_res['mean_test_score'],label='test')
plt.xlabel('alpha')
plt.ylabel('Negative mean absolute error')
plt.xscale('log')
plt.legend()
plt.show()

##### Feature Extraction/Elimination

In [None]:
# Coefficient table: one row per feature (indexed by feature name), one column per model.
# The former `betas.rows=x.columns` line was removed: it only attached a stray
# attribute to the DataFrame object and had no effect on its index or contents.
betas=pd.DataFrame(index=x.columns)
#creating columns for Ridge and Lasso coefficients against each feature
betas['Ridge']=ridge.coef_
betas['Lasso']=lasso.coef_
betas

In [None]:
# Features whose Lasso coefficient is exactly zero (dropped by the model)
is_zero=betas['Lasso']==0
lasso_cols_removed=betas.index[is_zero].tolist()
print(lasso_cols_removed)

In [None]:
# Features whose Lasso coefficient is non-zero (kept by the model)
is_nonzero=betas['Lasso']!=0
lasso_cols_selected=betas.index[is_nonzero].tolist()
print(lasso_cols_selected)

In [None]:
# How many features Lasso removed vs. kept
print(len(lasso_cols_removed))
print(len(lasso_cols_selected))

##### Top 10 features significant in predicting the value of a house, both according to Ridge and Lasso Model

In [None]:
#view the top 10 coefficients of Ridge regression in descending order
# Sorted by signed value, so only the largest positive coefficients are shown.
betas['Ridge'].sort_values(ascending=False)[:10]

In [None]:
#we have to take inverse log of betas to interpret the ridge coefficients in terms of target variable
# exp() undoes the log applied to SalePrice, turning each coefficient into a
# multiplicative factor on the price.
ridge_coeffs=np.exp(betas['Ridge'])
ridge_coeffs.sort_values(ascending=False)[:10]

In [None]:
#view the top 10 coefficients of Lasso regression in descending order
# Sorted by signed value, so only the largest positive coefficients are shown.
betas['Lasso'].sort_values(ascending=False)[:10]

In [None]:
#we have to take inverse log of betas to interpret the lasso coefficients in terms of target variable
# exp() undoes the log applied to SalePrice, turning each coefficient into a
# multiplicative factor on the price. The result is stored under its own name so it
# no longer overwrites `ridge_coeffs` with Lasso values.
lasso_coeffs=np.exp(betas['Lasso'])
lasso_coeffs.sort_values(ascending=False)[:10]

##### Few Inferences are :- 
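- The price of the house increases by a factor of about 1.11 for each one-standard-deviation increase in GrLivArea (the features were standardised and the target was log-transformed, so exp(coefficient) is a multiplicative effect)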
- The price of house can increase by 1.08 times if the finish of the house is very good
- If the house has centralised AC the price can increase by 1.08 times
- If the basement condition is typical then the house price may increase upto 1.06 times
- The Price of the house may increase if the neighbourhood has Crawford, Stone Brook and Northridge heights as Physical locations

### Optimal Value of Lambda for Ridge Regression = 9
### Optimal Value of Lambda for Lasso Regression = 0.001