In [None]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by later cells.
warnings.filterwarnings("ignore")

In [None]:
import numpy as np
import pandas as pd
# Remove pandas' display limits so that all columns and all rows are rendered.
# NOTE(review): with max_rows=None, displaying a whole DataFrame prints every row.
pd.set_option("display.max_columns",None)
pd.set_option("display.max_rows",None)

## Step 1: Import and Inspect Dataset

In [None]:
# Load the data; "train.csv" is resolved relative to the current working directory.
housing = pd.read_csv("train.csv")
housing.head()

In [None]:
# (number of rows, number of columns)
housing.shape

In [None]:
# Summary statistics as produced by pandas' default describe()
housing.describe()

In [None]:
# Column dtypes and non-null counts
housing.info()

In [None]:
# Percentage of missing values per column
housing.isnull().sum()/housing.shape[0] * 100

## Step 2: Data Cleaning

In [None]:
# Categorical columns whose missing entries are replaced by the literal category "None".
# Presumably NaN in these columns means the feature is absent (no alley, no pool, ...) -
# TODO confirm against the data description.
cols = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']
# Assign the filled columns back instead of calling fillna(inplace=True) on housing[i]:
# the in-place call operates on a temporary selection and is not guaranteed to update
# `housing` (it does not under pandas Copy-on-Write).
housing[cols] = housing[cols].fillna("None")

In [None]:
# Re-check the non-null counts after filling the categorical columns
housing.info()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Distribution of SalePrice: density histogram with a KDE overlay.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its replacement.
plt.figure(figsize =[6,6])
sns.histplot(housing['SalePrice'], kde=True, stat='density')
plt.title('Distribution of SalePrice')
plt.show()

### We can see that the target variable SalePrice is right skewed

In [None]:
# Report skewness and kurtosis of SalePrice
for stat_label, stat_method in (("Skewness: ", "skew"), ("Kurtosis: ", "kurt")):
    print(stat_label, getattr(housing['SalePrice'], stat_method)())

##### We can observe that the target variable has skewness greater than 1 and a high density around a SalePrice of 160000

##### Hence, we can do data transformation for this variable

In [None]:
# Log transformation of the target, written back into the same column.
# Running this cell more than once applies the log repeatedly.
housing['SalePrice'] = housing['SalePrice'].pipe(np.log)

In [None]:
# Distribution of the log-transformed SalePrice: density histogram with a KDE overlay.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its replacement.
plt.figure(figsize =[6,6])
sns.histplot(housing['SalePrice'], kde=True, stat='density')
plt.title('Distribution of log(SalePrice)')
plt.show()

In [None]:
# Skewness and kurtosis of the (now transformed) target
sale_price = housing['SalePrice']
skewness = sale_price.skew()
print("Skewness: ", skewness)
kurtosis = sale_price.kurt()
print("Kurtosis: ", kurtosis)

### We can now see that the skewness and kurtosis are greatly reduced

- Drop ID Column
- Convert 'MSSubClass', 'OverallQual', 'OverallCond' to object datatype
- Convert 'LotFrontage', 'MasVnrArea' to Numeric Data Types

In [None]:
# Remove the Id column in place (re-running this cell fails once the column is gone)
housing.drop(columns="Id", inplace=True)

In [None]:
# Store these three columns as object dtype so they are handled as categories later on
code_cols = ['MSSubClass', 'OverallQual', 'OverallCond']
housing[code_cols] = housing[code_cols].astype('object')

In [None]:
# Force both columns to numeric; values that cannot be parsed become NaN
for numeric_col in ('LotFrontage', 'MasVnrArea'):
    housing[numeric_col] = pd.to_numeric(housing[numeric_col], errors = 'coerce')

In [None]:
# Confirm the dtypes after the conversions above
housing.info()

In [None]:
# Names of the columns that still contain at least one missing value
has_missing = housing.isnull().any()
null_cols = housing.columns[has_missing]
null_cols

In [None]:
# Impute the remaining gaps: column mean for float64/int64 columns, most frequent
# value for every other dtype.
# The result is assigned back to the column rather than using fillna(inplace=True) on
# housing[i]: the in-place call works on a temporary selection and is not guaranteed
# to update `housing` (it does not under pandas Copy-on-Write).
# NOTE(review): the statistics are computed on the full dataset, i.e. before the
# train/test split performed later in the notebook.
for col in null_cols:
    column = housing[col]
    if column.dtype == np.float64 or column.dtype == np.int64:
        fill_value = column.mean()
    else:
        fill_value = column.mode()[0]
    housing[col] = column.fillna(fill_value)

In [None]:
# Missing-value count per column after imputation
housing.isnull().sum()

## Step 3: Exploratory Data Analysis on Dataset

In [None]:
# Names of the object-dtype (categorical) columns
object_frame = housing.select_dtypes(include=['object'])
cat_cols = object_frame.columns
cat_cols

In [None]:
# Names of the int64/float64 (numerical) columns
numeric_frame = housing.select_dtypes(include=['int64', 'float64'])
num_cols = numeric_frame.columns
num_cols

## Univariate analysis

In [None]:
# Numerical columns: one horizontal boxplot per column to inspect spread and outliers.
for col in num_cols:
    fig, ax = plt.subplots(figsize=(8, 5))
    # Pass the values through x= explicitly: the first positional argument of
    # sns.boxplot is interpreted differently across seaborn releases.
    sns.boxplot(x=housing[col], ax=ax)
    ax.set_title(col)
    plt.show()
    # Release the figure; one figure per column would otherwise accumulate in memory.
    plt.close(fig)

#### We can see outliers in LotFrontage, LotArea, YearBuilt, MasVnrArea, BsmtFinSF1, BsmtUnfSF, etc.

In [None]:
# Categorical columns: print the share of every level, then draw the same shares as a pie chart
for col in cat_cols:
    level_shares = housing[col].value_counts(normalize=True)
    print(level_shares)
    plt.figure(figsize=[5,5])
    # labeldistance=None hides the wedge labels; the legend added below identifies the levels
    level_shares.plot.pie(labeldistance = None, autopct = '%1.2f%%')
    plt.legend()
    plt.show()
    print("-----------------------------------------------------------------------------")

##### Looking at the percentage of values per category, we can infer that columns such as 'MSZoning', 'Street', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'RoofStyle', etc. have more than 70% of their distribution in a single category

##### Bivariate/ Multivariate Analysis on the Dataset

In [None]:
# Plot of MSZoning vs LotFrontage
sns.barplot(x='MSZoning', y='LotFrontage', data= housing)
# plt.show must be called; a bare `plt.show` only references the function object.
plt.show()

In [None]:
# Plot of MSSubClass vs LotFrontage
sns.barplot(x='MSSubClass', y='LotFrontage', data= housing)
# plt.show must be called; a bare `plt.show` only references the function object.
plt.show()

In [None]:
# Plot of HouseStyle vs SalePrice, split by Street
sns.barplot(x='HouseStyle', y='SalePrice',hue= 'Street', data = housing)
# plt.show must be called; a bare `plt.show` only references the function object.
plt.show()

In [None]:
# BldgType on the x axis, SalePrice on the y axis
fig, ax = plt.subplots()
sns.barplot(data=housing, x='BldgType', y='SalePrice', ax=ax)
plt.show()

In [None]:
# BsmtQual on the x axis, SalePrice on the y axis
fig, ax = plt.subplots()
sns.barplot(data=housing, x='BsmtQual', y='SalePrice', ax=ax)
plt.show()

In [None]:
# Age of the property = year sold minus year built
housing["Age"] = housing["YrSold"].sub(housing["YearBuilt"])
housing["Age"].head()

In [None]:
# YrSold and YearBuilt are removed in place now that Age has been derived from them
year_cols = ['YrSold', 'YearBuilt']
housing.drop(columns=year_cols, inplace=True)

In [None]:
# Preview the frame after adding Age and dropping the year columns
housing.head()

In [None]:
# Correlation heatmap of the numeric columns.
# The numeric columns are selected explicitly because DataFrame.corr() raises on
# object columns in pandas >= 2.0 (older versions dropped them silently).
plt.figure(figsize = [25,25])
sns.heatmap(housing.select_dtypes(include='number').corr(),annot=True, cmap= 'BuPu')
plt.title("Correlation of Numeric Values")
plt.show()

In [None]:
# Heatmap restricted to the k numeric columns most correlated with SalePrice
# (SalePrice itself is one of the k, since its self-correlation is 1).
k = 10
plt.figure(figsize=[15,15])
# Restrict to numeric columns first: DataFrame.corr() raises on object columns in pandas >= 2.0.
numeric_corr = housing.select_dtypes(include='number').corr()
cols = numeric_corr.nlargest(k,"SalePrice").index
# Recompute the correlation matrix for just the selected columns (variables as rows).
cm = np.corrcoef(housing[cols].values.T)
sns.heatmap(cm,annot=True,square=True, fmt='.2f', cbar = True, annot_kws={'size':10},yticklabels=cols.values,xticklabels=cols.values)
plt.show()

##### Pairplot for Numerical Columns

In [None]:
# Pairwise scatter plots for the selected columns
cols = ["SalePrice","OverallQual","GrLivArea","GarageCars","TotalBsmtSF","Age"]
# sns.pairplot creates its own figure, so a preceding plt.figure(...) call only
# produced an additional empty figure; it has been removed.
sns.pairplot(housing[cols])
plt.show()

## Step 4: Data Preparation

In [None]:
# Split the frame by dtype: int64/float64 columns on one side, object columns on the other
numeric_dtypes = ['int64', 'float64']
housing_num = housing.select_dtypes(include=numeric_dtypes)
housing_cat = housing.select_dtypes(include=['object'])

In [None]:
housing_cat

In [None]:
housing_cat_dm = pd.get_dummies(housing_cat, drop_first=True, dtype=int)

In [None]:
housing_cat_dm

In [None]:
# Model frame: numeric columns side by side with the dummy-encoded categorical columns
house = pd.concat([housing_num,housing_cat_dm], axis=1)

In [None]:
house.head()

In [None]:
# (rows, columns) after encoding
house.shape

In [None]:
# Separate the target (SalePrice) from the feature matrix; both are independent copies
target_name = "SalePrice"
y = house[target_name].copy()
X = house.drop(columns=[target_name]).copy()

In [None]:
# Preview of the feature matrix
X.head()

In [None]:
# Preview of the target
y.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# 70/30 train/test split; random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42) 

In [None]:
X_train.shape

In [None]:
y_train.shape

##### scaling the dataset with Standard Scaler

In [None]:
# Columns to standardise: every int64/float64 column of the training features.
# NOTE(review): the dummy columns were created with dtype=int and may match this
# filter as well - confirm whether scaling them is intended.
num_cols = list(X_train.select_dtypes(include=['int64','float64']).columns)

In [None]:
# Standardise the selected columns.
# The scaler is fitted on the training split only; the test split is transformed with
# the training mean/std. Calling fit_transform on the test split would refit the scaler
# on test data, putting the two splits on different scales and leaking test statistics.
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

##### Building a function to calculate evaluation metrics

In [None]:
def eval_metrics(y_train, y_train_pred, y_test, y_pred):
    """Print R2 and RMSE, rounded to two decimals, for train and test predictions.

    Each (true, predicted) pair is passed to r2_score and mean_squared_error;
    RMSE is the square root of the mean squared error. Nothing is returned.
    """
    # R2 on both splits
    print("r2 score (train) = ", f"{r2_score(y_train, y_train_pred):.2f}")
    print("r2 score (test) = ", f"{r2_score(y_test, y_pred):.2f}")

    # RMSE on both splits
    train_error = mean_squared_error(y_train, y_train_pred)
    test_error = mean_squared_error(y_test, y_pred)
    print("RMSE(Train) = ", f"{train_error ** 0.5:.2f}")
    print("RMSE(Test) = ", f"{test_error ** 0.5:.2f}")

## Step 5: Build ML Model

In [None]:
# Import ML Libraries
import sklearn
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV

In [None]:
# Ridge regression: 5-fold grid search over the regularisation strength alpha,
# scored by negative mean absolute error
params = {
    'alpha': [
        0.0001, 0.001, 0.01, 0.05,
        0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
        2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0,
        10, 20, 50, 100, 500, 1000,
    ]
}
ridge = Ridge()
ridgeCV = GridSearchCV(
    estimator=ridge,
    param_grid=params,
    scoring="neg_mean_absolute_error",
    cv=5,
    return_train_score=True,
    verbose=1,
    n_jobs=-1,
)
ridgeCV.fit(X_train, y_train)

In [None]:
# Alpha selected by the grid search
ridgeCV.best_params_

In [None]:
# Raw cross-validation results (the same data is shown as a DataFrame further down)
ridgeCV.cv_results_

In [None]:
# NOTE(review): alpha is hard-coded here - presumably copied from best_params_; confirm
# it still matches the grid-search result when the data or the grid changes.
ridge = Ridge(alpha=9)

In [None]:
ridge.fit(X_train, y_train)

In [None]:
ridge.coef_

In [None]:
# Predictions on both splits
y_train_pred = ridge.predict(X_train)
y_pred = ridge.predict(X_test)

In [None]:
# R2 and RMSE for the ridge model
eval_metrics(y_train, y_train_pred, y_test, y_pred)

In [None]:
# Cross-validation results as a DataFrame (used for the plot below)
ridgeCV_res = pd.DataFrame(ridgeCV.cv_results_)
ridgeCV_res.head()

In [None]:
# Mean cross-validation score against alpha (log x axis) for the ridge grid search.
# The grid search is scored with neg_mean_absolute_error, so the y axis shows
# negative MAE - not R2 as the previous label stated.
# param_alpha is stored as an object column; cast it to float for plotting.
ridge_alphas = ridgeCV_res['param_alpha'].astype(float)
plt.plot(ridge_alphas, ridgeCV_res['mean_train_score'], label = 'train')
plt.plot(ridge_alphas, ridgeCV_res['mean_test_score'], label = 'test')
plt.xlabel('alpha')
plt.ylabel('Negative MAE')
plt.xscale('log')
plt.legend()
plt.show()

In [None]:
# Lasso regression: 5-fold grid search over the regularisation strength alpha,
# scored by negative mean absolute error
params = {
    'alpha': [
        0.0001, 0.001, 0.01, 0.05,
        0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
        2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0,
        10, 20, 50, 100, 500, 1000,
    ]
}
lasso = Lasso()
lassoCV = GridSearchCV(
    estimator=lasso,
    param_grid=params,
    scoring="neg_mean_absolute_error",
    cv=5,
    return_train_score=True,
    verbose=1,
    n_jobs=-1,
)
lassoCV.fit(X_train, y_train)

In [None]:
# Alpha selected by the grid search
lassoCV.best_params_

In [None]:
# NOTE(review): alpha is hard-coded here - presumably copied from best_params_; confirm
# it still matches the grid-search result when the data or the grid changes.
lasso = Lasso(alpha = 0.0001)

In [None]:
lasso.fit(X_train, y_train)

In [None]:
lasso.coef_

In [None]:
# Predictions on both splits
y_train_pred1 = lasso.predict(X_train)
y_pred1 = lasso.predict(X_test)

In [None]:
# R2 and RMSE for the lasso model
eval_metrics(y_train, y_train_pred1, y_test, y_pred1)

In [None]:
# Cross-validation results as a DataFrame (used for the plot below)
lassoCV_res = pd.DataFrame(lassoCV.cv_results_)
lassoCV_res.head()

In [None]:
# Mean cross-validation score against alpha (log x axis) for the lasso grid search.
# The grid search is scored with neg_mean_absolute_error, so the y axis shows
# negative MAE - not R2 as the previous label stated.
# param_alpha is stored as an object column; cast it to float for plotting.
lasso_alphas = lassoCV_res['param_alpha'].astype(float)
plt.plot(lasso_alphas, lassoCV_res['mean_train_score'], label = 'train')
plt.plot(lasso_alphas, lassoCV_res['mean_test_score'], label = 'test')
plt.xlabel('alpha')
plt.ylabel('Negative MAE')
plt.xscale('log')
plt.legend()
plt.show()

### Feature Extraction/Elimination

In [None]:
# Coefficient table: one row per feature, one column per fitted model.
# The former `betas.rows = X.columns` line was dropped: it only set an ad-hoc
# attribute on the DataFrame object and had no effect on the table's index.
betas = pd.DataFrame(
    {'Ridge': ridge.coef_, 'Lasso': lasso.coef_},
    index=X.columns,
)
betas

In [None]:
# Features whose Lasso coefficient is exactly zero, i.e. removed by Lasso
lasso_is_zero = betas['Lasso'].eq(0)
lasso_cols_removed = betas.index[lasso_is_zero].tolist()
print(lasso_cols_removed)

In [None]:
# Features whose Lasso coefficient is non-zero, i.e. kept by Lasso
lasso_is_nonzero = betas['Lasso'].ne(0)
lasso_cols_selected = betas.index[lasso_is_nonzero].tolist()
print(lasso_cols_selected)

In [None]:
print(len(lasso_cols_removed)) # number of features removed by Lasso (179 according to the original note - verify on re-run)
print(len(lasso_cols_selected)) # number of features selected by Lasso (107 according to the original note - verify on re-run)

##### Top 10 features significant in predicting the value of a house, both according to Ridge model and Lasso model

In [None]:
# Ten largest Ridge coefficients by signed value, largest first
ridge_sorted = betas['Ridge'].sort_values(ascending=False)
ridge_sorted.head(10)

In [None]:
# Ten largest Lasso coefficients by signed value, largest first
lasso_sorted = betas['Lasso'].sort_values(ascending=False)
lasso_sorted.head(10)

In [None]:
# The target was log-transformed, so the Lasso coefficients are exponentiated to read
# them as multiplicative factors on SalePrice
lasso_coeffs = betas['Lasso'].pipe(np.exp)
lasso_coeffs.sort_values(ascending=False).head(10)