# Project: House Prices

#### Goal: Apply analytics to predict the sale price of each house. For each Id in the test set, the model must predict the value of the SalePrice variable.

## Initial Setup

In [None]:
# Required libraries
%pip install numpy
%pip install pandas
%pip install matplotlib.pyplot
%pip install statsmodels.api

    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns

In [None]:
#Read data from file
data=pd.read_csv('train.csv')
data.head(5)

In [None]:
data.shape

# Exploratory Data Analysis Check List

#### 1: Understand data collection process

#### 2: Document data set description (meta data)

#### 3: Check for missing values

#### 4: Univariate data analysis

#### 5: Bivariate data analysis

#### 6: Inferential Statistics (optional)

# Data understanding- Phase1 

### 1: Understand data collection process

#### Ref:Anna Montoya, DataCanary. (2016). House Prices - Advanced Regression Techniques. Kaggle. https://kaggle.com/competitions/house-prices-advanced-regression-techniques


### 2: Document Data Set Description (Meta Data)

In [None]:
data.info()

### 3: Check for Missing Values

In [None]:
#Step 1: Determine the type of MVs
#Know the cause
np.sum(data.isnull())

In [None]:
# Step 2: Determine the extent of MVs
# Per-column missing-value count ('freq') and its share of all rows ('pct')
mvs_summary = data.isnull().sum().to_frame(name='freq')
mvs_summary['pct'] = (mvs_summary['freq'] / len(data) * 100).round(1)
mvs_summary.sort_values(by='pct', ascending=False)

In [None]:
# Summary of MVs for each case (row)
# Only the real variables are counted: 'Id' and the two helper columns created
# in this cell are excluded by name, so the percentage uses the same
# denominator on the first run and on every re-run.
feature_cols = [col for col in data.columns if col not in ('Id', 'mvs', 'mvs_pct')]
data.loc[:, 'mvs'] = data[feature_cols].isnull().sum(axis=1)
data.loc[:, 'mvs_pct'] = (data['mvs'] / len(feature_cols) * 100).round(1)
data.sort_values(by='mvs', ascending=False)

##### Decision: Modify entries that are recorded as null but still carry information (see the data description).

In [None]:
data[['MasVnrArea','MasVnrType']]

In [None]:
 # Modifying'MasVnrType'
data.loc[data['MasVnrArea']==0].shape[0]

In [None]:
data.loc[data['MasVnrArea'] == 0,'MasVnrType'] ='none'
np.sum(data['MasVnrType'].isnull())

In [None]:
np.sum(data['MasVnrArea'].isnull())

In [None]:
data.loc[data['MasVnrArea'].isnull()]

In [None]:
data.loc[data['MasVnrType'].isnull()]

In [None]:
np.sum(data['MasVnrType']=='none')

In [None]:
 data[['MasVnrArea','MasVnrType']]

In [None]:
# Modifying'FireplaceQu'
data.loc[data['Fireplaces']==0,'FireplaceQu']='NA'

In [None]:
np.sum(data['Fireplaces']==0)

In [None]:
np.sum(data['FireplaceQu']=='NA')

In [None]:
np.sum(data['FireplaceQu'].isnull())

In [None]:
# Modifying'PoolQC'
data.loc[data['PoolArea']==0,'PoolQC']='NA'

In [None]:
np.sum(data['PoolArea']==0)

In [None]:
np.sum(data['PoolQC']=='NA')

In [None]:
np.sum(data['PoolQC'].isnull())

In [None]:
# data[['PoolArea', 'PoolQC']]

In [None]:
np.sum(data['PoolArea']!=0)

In [None]:
np.sum(data['PoolQC']!='NA')

In [None]:
data.loc[(data['PoolArea']!= 0)]

In [None]:
data.loc[(data['PoolArea']!= 0)&(data['PoolQC']=='NA'),'PoolQC']=np.nan

In [None]:
data.loc[data['PoolArea' ]!= 0]

In [None]:
data.loc[1298, 'PoolQC']=np.nan

In [None]:
np.sum(data['PoolQC'].isnull())

In [None]:
# data['PoolQC']

In [None]:
data.loc[data['PoolQC'].isnull()]

In [None]:
data.loc[1298,'PoolQC']

In [None]:
# Modifying'Fence'
np.sum(data['Fence'].isnull())

In [None]:
data.loc[data['Fence'].isnull(),'Fence']='NA'

In [None]:
np.sum(data['Fence'].isnull())

In [None]:
# data['Fence']

In [None]:
# Modifying 'MiscFeature'
np.sum(data['MiscFeature'].isnull())

In [None]:
data.loc[data['MiscFeature'].isnull(),'MiscFeature']='NA'

In [None]:
np.sum(data['MiscFeature'].isnull())

In [None]:
# Modifying 'GarageType','GarageYrBlt','GarageFinish','GarageQual'and 'GarageCond'
np.sum(data['GarageArea']==0)

In [None]:
data.loc[data['GarageArea']==0,['GarageType','GarageFinish','GarageQual','GarageCond']]='NA'
data.loc[data['GarageArea']==0,['GarageYrBlt']]=0
data[['GarageArea','GarageType','GarageYrBlt','GarageFinish','GarageQual','GarageCond']]

In [None]:
np.sum(data['GarageType'].isnull())

In [None]:
np.sum(data['GarageYrBlt'].isnull())

In [None]:
np.sum(data['GarageFinish'].isnull())

In [None]:
np.sum(data['GarageQual'].isnull())

In [None]:
np.sum(data['GarageCond'].isnull())

In [None]:
# Modifying'BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2'
np.sum(data['TotalBsmtSF']==0)

In [None]:
data.loc[data['TotalBsmtSF']==0,['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2']]='NA'
data[['TotalBsmtSF','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2']]

In [None]:
np.sum(data['BsmtQual'].isnull())

In [None]:
np.sum(data['BsmtCond'].isnull())

In [None]:
np.sum(data['BsmtExposure'].isnull())

In [None]:
data.loc[data['BsmtExposure'].isnull()]

In [None]:
np.sum(data['BsmtFinType1'].isnull())

In [None]:
np.sum(data['BsmtFinType2'].isnull())

In [None]:
data.loc[data['BsmtFinType2'].isnull()]

In [None]:
# Modifying'LotFrontage'
np.sum(data['LotFrontage'].isnull())

In [None]:
# data['LotFrontage']

In [None]:
np.sum(data['LotFrontage']==0)

In [None]:
data.loc[data['LotFrontage'].isnull(),'LotFrontage']=0
# data['LotFrontage']

In [None]:
np.sum(data['LotFrontage'].isnull())

In [None]:
np.sum(data['LotFrontage']==0)

In [None]:
# Modifying 'Alley'
np.sum(data['Alley'].isnull())

In [None]:
data.loc[data['Alley'].isnull(),'Alley']='NA'
# data['Alley']

In [None]:
np.sum(data['Alley'].isnull())

In [None]:
np.sum(data['Alley']=='NA')

In [None]:
#Step 2: Determine the extent of MVs after Modifying
#Summary of MVs in each column after Modifying 
mvs_summary=pd.DataFrame({'freq':np.sum(data.isnull())})
mvs_summary['pct']=round(mvs_summary['freq']/data.shape[0]*100,1)
mvs_summary.sort_values(by='pct',ascending = False)

In [None]:
# Summary of MVs for each case after Modifying
# Number of missing cells in every row
data.loc[:, 'mvs'] = data.isnull().sum(axis=1)
# Share of missing cells per row; the 3 subtracted columns are id, mvs and mvs_pct
data.loc[:, 'mvs_pct'] = (data.isnull().sum(axis=1) / (data.shape[1] - 3) * 100).round(1)
data.sort_values(by='mvs', ascending=False)

In [None]:
data['mvs_pct'].max()

In [None]:
# Step 3: Diagnose the randomness of the missing values processes
# Given the low percentage of missing values in columns and records (in a large dataset),
# It is acceptable to proceed with a simple imputation strategy without a detailed evaluation of whether the missing values are MAR. 
# This imputation will not significantly impact the overall results.

In [None]:
np.sum(data['MasVnrArea'].isnull())

In [None]:
float_columns=data.select_dtypes(include=['float64'])
float_columns

In [None]:
# Step 4: Select the imputation method
# Method: mean substitution
# Replace missing values in the numeric column 'MasVnrArea' with the column mean.
masvnr_area = data['MasVnrArea']
data.loc[:, 'MasVnrArea'] = masvnr_area.fillna(masvnr_area.mean())
print(data.shape)
data['MasVnrArea'].isnull().sum()

In [None]:
# Method: mode substitution
# Replace missing values in each categorical column with that column's mode.
categorical_columns = ['MasVnrType', 'BsmtExposure', 'BsmtFinType2', 'Electrical', 'PoolQC']

for column in categorical_columns:
    # mode() can return several values; the first one is used
    most_frequent = data[column].mode().iloc[0]
    data[column] = data[column].fillna(most_frequent)

print(data.shape)
data[categorical_columns].isnull().sum()


In [None]:
# np.sum(data.isnull())

In [None]:
data.info()

In [None]:
# Convert the columns from float to int
for float_col in ('LotFrontage', 'GarageYrBlt', 'MasVnrArea'):
    data[float_col] = data[float_col].astype('int64')

data.info()

In [None]:
data.loc[data['YearRemodAdd'] < data['YearBuilt'],['YearRemodAdd','YearBuilt']]

In [None]:
data.loc[data['MasVnrArea'] < 0,'MasVnrArea']

### 4: Univariate data analysis

In [None]:
data['Id'].nunique()

In [None]:
np.sum(data.duplicated())

In [None]:
var_ind_num1=[3,4,19,20,26,34,36,37,38]+ list(range(43,53))+[54,56,59,61,62]+list(range(66,72))+list(range(75,78))+[80]
var_ind_cat1=[1,2]+ list(range(5,19))+list(range(21,26))+list(range(27,34))+[35]+list(range(39,43))+[53,55,57,58,60,63,64,65,72,73,74,78,79]

In [None]:
# Histogram of numeric variables
plt.figure(figsize=(20, 20))
plt.subplots_adjust(hspace=0.3, wspace=0.3)
# 7x5 grid; subplot positions are 1-based
for slot, col_idx in enumerate(var_ind_num1, start=1):
    plt.subplot(7, 5, slot)
    plt.hist(x=data.iloc[:, col_idx], alpha=0.4, color='red')
    plt.title(data.columns[col_idx])
plt.show()

In [None]:
#summary statistics of numeric variables
data.iloc[:,var_ind_num1].describe()

In [None]:
# Frequency table (count and percentage) for every categorical variable
for i in var_ind_cat1:
    print('--- Frequency Table of ' + data.columns[i] + ' ---')
    # Count once and reuse for both the labels and the frequencies
    counts = data.iloc[:, i].value_counts()
    frq_summary = pd.DataFrame({'category': counts.index, 'freq': counts.values})
    frq_summary['pct'] = round(frq_summary['freq'] / data.shape[0] * 100, 1)
    # sort_values returns a new frame; keep the result so the sort takes effect
    frq_summary = frq_summary.sort_values(by='pct', ascending=False)
    print(frq_summary)

In [None]:
len(var_ind_cat1)

In [None]:
len(var_ind_num1)

In [None]:
len(var_ind_cat1)+len(var_ind_num1)

In [None]:
data.head(4)

# Data Preparation

In [None]:
# Remove added coloumns
data.drop(columns=['mvs','mvs_pct'],inplace=True)    
data.info()

### Convert Ordinal Variables into Numeric (Label Encoding)

In [None]:
# Convert 'LotShape' into a numeric variable (Reg=4 ... IR3=1)
# Series.map yields NaN for any label missing from the mapping, so the null
# count below genuinely verifies the encoding and the result is numeric.
data['cnvrt_LotShape'] = data['LotShape'].map({'Reg': 4, 'IR1': 3, 'IR2': 2, 'IR3': 1})
# Check the results
print(data['cnvrt_LotShape'].isnull().sum())
data['cnvrt_LotShape'].describe()

In [None]:
# Convert 'LandSlope' into a numeric variable (Gtl=3, Mod=2, Sev=1)
# Series.map yields NaN for any label missing from the mapping, so the null
# count below genuinely verifies the encoding and the result is numeric.
data['cnvrt_LandSlope'] = data['LandSlope'].map({'Gtl': 3, 'Mod': 2, 'Sev': 1})
# Check the results
print(data['cnvrt_LandSlope'].isnull().sum())
data['cnvrt_LandSlope'].describe()

In [None]:
# Convert 'ExterQual' into a numeric variable (Ex=5 ... Po=1)
# Series.map yields NaN for any label missing from the mapping, so the null
# count below genuinely verifies the encoding and the result is numeric.
data['cnvrt_ExterQual'] = data['ExterQual'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1})
# Check the results
print(data['cnvrt_ExterQual'].isnull().sum())
data['cnvrt_ExterQual'].describe()

In [None]:
# Convert 'ExterCond' into a numeric variable (Ex=5 ... Po=1)
# Series.map yields NaN for any label missing from the mapping, so the null
# count below genuinely verifies the encoding and the result is numeric.
data['cnvrt_ExterCond'] = data['ExterCond'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1})
# Check the results
print(data['cnvrt_ExterCond'].isnull().sum())
data['cnvrt_ExterCond'].describe()

In [None]:
# Convert 'BsmtQual' into a numeric variable (Ex=6 ... Po=2, NA=1)
# Series.map yields NaN for any label missing from the mapping, so the null
# count below genuinely verifies the encoding and the result is numeric.
data['cnvrt_BsmtQual'] = data['BsmtQual'].map(
    {'Ex': 6, 'Gd': 5, 'TA': 4, 'Fa': 3, 'Po': 2, 'NA': 1})
# Check the results
print(data['cnvrt_BsmtQual'].isnull().sum())
data['cnvrt_BsmtQual'].describe()

In [None]:
# Convert 'BsmtCond' into a numeric variable (Ex=6 ... Po=2, NA=1)
# Series.map yields NaN for any label missing from the mapping, so the null
# count below genuinely verifies the encoding and the result is numeric.
data['cnvrt_BsmtCond'] = data['BsmtCond'].map(
    {'Ex': 6, 'Gd': 5, 'TA': 4, 'Fa': 3, 'Po': 2, 'NA': 1})
# Check the results
print(data['cnvrt_BsmtCond'].isnull().sum())
data['cnvrt_BsmtCond'].describe()

In [None]:
# Convert 'BsmtExposure' into a numeric variable (Gd=5 ... No=2, NA=1)
# Series.map yields NaN for any label missing from the mapping, so the null
# count below genuinely verifies the encoding and the result is numeric.
data['cnvrt_BsmtExposure'] = data['BsmtExposure'].map(
    {'Gd': 5, 'Av': 4, 'Mn': 3, 'No': 2, 'NA': 1})
# Check the results
print(data['cnvrt_BsmtExposure'].isnull().sum())
data['cnvrt_BsmtExposure'].describe()

In [None]:
# Convert 'BsmtFinType1' into a numeric variable (GLQ=7 ... Unf=2, NA=1)
# Series.map yields NaN for any label missing from the mapping, so the null
# count below genuinely verifies the encoding and the result is numeric.
data['cnvrt_BsmtFinType1'] = data['BsmtFinType1'].map(
    {'GLQ': 7, 'ALQ': 6, 'BLQ': 5, 'Rec': 4, 'LwQ': 3, 'Unf': 2, 'NA': 1})
# Check the results
print(data['cnvrt_BsmtFinType1'].isnull().sum())
data['cnvrt_BsmtFinType1'].describe()

In [None]:
# Convert 'BsmtFinType2' into a numeric variable (GLQ=7 ... Unf=2, NA=1)
# Series.map yields NaN for any label missing from the mapping, so the null
# count below genuinely verifies the encoding and the result is numeric.
data['cnvrt_BsmtFinType2'] = data['BsmtFinType2'].map(
    {'GLQ': 7, 'ALQ': 6, 'BLQ': 5, 'Rec': 4, 'LwQ': 3, 'Unf': 2, 'NA': 1})
# Check the results
print(data['cnvrt_BsmtFinType2'].isnull().sum())
data['cnvrt_BsmtFinType2'].describe()

In [None]:
# Convert 'HeatingQC' into a numeric variable (Ex=5 ... Po=1)
# Series.map yields NaN for any label missing from the mapping, so the null
# count below genuinely verifies the encoding and the result is numeric.
data['cnvrt_HeatingQC'] = data['HeatingQC'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1})
# Check the results
print(data['cnvrt_HeatingQC'].isnull().sum())
data['cnvrt_HeatingQC'].describe()

In [None]:
# Convert 'KitchenQual' into a numeric variable (Ex=5 ... Po=1)
# Series.map yields NaN for any label missing from the mapping, so the null
# count below genuinely verifies the encoding and the result is numeric.
data['cnvrt_KitchenQual'] = data['KitchenQual'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1})
# Check the results
print(data['cnvrt_KitchenQual'].isnull().sum())
data['cnvrt_KitchenQual'].describe()

In [None]:
# Convert 'Functional' into a numeric variable (Typ=8 ... Sal=1)
# Series.map yields NaN for any label missing from the mapping, so the null
# count below genuinely verifies the encoding and the result is numeric.
data['cnvrt_Functional'] = data['Functional'].map(
    {'Typ': 8, 'Min1': 7, 'Min2': 6, 'Mod': 5, 'Maj1': 4, 'Maj2': 3, 'Sev': 2, 'Sal': 1})

# Check the results
print(data['cnvrt_Functional'].isnull().sum())
data['cnvrt_Functional'].describe()

In [None]:
# Convert 'FireplaceQu' into a numeric variable (Ex=6 ... Po=2, NA=1)
# Series.map yields NaN for any label missing from the mapping, so the null
# count below genuinely verifies the encoding and the result is numeric.
data['cnvrt_FireplaceQu'] = data['FireplaceQu'].map(
    {'Ex': 6, 'Gd': 5, 'TA': 4, 'Fa': 3, 'Po': 2, 'NA': 1})

# Check the results
print(data['cnvrt_FireplaceQu'].isnull().sum())
data['cnvrt_FireplaceQu'].describe()

In [None]:
# Convert 'GarageFinish' into a numeric variable (Fin=4, RFn=3, Unf=2, NA=1)
# Series.map yields NaN for any label missing from the mapping, so the null
# count below genuinely verifies the encoding and the result is numeric.
data['cnvrt_GarageFinish'] = data['GarageFinish'].map({'Fin': 4, 'RFn': 3, 'Unf': 2, 'NA': 1})

# Check the results
print(data['cnvrt_GarageFinish'].isnull().sum())
data['cnvrt_GarageFinish'].describe()

In [None]:
# Convert 'GarageQual' into a numeric variable (Ex=6 ... Po=2, NA=1)
# Series.map yields NaN for any label missing from the mapping, so the null
# count below genuinely verifies the encoding and the result is numeric.
data['cnvrt_GarageQual'] = data['GarageQual'].map(
    {'Ex': 6, 'Gd': 5, 'TA': 4, 'Fa': 3, 'Po': 2, 'NA': 1})

# Check the results
print(data['cnvrt_GarageQual'].isnull().sum())
data['cnvrt_GarageQual'].describe()

In [None]:
# Convert 'GarageCond' into a numeric variable (Ex=6 ... Po=2, NA=1)
# Series.map yields NaN for any label missing from the mapping, so the null
# count below genuinely verifies the encoding and the result is numeric.
data['cnvrt_GarageCond'] = data['GarageCond'].map(
    {'Ex': 6, 'Gd': 5, 'TA': 4, 'Fa': 3, 'Po': 2, 'NA': 1})

# Check the results
print(data['cnvrt_GarageCond'].isnull().sum())
data['cnvrt_GarageCond'].describe()

In [None]:
# Convert 'PavedDrive' into a numeric variable (Y=3, P=2, N=1)
# Series.map yields NaN for any label missing from the mapping, so the null
# count below genuinely verifies the encoding and the result is numeric.
data['cnvrt_PavedDrive'] = data['PavedDrive'].map({'Y': 3, 'P': 2, 'N': 1})

# Check the results
print(data['cnvrt_PavedDrive'].isnull().sum())
data['cnvrt_PavedDrive'].describe()

In [None]:
# Convert 'PoolQC' into a numeric variable (Ex=5 ... Fa=2, NA=1)
# Series.map yields NaN for any label missing from the mapping, so the null
# count below genuinely verifies the encoding and the result is numeric.
data['cnvrt_PoolQC'] = data['PoolQC'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'NA': 1})

# Check the results
print(data['cnvrt_PoolQC'].isnull().sum())
data['cnvrt_PoolQC'].describe()

In [None]:
# Convert 'Fence' into a numeric variable (GdPrv=5 ... MnWw=2, NA=1)
# Series.map yields NaN for any label missing from the mapping, so the null
# count below genuinely verifies the encoding and the result is numeric.
data['cnvrt_Fence'] = data['Fence'].map(
    {'GdPrv': 5, 'MnPrv': 4, 'GdWo': 3, 'MnWw': 2, 'NA': 1})

# Check the results
print(data['cnvrt_Fence'].isnull().sum())
data['cnvrt_Fence'].describe()

In [None]:
np.sum(data.isnull())

### Outliers

In [None]:
# data.info()

In [None]:
# Check for outliers
plt.boxplot(data['SalePrice'])
plt.show()

In [None]:
from scipy.stats import iqr
df=data
# Indices of variables to check for outliers
var_ind_num1=[3, 4, 19, 20, 26, 34, 36, 37, 38]+list(range(43, 53))+[54, 56, 59, 61, 62]+list(range(66, 72))+list(range(75, 78))+[80]

# Function to detect outliers
def outlier_detector(df,var_ind_num1):
    """Flag IQR-rule outliers in the selected columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect.
    var_ind_num1 : iterable of int
        Positional indices of the columns to check. Indices that do not
        exist in ``df`` are skipped.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df``, one column per valid index. A cell is 1 when
        the value lies below Q1 - 1.5*IQR or above Q3 + 1.5*IQR, else 0.
        Non-numeric columns stay all 0; missing values are never flagged.
    """
    n_cols = len(df.columns)
    # Filter first: the column labels below are looked up by position, so an
    # out-of-range index has to be removed before that lookup happens.
    valid = [i for i in var_ind_num1 if -n_cols <= i < n_cols]
    outlier_flags = pd.DataFrame(0, index=df.index,
                                 columns=[df.columns[i] for i in valid])
    for i in valid:
        col = df.iloc[:, i]
        if not np.issubdtype(col.dtype, np.number):
            continue
        # NaN-aware quartiles: a single missing value must not turn both
        # bounds into NaN and silently disable detection for the column.
        q1, q3 = np.nanquantile(col, [0.25, 0.75])
        spread = q3 - q1
        outliers = (col < q1 - 1.5 * spread) | (col > q3 + 1.5 * spread)
        outlier_flags.loc[outliers, df.columns[i]] = 1
    return outlier_flags

# Detect outliers
outlier_flags = outlier_detector(df, var_ind_num1)

# Per-feature outlier count and the share of all rows it represents
total_data_points = len(df)
outlier_counts_per_feature = outlier_flags.sum(axis=0)
percentage_outliers_per_feature = (outlier_counts_per_feature / total_data_points) * 100

# Feature with the most outliers, together with its count and percentage
most_outliers_feature = outlier_counts_per_feature.idxmax()
most_outliers_count = outlier_counts_per_feature.max()
percentage_outliers_most_feature = percentage_outliers_per_feature[most_outliers_feature]

# Counts and percentages side by side
outlier_summary = pd.DataFrame({
    'Number of Outliers': outlier_counts_per_feature,
    'Percentage of Outliers': percentage_outliers_per_feature,
})

# Largest outlier count first
sorted_outlier_summary = outlier_summary.sort_values(by='Number of Outliers', ascending=False)
print("\nSorted outlier summary:")
print(sorted_outlier_summary)

In [None]:
# data.drop(['EnclosedPorch','BsmtFinSF2'],axis=1,inplace=True)
print(data.shape)

In [None]:
# Indices of variables to check for outliers
var_ind_num1 =[3,4,19,20,26,34,36,37,38]+list(range(43,53))+[54,56,59,61,62]+list(range(66,72))+list(range(75,78))+[80]

# Function to detect outliers
def outlier_detector(df,var_ind_num1):
    """Flag IQR-rule outliers in the selected columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect.
    var_ind_num1 : iterable of int
        Positional indices of the columns to check. Indices that do not
        exist in ``df`` are skipped.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df``, one column per valid index. A cell is 1 when
        the value lies below Q1 - 1.5*IQR or above Q3 + 1.5*IQR, else 0.
        Non-numeric columns stay all 0; missing values are never flagged.
    """
    n_cols = len(df.columns)
    # Filter first: the column labels below are looked up by position, so an
    # out-of-range index has to be removed before that lookup happens.
    valid = [i for i in var_ind_num1 if -n_cols <= i < n_cols]
    outlier_flags = pd.DataFrame(0, index=df.index,
                                 columns=[df.columns[i] for i in valid])
    for i in valid:
        col = df.iloc[:, i]
        if not np.issubdtype(col.dtype, np.number):
            continue
        # NaN-aware quartiles: a single missing value must not turn both
        # bounds into NaN and silently disable detection for the column.
        q1, q3 = np.nanquantile(col, [0.25, 0.75])
        spread = q3 - q1
        outliers = (col < q1 - 1.5 * spread) | (col > q3 + 1.5 * spread)
        outlier_flags.loc[outliers, df.columns[i]] = 1
    return outlier_flags

# Detect outliers
outlier_flags = outlier_detector(df, var_ind_num1)

# Number of flagged features in each row
outlier_counts = outlier_flags.sum(axis='columns')

# Rows with the most flagged features first
sorted_outlier_counts = outlier_counts.sort_values(ascending=False)
print(sorted_outlier_counts)

In [None]:
# data.iloc[[1298,1182,523], :]

In [None]:
data.shape

In [None]:
# Scatter plot of LotFrontage (y axis) against LotArea (x axis)
plt.figure(figsize=(10, 6))

# Semi-transparent markers (alpha=0.6) so overlapping points remain visible
plt.scatter(data['LotArea'],data['LotFrontage'],alpha=0.6)

plt.title('LotFrontage vs LotArea')
plt.xlabel('LotArea')
plt.ylabel('LotFrontage')
plt.show()

In [None]:
# Box plot of LotFrontage by Neighborhood
# One box per neighborhood, so the figure matches its title and axis label
neighborhoods = sorted(data['Neighborhood'].dropna().unique())
frontage_by_neighborhood = [
    data.loc[data['Neighborhood'] == name, 'LotFrontage'].dropna()
    for name in neighborhoods
]
plt.figure(figsize=(12, 8))
plt.boxplot(frontage_by_neighborhood)
plt.title('LotFrontage by Neighborhood')
plt.xlabel('Neighborhood')
plt.ylabel('LotFrontage')
# Boxes are drawn at positions 1..n; label each one with its neighborhood
plt.xticks(range(1, len(neighborhoods) + 1), neighborhoods, rotation=90)
plt.show()

In [None]:
summary_stats=data.groupby('Neighborhood')['LotFrontage'].describe()
print(summary_stats)

In [None]:
#summary statistics of numeric variables
data.iloc[:,var_ind_num1].describe()

# Data understanding- Phase2

In [None]:
data.info()

In [None]:
# List of all numeric variables after converting ordinal variables to numeric
var_ind_num2=[3,4,19,20,26,34,36,37,38]+list(range(43,53))+[54,56,59,61,62]+list(range(66,72))+list(range(75,78))+[80]+list(range(81,100))
# var_ind_num2
len(var_ind_num2)

In [None]:
# Histogram of numeric variables
# Iterate over the index list itself so every variable gets a panel
# (a fixed range(1, 53) stopped one short of the 53 entries).
n_grid_cols = 6
n_grid_rows = -(-len(var_ind_num2) // n_grid_cols)  # ceiling division
plt.figure(figsize=(20,20))
plt.subplots_adjust(hspace=0.4,wspace=0.4)
for position, col_idx in enumerate(var_ind_num2, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, position)
    plt.hist(x=data.iloc[:, col_idx], alpha=0.4, color='red')
    plt.title(data.columns[col_idx])
plt.show()

### 5: Bivariate data analysis

In [None]:
#Correlation analysis after converting ordinal variables to numeric
corr_table=round(data.iloc[:,var_ind_num2].corr(method='pearson'),2)
corr_table

In [None]:
#Scatter plot for numeric variables after converting ordinal variables to numeric
# Every numeric predictor is plotted against SalePrice; SalePrice itself is
# left out because plotting the response against itself carries no information.
predictor_idx = [idx for idx in var_ind_num2 if data.columns[idx] != 'SalePrice']
n_grid_cols = 5
n_grid_rows = -(-len(predictor_idx) // n_grid_cols)  # ceiling division
# Height grows with the number of rows so the panels keep a readable size
plt.figure(figsize=(20, 3 * n_grid_rows))
plt.subplots_adjust(hspace=0.3,wspace=0.3)
for position, col_idx in enumerate(predictor_idx, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, position)
    plt.scatter(x=data.iloc[:, col_idx], y=data['SalePrice'], alpha=0.4, color='red')
    plt.title('SalePrice vs. ' + data.columns[col_idx])
plt.show()

In [None]:
var_ind_cat2=[1,2,5,6,8,9,10]+list(range(12,17))+list(range(21,26))+[29,39,41,42,58,74,78,79] 

In [None]:
# Names of the categorical columns to plot
col_cat2=[data.columns[i] for i in var_ind_cat2]

# One subplot per categorical column, stacked vertically
fig,axes=plt.subplots(len(col_cat2),1,figsize=(8,6*len(col_cat2)))

# plt.subplots returns a bare Axes (not an array) when a single plot is requested
if len(col_cat2)==1:
    axes=[axes]

# Loop over each categorical column and plot the boxplot
for ax,col in zip(axes,col_cat2):
    # Sorted categories keep the box order stable between runs
    categories=sorted(data[col].dropna().unique())

    # SalePrice values of the houses belonging to each category
    data_to_plot=[data.loc[data[col]==category,'SalePrice'] for category in categories]

    # Label the boxes through the axis instead of boxplot's `labels` argument,
    # which Matplotlib 3.9 deprecated in favour of `tick_labels`.
    ax.boxplot(data_to_plot)
    ax.set_xticklabels(categories)
    ax.set_title(f'Box Plot of SalePrice by {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('SalePrice')
    ax.tick_params(axis='x',rotation=45)
plt.subplots_adjust(hspace=0.4)
plt.show()

# Model I: Build Classic Linear Regression Model

In [None]:
# Train data
train=data
train.info()
# print(train.head(4))

In [None]:
# Convert 'MSSubClass' to string type
train['MSSubClass']=train['MSSubClass'].astype(str)

In [None]:
#Split Train data into test and train for validation (named trainv & testv)
from sklearn.model_selection import train_test_split
trainv, testv = train_test_split(data, test_size = 0.3, random_state = 1234)
print(trainv.shape)
print(testv.shape)


In [None]:
#Create dummy variables for categorical variables
nominal_cols_trv = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley', 'LandContour',
    'Utilities', 'LotConfig', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'Heating', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'CentralAir',
    'Electrical', 'GarageType', 'MiscFeature', 'SaleType', 'SaleCondition',
]

# One indicator column per category level, cast so they hold only 0 and 1
dummy_vars_trv = pd.get_dummies(trainv[nominal_cols_trv]).astype(int)

print(dummy_vars_trv.head(2))
dummy_vars_trv.info()

In [None]:
dummy_vars_trv.shape

In [None]:
# Define base level for each categorical variable According to frequency percentage of each level
dummy_vars_trv.drop(columns = ['MSSubClass_20','MSZoning_RL','Street_Pave','Alley_NA','LandContour_Lvl','Utilities_AllPub',
                           'LotConfig_Inside', 'Neighborhood_NAmes', 'Condition1_Norm','Condition2_Norm','BldgType_1Fam',
                           'HouseStyle_1Story','RoofStyle_Gable','RoofMatl_CompShg','Exterior1st_VinylSd','Exterior2nd_VinylSd',
                           'MasVnrType_none','Foundation_PConc','CentralAir_Y','Heating_GasA','Electrical_SBrkr',
                           'GarageType_Attchd','SaleType_WD','SaleCondition_Normal','MiscFeature_NA'], inplace = True)
dummy_vars_trv.info()

In [None]:
print(dummy_vars_trv.shape)
print(trainv.shape)

In [None]:
dummy_vars_trv

In [None]:
#Define feature matrix
# Train All columns except 'SalePrice'
X_X=trainv.iloc[:,list(range(0,80))+list(range(81,100))]
print(X_X.columns)
X_trainv=pd.concat([X_X,dummy_vars_trv], axis = 1)
# X_trainv.info()
# #add constant
X_trainv=sm.add_constant(X_trainv)
X_trainv.head(2)

In [None]:
X_trainv.shape

In [None]:
# Remove coloumn'Id'
X_trainv.drop(columns=['Id'],inplace=True)
print(X_trainv.shape)
X_trainv.head()

In [None]:
X_trainv.shape

In [None]:
# Define response matrix
y_trainv=trainv['SalePrice']
print(len(y_trainv))
y_trainv

In [None]:
X_trainv=X_trainv.drop(columns=X_trainv.select_dtypes(include=['object']).columns)
# Check the data types of the cleaned DataFrame
print(X_trainv.dtypes)
X_trainv.info()

In [None]:
# Convert float64 and int32 columns to int64
X_trainv=X_trainv.astype({col:'int64' for col in X_trainv.select_dtypes(include=['float64','int32']).columns})

# Check the data types of the columns to verify the conversion
print(X_trainv.dtypes)

In [None]:
X_trainv.shape

In [None]:
#Linear regression- model 1
model_1=sm.OLS(y_trainv,X_trainv).fit()
model_1.summary()

In [None]:
# Function to plot histogram of residuals
def hist_residuals(model,bins = 50):
    """Draw a density histogram of ``model.resid`` with a KDE curve overlaid.

    A dashed vertical line marks the mean residual. ``bins`` is passed to
    ``plt.hist``. Returns the result of ``plt.show()``.
    """
    from scipy import stats

    residuals = model.resid

    # Gaussian kernel density estimate on 100 evenly spaced points
    # spanning the observed residual range
    grid = np.linspace(residuals.min(), residuals.max(), 100)
    kde_values = stats.gaussian_kde(residuals).pdf(grid)

    # Histogram, mean marker and labels
    plt.hist(residuals, bins=bins, color='red', alpha=0.7, density=True)
    plt.axvline(residuals.mean(), color='black',
                linewidth=2, linestyle='--', label="Average")
    plt.title('Histogram of Residuals')
    plt.xlabel('Residuals')
    plt.ylabel('Density')

    # Density curve on top of the histogram
    plt.plot(grid, kde_values, color='black', linewidth=2)
    plt.legend()

    return plt.show()

In [None]:
#QQ-plot- model 1
sm.qqplot(model_1.resid,line = 's')
plt.show()

In [None]:
# Function to plot residuals vs. fitted values
def residuals_fittedvalues_plot(model):
    """Scatter ``model.resid`` against ``model.fittedvalues`` with a LOWESS trend.

    The three observations with the largest absolute residual are annotated
    with their index label. Returns the result of ``plt.show()``.
    """
    fitted = model.fittedvalues
    residuals = model.resid

    # LOWESS smoother of residuals on fitted values; column 0 is x, column 1 is y
    smoothed = sm.nonparametric.lowess(residuals, fitted)

    # Raw points
    plt.scatter(x=fitted, y=residuals, color='red', alpha=0.7, label='data')
    plt.title('Residuals vs. Fitted values')
    plt.xlabel('Fitted Values', fontsize=12)
    plt.ylabel('Residuals', fontsize=12)
    plt.grid()

    # Trend line
    plt.plot(smoothed[:, 0], smoothed[:, 1], 'black',
             alpha=0.8, linewidth=2, label='LOWESS')
    plt.legend()

    # Label the three observations with the greatest absolute residual
    largest = abs(residuals).sort_values(ascending=False).head(3)
    for label in largest.index:
        plt.annotate(label, xy=(fitted[label], residuals[label]), color='blue')

    return plt.show()

In [None]:
# Scatter plot of residuals vs. fitted values- model 1
residuals_fittedvalues_plot(model_1)

In [None]:
# trainv.loc[[825,523,691],:]

In [None]:
#Function to check Cook's distance
def influencer_detector(model,thershold = 1):
    """Return the positions of observations with a large Cook's distance.

    Parameters
    ----------
    model : fitted regression results
        Object exposing ``get_influence()``, whose result has a
        ``cooks_distance`` attribute.
    thershold : float, default 1
        Observations whose Cook's distance is strictly greater than this
        value are reported. (The parameter name is kept as spelled so
        existing keyword callers keep working.)

    Returns
    -------
    tuple of numpy.ndarray
        Result of ``np.where``: positional indices of the flagged observations.
    """
    #create instance of influence
    influence=model.get_influence()

    #Cook's distance results; element [0] holds the per-observation distances
    cooks=influence.cooks_distance

    #Compare against the caller's threshold instead of a hard-coded 1
    return np.where(cooks[0] > thershold)

In [None]:
# #Check Cook's ditance- model 1
# influencer_detector(model_1)

In [None]:
# trainv.loc[[65,   83,   91,  761,  819,  897, 1059, 1118], :]

In [None]:
# #List of all numeric variables after converting ordinal variables to numeric (not including 'Saleprice')
# var_ind_num3=[3,4,19,20,26,34,36,37,38]+ list(range(43,53))+[54,56,59,61,62]+list(range(66,72))+list(range(75,78))+list(range(81,100))
# # var_ind_num2
# len(var_ind_num3)

In [None]:
#Function to check multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(X):
    """Compute the variance inflation factor of every column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric design matrix.

    Returns
    -------
    pandas.DataFrame
        Two columns: "variables" (column names of ``X``) and "VIF".
    """
    # Hand statsmodels a plain float array: older releases slice the design
    # matrix positionally (exog[:, i]), which fails on a DataFrame.
    design = np.asarray(X, dtype=float)
    vif = pd.DataFrame()
    vif["variables"]=X.columns
    vif["VIF"]=[variance_inflation_factor(design, i) for i in range(design.shape[1])]
    return vif

In [None]:
#Check multicollinearity for numeric variables
calc_vif(X_trainv)
#Note: If VIF > 10 then multicollinearity is high

In [None]:
# Regularization: techniques like Ridge or Lasso regression can help address multicollinearity by adding a penalty on the size of the coefficients,
# which can mitigate the impact of collinear variables.
# A very large VIF indicates extreme collinearity: the variable is nearly a perfect linear combination of other variables, so the model is unreliable.

In [None]:
# Box-Cox transformation
from sklearn.preprocessing import PowerTransformer
boxcox=PowerTransformer(method='box-cox')
boxcox.fit(y_trainv.values.reshape(-1, 1))
boxcox.lambdas_

In [None]:
#Linear regression-model 1 with Box-Cox transformation
trans_y_trainv=boxcox.fit_transform(y_trainv.values.reshape(-1, 1))
plt.hist(trans_y_trainv,color='red',alpha = 0.5,bins = 50)
plt.show()

In [None]:
model_1=sm.OLS(trans_y_trainv,X_trainv).fit()
model_1.summary()

In [None]:
# Histogram of residuals- model 1 with Box-Cox transformation
hist_residuals(model_1)

In [None]:
# QQ-plot- model 1 with Box-Cox transformation
sm.qqplot(model_1.resid,line='s')
plt.show()

In [None]:
# Scatter plot of residuals vs. fitted values- model 1 with Box-Cox transformation
residuals_fittedvalues_plot(model_1)

In [None]:
# trainv.loc[[462,632,523], :]

In [None]:
# Check Cook's ditance- model 1 with Box-Cox transformation
# influencer_detector(model_1)

In [None]:
# X_trainv.loc[[ 22,   65,   83,   91,  761,  819,  897, 1059, 1118], :]

In [None]:
# Note: Remove observations 
 # X_train.drop(index =[1322,816,970], inplace = True)
 # y_train.drop(index =[591,816,970], inplace = True)
# trans_y_train=PowerTransformer(method='box-cox').fit_transform(y_train.values.reshape(-1, 1))

In [None]:
model_1 = sm.OLS(trans_y_trainv,X_trainv).fit()
model_1.summary()

In [None]:
# Get the p-values from the model
pvalues=model_1.pvalues

# Define the significance level (alpha)
alpha=0.05

# Identify predictors with p-value greater than alpha.
# The intercept ('const') is never a removal candidate: a later cell adds the
# constant back with sm.add_constant, so eliminating it here would make the
# selection run on a different model than the one that is finally fitted.
predictor_pvalues=pvalues.drop(labels=['const'],errors='ignore')
non_significant_columns=predictor_pvalues[predictor_pvalues>alpha].index
print("Non-significant columns to drop:",non_significant_columns)
# Drop non-significant columns from the original X_trainv DataFrame
X_trainv_significant=X_trainv.drop(columns=non_significant_columns)

# Refit the model with the remaining significant columns
model_1_significant=sm.OLS(trans_y_trainv, X_trainv_significant).fit()

# Summary of the new model
print(model_1_significant.summary())

# Backward elimination: start from the full design matrix and remove the
# least significant predictor one at a time, refitting after every removal.
X_trainv_current=X_trainv.copy()

while True:
    model = sm.OLS(trans_y_trainv, X_trainv_current).fit()
    pvalues = model.pvalues
    predictor_pvalues = pvalues.drop(labels=['const'], errors='ignore')

    # Stop when no predictor is left or every remaining one is significant
    # (a NaN maximum compares False and therefore also stops the loop)
    if predictor_pvalues.empty:
        break
    worst_pvalue = predictor_pvalues.max()
    if not worst_pvalue > alpha:
        break

    # Drop the column with the highest p-value
    non_significant_column = predictor_pvalues.idxmax()
    print(f"Dropping non-significant column: {non_significant_column} with p-value: {worst_pvalue}")
    X_trainv_current = X_trainv_current.drop(columns=[non_significant_column])

In [None]:
 # X_train.drop(index=[1322,816,970],inplace = True)
# The response is a Series, so there are no columns to remove: feature
# selection only changes X. Series.drop(columns=...) leaves the data unchanged
# and needed a loop variable that is undefined when nothing was eliminated.
y_trainv_current=y_trainv.copy()
# X_trainv_current=X_trainv_current.drop(columns=[non_significant_column])

In [None]:
y_trainv_current.shape

In [None]:
X_trainv.shape

In [None]:
X_trainv_current.shape

In [None]:
X_trainv_current=sm.add_constant(X_trainv_current)

In [None]:
trans_y_trainv_current=PowerTransformer(method='box-cox').fit_transform(y_trainv_current.values.reshape(-1, 1))

In [None]:
#Linear regression - model 1 with Box-Cox transformation and significant columns (t_test)
model_final=sm.OLS(trans_y_trainv_current,X_trainv_current).fit()
print(model_final.summary())

In [None]:
# Histogram of residuals - model_final
hist_residuals(model_final)

In [None]:
# Scatter plot of residuals vs. fitted values - model_final
residuals_fittedvalues_plot(model_final)

In [None]:
# trainv.loc[[825,523,632], :]

In [None]:
# Check Cook's distance - model_final
influencer_detector(model_final)

In [None]:
# trainv.loc[[ 173, 185, 507, 995], :]

In [None]:
# # Filter out variables with high VIF (>10)
# filtered = vif_df[vif_df['VIF'] > 10]
# # filtered_vif_df = vif_df[vif_df['VIF'] <= 10]
# print(filtered)
# # print(filtered_vif_df)

## Evaluation on the Validation Set (Classic Regression)

In [None]:
testv.info()

In [None]:
 #Create dummy variables for categorical variables
# Categorical columns of the validation split that are one-hot encoded
categorical_cols_tev = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LandContour',
                        'Utilities', 'LotConfig', 'Neighborhood', 'Condition1', 'Condition2',
                        'BldgType', 'HouseStyle', 'RoofStyle', 'Heating', 'RoofMatl',
                        'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'CentralAir',
                        'Electrical', 'GarageType', 'MiscFeature', 'SaleType', 'SaleCondition']
dummy_vars_tev = pd.get_dummies(testv[categorical_cols_tev])

# Cast the indicator columns to int so they contain only 0 and 1
dummy_vars_tev = dummy_vars_tev.astype(int)

# Overview of the generated dummy columns
dummy_vars_tev.info()

In [None]:
dummy_vars_trv.shape

In [None]:
dummy_vars_tev.shape

In [None]:
# Drop the base (reference) level of each categorical variable, according to the train data set
base_levels_tev = [
    'MSSubClass_20', 'MSZoning_RL', 'Street_Pave', 'Alley_NA', 'LandContour_Lvl', 'Utilities_AllPub',
    'LotConfig_Inside', 'Neighborhood_NAmes', 'Condition1_Norm', 'Condition2_Norm', 'BldgType_1Fam',
    'HouseStyle_1Story', 'RoofStyle_Gable', 'RoofMatl_CompShg', 'Exterior1st_VinylSd', 'Exterior2nd_VinylSd',
    'MasVnrType_none', 'Foundation_PConc', 'CentralAir_Y', 'Heating_GasA', 'Electrical_SBrkr',
    'GarageType_Attchd', 'SaleType_WD', 'SaleCondition_Normal', 'MiscFeature_NA',
]
dummy_vars_tev.drop(columns=base_levels_tev, inplace=True)
dummy_vars_tev.info()

In [None]:
# Align the validation dummies with the training dummy columns (same columns, same order);
# columns that do not occur in the validation split are created and filled with 0.
dummy_vars_tev=dummy_vars_tev.reindex(columns=dummy_vars_trv.columns,fill_value=0)

In [None]:
print(dummy_vars_trv.shape)
print(trainv.shape)


print(dummy_vars_tev.shape)
print(testv.shape)
# print(testv.info)

In [None]:
# Define the validation feature matrix
# All columns except 'SalePrice'
# NOTE(review): columns are selected by position (index 80 is skipped, presumably
# 'SalePrice') - verify against the current column layout of testv.
X_X=testv.iloc[:,list(range(0,80))+list(range(81,100))]
# print(X_X.columns)
# Append the aligned dummy variables next to the original columns
X_testv=pd.concat([X_X,dummy_vars_tev],axis = 1)
# Add the intercept column ('const'); has_constant='add' is passed explicitly
# X_testv=sm.add_constant(X_testv)
X_testv = sm.add_constant(X_testv,has_constant='add')
X_testv.shape
print(X_testv.columns)

In [None]:
# Remove column 'Id'
X_testv.drop(columns=['Id'],inplace=True)
X_testv.shape

In [None]:
# Discard the remaining text (object dtype) columns of the validation matrix
object_cols = X_testv.select_dtypes(include=['object']).columns
X_testv = X_testv.drop(columns=object_cols)
# Check the data types and shape of the cleaned DataFrame
print(X_testv.dtypes)
print(X_testv.shape)

In [None]:
# Cast every float64 / int32 column to int64
cols_to_cast = X_testv.select_dtypes(include=['float64', 'int32']).columns
X_testv = X_testv.astype(dict.fromkeys(cols_to_cast, 'int64'))

# Verify the conversion
print(X_testv.dtypes)

In [None]:
#Linear regression - model 1 with Box-Cox transformation and significant columns (t_test)
model_final=sm.OLS(trans_y_trainv_current,X_trainv_current).fit()
# print(model_final.summary())

In [None]:
trans_y_trainv_current=PowerTransformer(method = 'box-cox').fit_transform(y_trainv_current.values.reshape(-1, 1))

In [None]:
X_trainv.info()

In [None]:
X_testv.info()

In [None]:
trans_y_trainv_current.shape

In [None]:
#Linear regression- model 1 with Box-Cox transformation and significant columns (t_test)
model_final=sm.OLS(trans_y_trainv_current,X_trainv_current).fit()
print(model_final.summary())

In [None]:
X_trainv_current

In [None]:
 X_testv.shape

In [None]:
# Keep only the columns used by the final model, in the same order as in training
significant_cols = X_trainv_current.columns
X_testv_significant = X_testv[significant_cols]

# Predictions of the final model (still on the Box-Cox transformed scale)
testv_pred = model_final.predict(X_testv_significant)

# Show the predictions
print(testv_pred)

In [None]:
# X_testv_significant

In [None]:
# X_testv_current=X_testv.drop(columns=[non_significant_column])

In [None]:
# X_testv_current.info()

In [None]:
# Predict using the final model
testv_pred=model_final.predict(X_testv_significant)

# Output the predictions
print(testv_pred)

In [None]:
# Inverse transformation of predicted values (Linear Model)
# `boxcox` is a transformer defined elsewhere in the notebook; the predictions are
# reshaped to one column for inverse_transform and flattened back to 1-D afterwards.
# The result is stored on testv as 'pred_lm', keeping the index of testv_pred.
testv['pred_lm']=pd.Series(boxcox.inverse_transform(testv_pred.values.reshape(-1, 1)).reshape(-1), 
                         index = testv_pred.index)
testv['pred_lm']

In [None]:
# Actual vs. Prediction
plt.scatter(x=testv['SalePrice'], y=testv['pred_lm'],
            c='black', alpha=0.3)
plt.xlabel('Actual')
plt.ylabel('Prediction')
plt.title('Actual vs. Prediction T_test')

# Add 45 degree line. It spans the joint range of actual and predicted values,
# so it is not cut short when the two cover different ranges.
xp = np.linspace(min(testv['SalePrice'].min(), testv['pred_lm'].min()),
                 max(testv['SalePrice'].max(), testv['pred_lm'].max()), 100)
plt.plot(xp, xp, c='red', linewidth=3)
plt.show()

In [None]:
from scipy.stats import iqr
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Absolute prediction errors of the t-test based linear model on the validation set
abs_err_lm = (testv['SalePrice'] - testv['pred_lm']).abs()

# MAE, MSE and RMSE on the original SalePrice scale
mae = mean_absolute_error(testv['SalePrice'], testv['pred_lm'])
mse = mean_squared_error(testv['SalePrice'], testv['pred_lm'])
rmse = np.sqrt(mse)

# First row of the models comparison table
lm_stats = {'Mean of AbsErrors(MAE)': abs_err_lm.mean(),
            'MSE': mse,
            'RMSE': rmse,
            'Median of AbsErrors': abs_err_lm.median(),
            'SD of AbsErrors': abs_err_lm.std(),
            'IQR of AbsErrors': iqr(abs_err_lm),
            'Min of AbsErrors': abs_err_lm.min(),
            'Max of AbsErrors': abs_err_lm.max()}
models_comp = pd.DataFrame(lm_stats, index=['LM_t-Test'])
models_comp

## Forward Selection

In [None]:
# Define function to fit linear regression
def fit_lm(feature_set,y,X):
    """Fit an OLS model of y on the intercept plus the given features.

    Parameters
    ----------
    feature_set : iterable of str
        Column names of X used as predictors. A 'const' entry is ignored,
        because the intercept column is always added exactly once.
    y : array-like
        Response values, passed to sm.OLS unchanged.
    X : pandas.DataFrame
        Design matrix containing a 'const' column and every name in feature_set.

    Returns
    -------
    dict
        The fitted results under 'model' plus its 'r2', 'adj_r2', 'aic' and 'bic'.
    """
    # Callers may pass X.columns (which contains 'const'); selecting the intercept
    # twice would put a duplicated column into the design matrix.
    predictors = [name for name in feature_set if name != 'const']
    reg_model = sm.OLS(y, X[['const'] + predictors]).fit()
    return {'model': reg_model,
            'r2': reg_model.rsquared,
            'adj_r2': reg_model.rsquared_adj,
            'aic': reg_model.aic,
            'bic': reg_model.bic}

In [None]:
# Define function to do forward selection
def fwd_selection(features, y, X):
    """Return the best model obtained by adding one more feature to `features`.

    Every column of X after the first one that is not yet in `features` is
    tried in turn; the fit with the highest R squared is returned as a row
    with the entries 'model', 'r2', 'adj_r2', 'aic' and 'bic'.
    """
    # Candidate features: all columns but the first, minus the ones already selected
    candidates = []
    for col in X.iloc[:, 1:].columns:
        if col not in features:
            candidates.append(col)

    # One fitted model per candidate
    fits = [fit_lm(features + [col], y, X) for col in candidates]
    results = pd.DataFrame(fits)

    # Position of the fit with the highest R squared
    best_pos = results['r2'].argmax()
    return results.iloc[best_pos]

In [None]:
X_trainv.shape

In [None]:
len(trans_y_trainv)

In [None]:
#Forward selection implementation
import time #to measure the processing time
# One row per model size i (number of predictors), filled in by the loop below
fwd_models=pd.DataFrame(columns=['model','r2','adj_r2','aic','bic'])
start_time=time.time()
features=[]
# Greedy search: each pass adds the single feature that yields the highest R squared
for i in range(1,len(X_trainv.iloc[:, 1:].columns)+1):
    fwd_models.loc[i]=fwd_selection(features,trans_y_trainv,X_trainv)
    # exog_names[0] is skipped because fit_lm always places 'const' first
    features=fwd_models.loc[i,'model'].model.exog_names[1:]
end_time=time.time()
print('The Processing time is: ',end_time-start_time,'seconds')

### Evaluation on the Validation Set (Forward selection)

In [None]:
#Models evaluation
# One panel per criterion stored in columns 1-4 of fwd_models (r2, adj_r2, aic, bic)
plt.figure(figsize=(20, 12))
plt.subplots_adjust(hspace=0.3, wspace=0.3)
# Largest model size in the table; drives the x ticks instead of a hard-coded limit
max_predictors = int(fwd_models.index.max())
for i in range(1, 5):
    plt.subplot(2, 2, i)
    plt.plot(fwd_models.iloc[:, i], 'r-o', alpha=0.3, linewidth=2)
    plt.title(fwd_models.columns[i])
    plt.xlabel('# of Predictors', fontsize=12)
    plt.xticks(range(0, max_predictors + 1, 15))
    plt.ylabel(fwd_models.columns[i], fontsize=12)
    plt.grid(True)
# Show the plot
plt.show()

In [None]:
# Linear model with 105 variables (based on r2 and on inspecting the other 3 plots)
fwd_models.loc[105,'model'].params

In [None]:
#Selected features 
fwd_models.loc[105,'model'].model.exog_names

In [None]:
#Predict on test- model 2
# Forward-selection model with 105 predictors
fwd_best = fwd_models.loc[105, 'model']
pred_fwd = fwd_best.predict(X_testv[fwd_best.model.exog_names])
# Back-transform the predictions with `boxcox` and index them like testv
pred_fwd_values = boxcox.inverse_transform(pred_fwd.values.reshape(-1, 1)).reshape(-1)
pred_fwd = pd.Series(pred_fwd_values, index=testv.index)

In [None]:
len(pred_fwd)

In [None]:
len(testv['SalePrice'])

In [None]:
# Absolute prediction errors of the forward-selection model
abs_err_fwd = (testv['SalePrice'] - pred_fwd).abs()

# MAE, MSE and RMSE on the original SalePrice scale
mae_fwd = mean_absolute_error(testv['SalePrice'], pred_fwd)
mse_fwd = mean_squared_error(testv['SalePrice'], pred_fwd)
rmse_fwd = np.sqrt(mse_fwd)

# Append the forward-selection row to the models comparison table
fwd_row = pd.DataFrame({'Mean of AbsErrors(MAE)': abs_err_fwd.mean(),
                        'MSE': mse_fwd,
                        'RMSE': rmse_fwd,
                        'Median of AbsErrors': abs_err_fwd.median(),
                        'SD of AbsErrors': abs_err_fwd.std(),
                        'IQR of AbsErrors': iqr(abs_err_fwd),
                        'Min of AbsErrors': abs_err_fwd.min(),
                        'Max of AbsErrors': abs_err_fwd.max()},
                       index=['LM_FWD'])
models_comp = pd.concat([models_comp, fwd_row])
models_comp

In [None]:
# Actual vs. Prediction
plt.scatter(x=testv['SalePrice'], y=pred_fwd,
            c='black', alpha=0.3)
plt.xlabel('Actual')
plt.ylabel('Prediction')
plt.title('Actual vs. Prediction- FWD Selection')

# Add 45 degree line. It spans the joint range of actual and predicted values,
# so it is not cut short when the two cover different ranges.
xp = np.linspace(min(testv['SalePrice'].min(), pred_fwd.min()),
                 max(testv['SalePrice'].max(), pred_fwd.max()), 100)
plt.plot(xp, xp, c='red', linewidth=3)
plt.show()

## Backward Elimination

In [None]:
import itertools

#Define function to do backward elimination
def bwd_elimination(features,y,X):
    """Return the best model obtained by removing exactly one of `features`.

    Every subset with one feature left out is fitted with fit_lm; the fit
    with the highest R squared is returned as a row with the entries
    'model', 'r2', 'adj_r2', 'aic' and 'bic'.
    """
    subset_size = len(features) - 1

    # One fitted model per leave-one-out subset
    fits = [fit_lm(subset, y, X)
            for subset in itertools.combinations(features, subset_size)]
    results = pd.DataFrame(fits)

    # Row of the fit with the highest R squared
    return results.iloc[results['r2'].argmax()]

In [None]:
X_trainv.shape

In [None]:
#Backward elimination implementation
# One row per model size (number of predictors), indexed by that size
bwd_models = pd.DataFrame(columns=['model','r2','adj_r2','aic','bic'])
start_time = time.time()
# Candidate predictors only: fit_lm adds 'const' itself, so the intercept must
# not be offered as a removable feature (it would be duplicated in the fits).
features = [col for col in X_trainv.columns if col != 'const']
# Record the full model first so the table starts at the largest model size
bwd_models.loc[len(features)] = pd.Series(fit_lm(features, trans_y_trainv, X_trainv))
while len(features) > 1:
    n_kept = len(features) - 1
    # Best model after removing the single least useful predictor
    bwd_models.loc[n_kept] = bwd_elimination(features, trans_y_trainv, X_trainv)
    # exog_names[0] is 'const'; the rest are the predictors that were kept
    features = bwd_models.loc[n_kept, 'model'].model.exog_names[1:]
end_time = time.time()
print('The Processing time is: ',end_time-start_time,'seconds')

In [None]:
bwd_models

## Evaluation on the Validation Set (Backward Elimination)

In [None]:
#Models evaluation
# One panel per criterion stored in columns 1-4 of bwd_models (r2, adj_r2, aic, bic)
plt.figure(figsize=(20, 12))
plt.subplots_adjust(hspace=0.3, wspace=0.3)
# Largest model size in the table; drives the x ticks instead of a hard-coded limit
max_predictors = int(bwd_models.index.max())
for i in range(1, 5):
    plt.subplot(2, 2, i)
    plt.plot(bwd_models.iloc[:, i], 'r-o', alpha=0.3, linewidth=2)
    plt.title(bwd_models.columns[i])
    plt.xlabel('# of Predictors', fontsize=12)
    plt.xticks(range(0, max_predictors + 1, 15))
    plt.ylabel(bwd_models.columns[i], fontsize=12)
    plt.grid(True)
# Show the plot
plt.show()

In [None]:
# Linear model with 120 variables (based on r2)
bwd_models.loc[120,'model'].params

In [None]:
#Selected features 
bwd_models.loc[120,'model'].model.exog_names

In [None]:
#Predict on test- model 3
# Backward-elimination model with 120 predictors
bwd_best = bwd_models.loc[120, 'model']
pred_bwd = bwd_best.predict(X_testv[bwd_best.model.exog_names])
# Back-transform the predictions with `boxcox` and index them like testv
pred_bwd_values = boxcox.inverse_transform(pred_bwd.values.reshape(-1, 1)).reshape(-1)
pred_bwd = pd.Series(pred_bwd_values, index=testv.index)
pred_bwd

In [None]:
# Absolute prediction errors of the backward-elimination model
abs_err_bwd = (testv['SalePrice'] - pred_bwd).abs()

# MAE, MSE and RMSE on the original SalePrice scale
mae_bwd = mean_absolute_error(testv['SalePrice'], pred_bwd)
mse_bwd = mean_squared_error(testv['SalePrice'], pred_bwd)
rmse_bwd = np.sqrt(mse_bwd)

# Append the backward-elimination row to the models comparison table
bwd_row = pd.DataFrame({'Mean of AbsErrors(MAE)': abs_err_bwd.mean(),
                        'MSE': mse_bwd,
                        'RMSE': rmse_bwd,
                        'Median of AbsErrors': abs_err_bwd.median(),
                        'SD of AbsErrors': abs_err_bwd.std(),
                        'IQR of AbsErrors': iqr(abs_err_bwd),
                        'Min of AbsErrors': abs_err_bwd.min(),
                        'Max of AbsErrors': abs_err_bwd.max()},
                       index=['LM_BWD'])
models_comp = pd.concat([models_comp, bwd_row])
models_comp

In [None]:
# Actual vs. Prediction
plt.scatter(x=testv['SalePrice'], y=pred_bwd,
            c='black', alpha=0.3)
plt.xlabel('Actual')
plt.ylabel('Prediction')
plt.title('Actual vs. Prediction- BWD Elimination')

# Add 45 degree line. It spans the joint range of actual and predicted values,
# so it is not cut short when the two cover different ranges.
xp = np.linspace(min(testv['SalePrice'].min(), pred_bwd.min()),
                 max(testv['SalePrice'].max(), pred_bwd.max()), 100)
plt.plot(xp, xp, c='red', linewidth=3)
plt.show()

## Ridge Regression

In [None]:
#Scale data before implementing Ridge regression
from sklearn.preprocessing import StandardScaler
# The scaler is fitted on the training features only; the same fitted scaler
# is reused later to transform the validation features.
scaler=StandardScaler()
X_trainv_scaled=scaler.fit_transform(X_trainv)
X_trainv_scaled

In [None]:
#Implement Ridge regression - model 4
from sklearn.linear_model import Ridge
ridge_reg=Ridge(alpha=0.1) #lambda = 0.1 (regularization hyperparameter)
model_4=ridge_reg.fit(X_trainv_scaled,trans_y_trainv)

In [None]:
model_4.coef_

In [None]:
# Grid
lambda_grid=10**np.linspace(3,-3,100)
lambda_grid

In [None]:
# K-fold cross validation to choose the best model
from sklearn.model_selection import cross_val_score

# Mean 5-fold CV score (negative RMSE, so larger is better) for every lambda in the grid
cv_errors = np.zeros(shape=len(lambda_grid))

for i, lam in enumerate(lambda_grid):
    ridge_reg = Ridge(alpha=lam)
    scores = cross_val_score(estimator=ridge_reg,
                             X=X_trainv_scaled,
                             y=trans_y_trainv,
                             scoring='neg_root_mean_squared_error',
                             cv=5, n_jobs=-1)
    cv_errors[i] = scores.mean()

cv_errors

In [None]:
# Best lambda
# cv_errors holds negative RMSE scores, so the best lambda sits at the maximum
best_lamda=lambda_grid[np.argmax(cv_errors)]
best_lamda

In [None]:
# Best model coefs:
ridge_reg=Ridge(alpha = best_lamda)
model_4=ridge_reg.fit(X_trainv_scaled, trans_y_trainv)
model_4.coef_

## Evaluation on the Validation Set (Ridge Regression)

In [None]:
# Predict on test- model 4
# Scale the validation features with the scaler fitted on the training set
X_testv_scaled = scaler.transform(X_testv)
pred_ridge = model_4.predict(X_testv_scaled)
# reshape(-1, 1) lets the back-transformation work whether predict returns a
# 1-D vector or a single-column matrix, as is done for the other models
pred_ridge = pd.Series(boxcox.inverse_transform(np.asarray(pred_ridge).reshape(-1, 1)).reshape(-1),
                       index=testv.index)
pred_ridge

In [None]:
# Absolute prediction errors of the Ridge model
abs_err_ridge = abs(testv['SalePrice'] - pred_ridge)

# MAE, MSE and RMSE on the original SalePrice scale
mae_ridge = mean_absolute_error(testv['SalePrice'], pred_ridge)
mse_ridge = mean_squared_error(testv['SalePrice'], pred_ridge)
# RMSE must come from the Ridge MSE (it was previously computed from mse_bwd)
rmse_ridge = np.sqrt(mse_ridge)

# Append the Ridge row to the models comparison table
models_comp = pd.concat([models_comp,
                         pd.DataFrame({'Mean of AbsErrors(MAE)': abs_err_ridge.mean(),
                                       'MSE': mse_ridge,
                                       'RMSE': rmse_ridge,
                                       'Median of AbsErrors': abs_err_ridge.median(),
                                       'SD of AbsErrors': abs_err_ridge.std(),
                                       'IQR of AbsErrors': iqr(abs_err_ridge),
                                       'Min of AbsErrors': abs_err_ridge.min(),
                                       'Max of AbsErrors': abs_err_ridge.max()},
                                      index=['LM_Ridge'])])
models_comp


In [None]:
# Actual vs. Prediction
plt.scatter(x=testv['SalePrice'], y=pred_ridge,
            c='black', alpha=0.3)
plt.xlabel('Actual')
plt.ylabel('Prediction')
plt.title('Actual vs. Prediction- Ridge Regression')

# Add 45 degree line. It spans the joint range of actual and predicted values,
# so it is not cut short when the two cover different ranges.
xp = np.linspace(min(testv['SalePrice'].min(), pred_ridge.min()),
                 max(testv['SalePrice'].max(), pred_ridge.max()), 100)
plt.plot(xp, xp, c='red', linewidth=3)
plt.show()

## LASSO Regression

In [None]:
#Implement LASSO regression- model 5
from sklearn.linear_model import Lasso
lasso_reg=Lasso(alpha = 0.1) #lambda = 0.1 (regularization hyperparameter)
model_5=lasso_reg.fit(X_trainv_scaled, trans_y_trainv)

In [None]:
model_5.coef_

In [None]:
# K-fold cross validation to choose the best model
# Mean 5-fold CV score (negative RMSE, so larger is better) for every lambda in the grid
cv_errors = np.zeros(shape=len(lambda_grid))

for i, lam in enumerate(lambda_grid):
    lasso_reg = Lasso(alpha=lam)
    scores = cross_val_score(estimator=lasso_reg,
                             X=X_trainv_scaled,
                             y=trans_y_trainv,
                             scoring='neg_root_mean_squared_error',
                             cv=5, n_jobs=-1)
    cv_errors[i] = scores.mean()

cv_errors

In [None]:
# Best lambda
# cv_errors holds negative RMSE scores, so the best lambda sits at the maximum
best_lamda=lambda_grid[np.argmax(cv_errors)]
best_lamda

In [None]:
# Best model coefs:
lasso_reg=Lasso(alpha=best_lamda)
model_5=lasso_reg.fit(X_trainv_scaled,trans_y_trainv)
model_5.coef_

## Evaluation on the Validation Set (LASSO Regression)

In [None]:
#Predict on test- model 5
pred_lasso=model_5.predict(X_testv_scaled)
pred_lasso=pd.Series(boxcox.inverse_transform(pred_lasso.reshape(-1, 1)).reshape(-1),index = testv.index)
pred_lasso

In [None]:
# Absolute prediction errors of the LASSO model
abs_err_lasso = (testv['SalePrice'] - pred_lasso).abs()

# MAE, MSE and RMSE on the original SalePrice scale
mae_lasso = mean_absolute_error(testv['SalePrice'], pred_lasso)
mse_lasso = mean_squared_error(testv['SalePrice'], pred_lasso)
rmse_lasso = np.sqrt(mse_lasso)

# Append the LASSO row to the models comparison table
lasso_row = pd.DataFrame({'Mean of AbsErrors(MAE)': abs_err_lasso.mean(),
                          'MSE': mse_lasso,
                          'RMSE': rmse_lasso,
                          'Median of AbsErrors': abs_err_lasso.median(),
                          'SD of AbsErrors': abs_err_lasso.std(),
                          'IQR of AbsErrors': iqr(abs_err_lasso),
                          'Min of AbsErrors': abs_err_lasso.min(),
                          'Max of AbsErrors': abs_err_lasso.max()},
                         index=['LM_LASSO'])
models_comp = pd.concat([models_comp, lasso_row])
models_comp

In [None]:
# Actual vs. Prediction
plt.scatter(x=testv['SalePrice'], y=pred_lasso,
            c='black', alpha=0.3)
plt.xlabel('Actual')
plt.ylabel('Prediction')
plt.title('Actual vs. Prediction- LASSO Regression')

# Add 45 degree line. It spans the joint range of actual and predicted values,
# so it is not cut short when the two cover different ranges.
xp = np.linspace(min(testv['SalePrice'].min(), pred_lasso.min()),
                 max(testv['SalePrice'].max(), pred_lasso.max()), 100)
plt.plot(xp, xp, c='red', linewidth=3)
plt.show()

# Decision Tree

In [None]:
# Decision Tree- model 6
from sklearn.tree import DecisionTreeRegressor
# Post-pruning approach: first fit a tree without cost-complexity pruning (ccp_alpha=0),
# limited to depth 5 and at least 5 samples per leaf
tree_reg=DecisionTreeRegressor(criterion='squared_error',
                                 max_depth=5, 
                                 min_samples_leaf=5, 
                                 ccp_alpha=0)
model_6=tree_reg.fit(X_trainv,trans_y_trainv)

In [None]:
# Plot the Tree
from sklearn.tree import plot_tree
 
# Convert the feature names from Index to list
feature_names_list=X_trainv.columns.tolist()

# Plot the Tree
plt.figure(figsize=(25,20))
plot_tree(model_6,feature_names=feature_names_list)
plt.show()

In [None]:
# Post prunning
prunning_path=model_6.cost_complexity_pruning_path(X_trainv,trans_y_trainv)
prunning_path

In [None]:
# Grid
alpha_grid=prunning_path['ccp_alphas']
alpha_grid

In [None]:
# K-fold cross validation to choose the best model
from sklearn.model_selection import cross_val_score
import time #to measure the processing time

# Mean 10-fold CV score (negative RMSE) for every candidate ccp_alpha
cv_errors = np.zeros(shape=len(alpha_grid))

start_time = time.time()
for i, ccp in enumerate(alpha_grid):
    tree_reg = DecisionTreeRegressor(criterion='squared_error',
                                     max_depth=5,
                                     min_samples_leaf=5,
                                     ccp_alpha=ccp)
    scores = cross_val_score(estimator=tree_reg,
                             X=X_trainv,
                             y=trans_y_trainv,
                             scoring='neg_root_mean_squared_error',
                             cv=10, n_jobs=-1)
    cv_errors[i] = scores.mean()
end_time = time.time()

print('The Processing time is: ',end_time-start_time, 'seconds')
cv_errors

In [None]:
# Best alpha
# cv_errors holds negative RMSE scores, so the best ccp_alpha sits at the maximum
best_alpha=alpha_grid[np.argmax(cv_errors)]
best_alpha

In [None]:
#Best model
# Refit the tree with the cross-validated ccp_alpha and draw it
from sklearn.tree import plot_tree

tree_params = dict(criterion='squared_error',
                   max_depth=5,
                   min_samples_leaf=5,
                   ccp_alpha=best_alpha)
tree_reg = DecisionTreeRegressor(**tree_params)
model_6 = tree_reg.fit(X_trainv, trans_y_trainv)

# plot_tree is given the feature names as a plain list
feature_names_list = list(X_trainv.columns)

#Plot the Tree
plt.figure(figsize=(25, 20))
plot_tree(model_6, feature_names=feature_names_list)
plt.show()


In [None]:
X_testv.shape

In [None]:
print(dummy_vars_tev.shape)
print(testv.shape)

## Evaluation on the Validation Set (Decision Tree)

In [None]:
 #Define feature matrix
# All columns except 'SalePrice'
# NOTE(review): columns are selected by position (index 80 is skipped, presumably
# 'SalePrice') - verify against the current column layout of testv.
X_X=testv.iloc[:,list(range(0,80))+list(range(81,100))]
# print(X_X.columns)
# Append the aligned dummy variables next to the original columns
X_testv=pd.concat([X_X,dummy_vars_tev],axis = 1)
# Add the intercept column ('const'); has_constant='add' is passed explicitly
# X_testv=sm.add_constant(X_testv)
X_testv = sm.add_constant(X_testv,has_constant='add')
X_testv.shape
print(X_testv.columns)

In [None]:
# Remove column 'Id'
# X_testv.drop(columns=['Id'],inplace=True)
X_testv.shape

In [None]:
# Discard the remaining text (object dtype) columns of the validation matrix
object_cols = X_testv.select_dtypes(include=['object']).columns
X_testv = X_testv.drop(columns=object_cols)
# Check the data types and shape of the cleaned DataFrame
print(X_testv.dtypes)
print(X_testv.shape)

In [None]:
# Cast every float64 / int32 column to int64
cols_to_cast = X_testv.select_dtypes(include=['float64', 'int32']).columns
X_testv = X_testv.astype(dict.fromkeys(cols_to_cast, 'int64'))

# Verify the conversion
print(X_testv.dtypes)

In [None]:
trans_y_trainv=PowerTransformer(method = 'box-cox').fit_transform(y_trainv.values.reshape(-1, 1))

In [None]:
X_trainv.info()

In [None]:
X_trainv

In [None]:
X_testv

In [None]:
X_testv=X_testv.drop(columns=['Id'])

In [None]:
X_testv

In [None]:
#Prediction using model 6
pred_tree = model_6.predict(X_testv)
pred_tree = pd.Series(boxcox.inverse_transform(pred_tree.reshape(-1, 1)).reshape(-1), 
                      index = testv.index)
pred_tree

In [None]:
# Absolute prediction errors of the decision tree
abs_err_tree = (testv['SalePrice'] - pred_tree).abs()

# MAE, MSE and RMSE on the original SalePrice scale
mae_tree = mean_absolute_error(testv['SalePrice'], pred_tree)
mse_tree = mean_squared_error(testv['SalePrice'], pred_tree)
rmse_tree = np.sqrt(mse_tree)

# Append the decision tree row to the models comparison table
tree_row = pd.DataFrame({'Mean of AbsErrors(MAE)': abs_err_tree.mean(),
                         'MSE': mse_tree,
                         'RMSE': rmse_tree,
                         'Median of AbsErrors': abs_err_tree.median(),
                         'SD of AbsErrors': abs_err_tree.std(),
                         'IQR of AbsErrors': iqr(abs_err_tree),
                         'Min of AbsErrors': abs_err_tree.min(),
                         'Max of AbsErrors': abs_err_tree.max()},
                        index=['Decision Tree'])
models_comp = pd.concat([models_comp, tree_row])
models_comp

In [None]:
# Actual vs. Prediction
plt.scatter(x=testv['SalePrice'], y=pred_tree, c='black', alpha=0.3)
plt.xlabel('Actual')
plt.ylabel('Prediction')
plt.title('Actual vs. Prediction- Decision Tree')

# Add 45 degree line. It spans the joint range of actual and predicted values,
# so it is not cut short when the two cover different ranges.
xp = np.linspace(min(testv['SalePrice'].min(), pred_tree.min()),
                 max(testv['SalePrice'].max(), pred_tree.max()), 100)
plt.plot(xp, xp, c='red', linewidth=3)
plt.show()

# Random Forest

In [None]:
#Sample Random Forest model (not tuned)
from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor(n_estimators=300,
                               max_features=15, 
                               criterion='squared_error',
                               max_depth=15, 
                               min_samples_leaf=5, 
                               ccp_alpha=0,
                               random_state =1234)
model_7=rf_reg.fit(X_trainv,trans_y_trainv.reshape(-1))

In [None]:
#Calculate variable importance
# feature_importances_ of the fitted forest, expressed as percentages
importance = pd.DataFrame({'importance': model_7.feature_importances_ * 100},
                          index=X_trainv.columns)
importance_sorted = importance.sort_values(by='importance', ascending=False)
# DataFrame.plot opens its own figure, so the size is passed through figsize;
# a separate plt.figure() call would only leave an empty figure behind.
importance_sorted[:25].plot(kind='barh', color='r', legend=False, figsize=(30, 25))
plt.title('Variable Importance')
# The values are impurity-based importances, not permutation MSE increases
plt.xlabel('Impurity-based importance (%)')
plt.show()

In [None]:
#Create hyper-parameters grid
import itertools
n_estimators = [100, 200, 500, 700]
max_features = ['sqrt', 'log2']
max_depth = [2, 5, 10, 20]
min_samples_leaf = [5, 10, 15]
# Every combination of the four hyper-parameters, one per row
params_grid = list(itertools.product(n_estimators, max_features, max_depth, min_samples_leaf))
# Rows are numbered from 1; the index length follows the grid size, so the
# value lists above can be edited without touching a hard-coded row count.
params_grid = pd.DataFrame(data=params_grid,
                           index=range(1, len(params_grid) + 1),
                           columns=['n_estimators',
                                    'max_features',
                                    'max_depth',
                                    'min_samples_leaf'])
params_grid

In [None]:
#K-fold cross validation to choose the best model
from sklearn.model_selection import cross_val_score

import time #to measure the processing time
start_time = time.time()
# Mean 10-fold CV score (negative RMSE, so larger is better) per grid row
cv_errors = np.zeros(shape=len(params_grid))
for i in range(len(params_grid)):
    # Same seed as the final model, so the grid search is reproducible and the
    # candidates are compared under identical randomness
    rf_reg = RandomForestRegressor(n_estimators=params_grid.iloc[i, 0],
                                   max_features=params_grid.iloc[i, 1],
                                   criterion='squared_error',
                                   max_depth=params_grid.iloc[i, 2],
                                   min_samples_leaf=params_grid.iloc[i, 3],
                                   ccp_alpha=0,
                                   random_state=1234)
    # The response is flattened to 1-D, exactly as in the model_7 fits
    scores = cross_val_score(estimator=rf_reg,
                             X=X_trainv,
                             y=trans_y_trainv.reshape(-1),
                             scoring='neg_root_mean_squared_error',
                             cv=10, n_jobs=-1)
    cv_errors[i] = scores.mean()

end_time = time.time()
print('The Processing time is: ',end_time-start_time,'seconds')

cv_errors

In [None]:
#Best model
params_grid.iloc[np.argmax(cv_errors),:]

In [None]:
#Train model 7
# Grid row with the best CV score (negative RMSE, so the maximum is best)
best_row = np.argmax(cv_errors)
rf_reg = RandomForestRegressor(
    n_estimators=params_grid.iloc[best_row, 0],
    max_features=params_grid.iloc[best_row, 1],
    criterion='squared_error',
    max_depth=params_grid.iloc[best_row, 2],
    min_samples_leaf=params_grid.iloc[best_row, 3],
    ccp_alpha=0,
    random_state=1234,
)
# The forest is fitted on the flattened (1-D) response
model_7 = rf_reg.fit(X_trainv, trans_y_trainv.reshape(-1))


## Evaluation on the Validation Set (Random Forest)

In [None]:
# Align the validation matrix with the columns (and column order) the models were
# trained on. X_trainv itself is left untouched: shrinking it after fitting would
# make it disagree with the already fitted models.
missing_columns = X_trainv.columns.difference(X_testv.columns)
if len(missing_columns) > 0:
    # Make silent zero-filling visible so unexpected gaps can be investigated
    print('Columns missing from X_testv, filled with 0:', list(missing_columns))
X_testv = X_testv.reindex(columns=X_trainv.columns, fill_value=0)
# Kept for later cells that may refer to it; now simply the training columns
common_columns = X_trainv.columns

In [None]:
#Prediction using model 7
pred_rf=model_7.predict(X_testv)
pred_rf=pd.Series(boxcox.inverse_transform(pred_rf.reshape(-1, 1)).reshape(-1), 
                    index=testv.index)
pred_rf

In [None]:
# Absolute prediction errors of the random forest
abs_err_rf = (testv['SalePrice'] - pred_rf).abs()

# MAE, MSE and RMSE on the original SalePrice scale
mae_rf = mean_absolute_error(testv['SalePrice'], pred_rf)
mse_rf = mean_squared_error(testv['SalePrice'], pred_rf)
rmse_rf = np.sqrt(mse_rf)

# Append the random forest row to the models comparison table
rf_row = pd.DataFrame({'Mean of AbsErrors(MAE)': abs_err_rf.mean(),
                       'MSE': mse_rf,
                       'RMSE': rmse_rf,
                       'Median of AbsErrors': abs_err_rf.median(),
                       'SD of AbsErrors': abs_err_rf.std(),
                       'IQR of AbsErrors': iqr(abs_err_rf),
                       'Min of AbsErrors': abs_err_rf.min(),
                       'Max of AbsErrors': abs_err_rf.max()},
                      index=['Random Forest'])
models_comp = pd.concat([models_comp, rf_row])
models_comp

In [None]:
# Actual vs. Prediction
plt.scatter(x=testv['SalePrice'], y=pred_rf, c='black', alpha=0.3)
plt.xlabel('Actual')
plt.ylabel('Prediction')
plt.title('Actual vs. Prediction- Random Forest')

# Add 45 degree line. It spans the joint range of actual and predicted values,
# so it is not cut short when the two cover different ranges.
xp = np.linspace(min(testv['SalePrice'].min(), pred_rf.min()),
                 max(testv['SalePrice'].max(), pred_rf.max()), 100)
plt.plot(xp, xp, c='red', linewidth=3)
plt.show()

# Principal Component Regression (PCR)

In [None]:
#Scale data
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X_trainv_scaled=scaler.fit_transform(X_trainv)

In [None]:
#PCA
from sklearn.decomposition import PCA
pca=PCA()
pca_model=pca.fit(X_trainv_scaled)

In [None]:
#Explained variance percentage
# Number the components from the fitted PCA itself, so the plot keeps working
# when the number of columns in X_trainv changes
component_ids = range(1, len(pca_model.explained_variance_ratio_) + 1)
plt.figure(figsize=(10, 6))
plt.bar(x=component_ids,
        height=pca_model.explained_variance_ratio_ * 100,
        color='red',
        alpha=0.7)
plt.title('Explained Variance')
plt.xlabel('Components')
plt.ylabel('Explained Variance Percentage')
plt.xticks(ticks=component_ids, labels=['PC' + str(_) for _ in component_ids])
plt.grid()
plt.show()

In [None]:
#Cumulative explained variance percentage
# Round after accumulating, so per-component rounding errors do not add up
np.round(np.cumsum(pca_model.explained_variance_ratio_ * 100), 1)

In [None]:
#Principal axes in feature space
print(pca_model.components_.shape)
print(pca_model.components_)

In [None]:
# Transform data into new dimensions
X_trainv_pca=pca_model.transform(X_trainv_scaled)
X_trainv_pca.shape

In [None]:
#K-fold cross validation to choose the best model
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
import time #to measure the processing time

# cv_errors[i] = mean 5-fold CV score (negative RMSE) using the first i + 1 components
total_components = X_trainv_pca.shape[1]
cv_errors = np.zeros(shape=total_components)

start_time = time.time()

for i in range(total_components):
    linear_reg = LinearRegression()
    leading_components = X_trainv_pca[:, : i + 1]
    scores = cross_val_score(estimator=linear_reg,
                             X=leading_components,
                             y=trans_y_trainv,
                             scoring='neg_root_mean_squared_error',
                             cv=5, n_jobs=-1)
    cv_errors[i] = scores.mean()

end_time = time.time()
print('The Processing time is: ',end_time-start_time,'seconds')
cv_errors

In [None]:
#Best number of components
# cv_errors[i] is the score of the model built on the first i + 1 components,
# so the best count is the arg-max position plus one (the scores are negative
# RMSE, hence the maximum is the best).
components_number = int(np.argmax(cv_errors)) + 1
components_number

In [None]:
#Correlation btw components
np.round(pd.DataFrame(X_trainv_pca[:, : components_number]).corr(), 3)

In [None]:
#Best model
from sklearn.linear_model import LinearRegression
linear_reg=LinearRegression()
model_8=linear_reg.fit(X_trainv_pca[:, : components_number], 
                         trans_y_trainv)

In [None]:
#Predict on test - model 8

#Scale the validation features with the scaler fitted on the training set
X_testv_scaled=scaler.transform(X_testv)

#Transform data into new dimensions (principal components of the fitted PCA)
X_testv_pca=pca_model.transform(X_testv_scaled)

#Prediction on test, using the first components_number components only;
#the result is back-transformed with `boxcox` and indexed like testv
pred_pcr=model_8.predict(X_testv_pca[:, : components_number])
pred_pcr=pd.Series(boxcox.inverse_transform(pred_pcr.reshape(-1, 1)).reshape(-1), 
                    index = testv.index)
pred_pcr

In [None]:
pred_pcr.shape


## Evaluation on the Validation Set (PCR)

In [None]:
# Absolute error of each prediction
abs_err_pcr = (testv['SalePrice'] - pred_pcr).abs()

# MAE, MSE and RMSE
mae_pcr = mean_absolute_error(testv['SalePrice'], pred_pcr)
mse_pcr = mean_squared_error(testv['SalePrice'], pred_pcr)
rmse_pcr = np.sqrt(mse_pcr)

# Summary of the absolute errors (mean, median, sd, IQR, min, max) plus MSE/RMSE,
# appended to the model comparison table as the 'PCR' row
pcr_summary = {
    'Mean of AbsErrors(MAE)': abs_err_pcr.mean(),
    'MSE': mse_pcr,
    'RMSE': rmse_pcr,
    'Median of AbsErrors': abs_err_pcr.median(),
    'SD of AbsErrors': abs_err_pcr.std(),
    'IQR of AbsErrors': iqr(abs_err_pcr),
    'Min of AbsErrors': abs_err_pcr.min(),
    'Max of AbsErrors': abs_err_pcr.max(),
}
models_comp = pd.concat([models_comp, pd.DataFrame(pcr_summary, index=['PCR'])])
models_comp

In [None]:
# Actual vs. Prediction
plt.scatter(x=testv['SalePrice'], y=pred_pcr, c='black', alpha=0.3)
plt.xlabel('Actual')
plt.ylabel('Prediction')
plt.title('Actual vs. Prediction- Principal Component Regression')

# Add 45 degree line.
# The line must span both axes: start at the smaller of the two minima and
# end at the larger of the two maxima, otherwise points with extreme actual
# or predicted prices fall outside the reference line.
line_start = min(testv['SalePrice'].min(), pred_pcr.min())
line_end = max(testv['SalePrice'].max(), pred_pcr.max())
xp = np.linspace(line_start, line_end, 100)
plt.plot(xp, xp, c='red', linewidth=3)
plt.show()

# Model Evaluation

# Test Data Understanding - Phase 1
### 2: Test Data Set Description

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns

In [None]:
# Test data
# Read the test data from 'test.csv' (path relative to the working directory)
test=pd.read_csv('test.csv')
# test.info()
# Display the first 10 rows
test.head(10)

In [None]:
test['Id'].nunique()

In [None]:
np.sum(test.duplicated())

### 3: Check for Missing Values

In [None]:
# Step 1: Determine the type of MVs
# Know the cause
# Number of missing values per column. DataFrame.sum() is used instead of
# np.sum(DataFrame), whose column-wise behaviour is deprecated in pandas.
test.isnull().sum()

In [None]:
# Step 2: Determine the extent of MVs
# Summary of MVs in each column: absolute count and percentage of rows.
# DataFrame.sum() is used instead of np.sum(DataFrame): the column-wise
# behaviour of the numpy call is deprecated in pandas, and a scalar result
# would break the DataFrame construction below.
mvs_summary = pd.DataFrame({'freq': test.isnull().sum()})
mvs_summary['pct'] = (mvs_summary['freq'] / test.shape[0] * 100).round(1)
mvs_summary.sort_values(by='pct', ascending=False)

In [None]:
# Summary of MVs for each case (row)
# The feature columns are selected by name ('Id' and the two helper columns
# excluded), so the percentage denominator is the real number of features no
# matter whether 'mvs' / 'mvs_pct' already exist when this cell is (re-)run.
# The previous `test.shape[1] - 3` was one too small on the first run because
# 'mvs_pct' had not been created yet.
mv_feature_cols = test.columns.difference(['Id', 'mvs', 'mvs_pct'])
test['mvs'] = test[mv_feature_cols].isnull().sum(axis=1)
test['mvs_pct'] = (test['mvs'] / len(mv_feature_cols) * 100).round(1)
test.sort_values(by='mvs', ascending=False)

#Decision: Modify elements that are known as null but still contain information (Descriptive Statistics)

In [None]:
 # Modifying 'MasVnrType'
test.loc[test['MasVnrArea']==0].shape[0]

In [None]:
# Rows with a masonry veneer area of 0 get the explicit category 'none'.
# NOTE(review): confirm 'none' is the same label used when the training data was prepared.
test.loc[test['MasVnrArea'] == 0,'MasVnrType'] ='none'
# Number of 'MasVnrType' values that are still missing
np.sum(test['MasVnrType'].isnull())

In [None]:
np.sum(test['MasVnrArea'].isnull())

In [None]:
test.loc[test['MasVnrArea'].isnull()]

In [None]:
test.loc[test['MasVnrType'].isnull()]

In [None]:
# test[['MasVnrArea','MasVnrType']]

In [None]:
# Modifying 'FireplaceQu': rows with zero fireplaces get the category 'NA'
no_fireplace = test['Fireplaces'] == 0
test.loc[no_fireplace, 'FireplaceQu'] = 'NA'

In [None]:
np.sum(test['Fireplaces']==0)

In [None]:
np.sum(test['FireplaceQu']=='NA')

In [None]:
np.sum(test['FireplaceQu'].isnull())

In [None]:
# Modifying 'PoolQC': rows with a pool area of 0 get the category 'NA'
no_pool = test['PoolArea'] == 0
test.loc[no_pool, 'PoolQC'] = 'NA'

In [None]:
np.sum(test['PoolArea']==0)

In [None]:
np.sum(test['PoolQC']=='NA')

In [None]:
np.sum(test['PoolQC'].isnull())

In [None]:
# test[['PoolArea','PoolQC']]

In [None]:
# A row with a non-zero pool area but PoolQC == 'NA' is set back to missing
pool_without_quality = (test['PoolArea'] != 0) & (test['PoolQC'] == 'NA')
test.loc[pool_without_quality, 'PoolQC'] = np.nan

In [None]:
test.loc[test['PoolArea']!= 0]

In [None]:
# data.loc[1298, 'PoolQC']=np.nan

In [None]:
np.sum(test['PoolQC'].isnull())

In [None]:
test.loc[test['PoolQC'].isnull()]

In [None]:
# Modifying'Fence'
np.sum(test['Fence'].isnull())

In [None]:
# Missing 'Fence' values become the explicit category 'NA'
test['Fence'] = test['Fence'].fillna('NA')

In [None]:
np.sum(test['Fence'].isnull())

In [None]:
# test['Fence']

In [None]:
# Modifying'MiscFeature'
np.sum(test['MiscFeature'].isnull())

In [None]:
# Missing 'MiscFeature' values become the explicit category 'NA'
test['MiscFeature'] = test['MiscFeature'].fillna('NA')

In [None]:
np.sum(test['MiscFeature'].isnull())

In [None]:
# test['MiscFeature']

In [None]:
# Modifying 'GarageType','GarageCars','GarageYrBlt','GarageFinish','GarageQual'and'GarageCond'
np.sum(test['GarageArea']==0)

In [None]:
# Rows with a garage area of 0: categorical garage attributes -> 'NA',
# numeric garage attributes -> 0
no_garage = test['GarageArea'] == 0
garage_categorical = ['GarageType','GarageFinish','GarageQual','GarageCond']
garage_numeric = ['GarageYrBlt','GarageCars']
test.loc[no_garage, garage_categorical] = 'NA'
test.loc[no_garage, garage_numeric] = 0
# test[['GarageArea','GarageType','GarageYrBlt','GarageFinish','GarageQual','GarageCond','GarageCars']]

In [None]:
np.sum(test['GarageType'].isnull())

In [None]:
np.sum(test['GarageYrBlt'].isnull())

In [None]:
test.loc[test['GarageYrBlt'].isnull()]

In [None]:
np.sum(test['GarageFinish'].isnull())

In [None]:
test.loc[test['GarageFinish'].isnull()]

In [None]:
np.sum(test['GarageQual'].isnull())

In [None]:
test.loc[test['GarageQual'].isnull()]

In [None]:
np.sum(test['GarageCond'].isnull())

In [None]:
test.loc[test['GarageCond'].isnull()]

In [None]:
np.sum(test['GarageCars'].isnull())

In [None]:
test.loc[test['GarageCars'].isnull()]

In [None]:
# Check Missing Value in 'SaleType'
np.sum(test['SaleType'].isnull())

In [None]:
test.loc[test['SaleType'].isnull()]

In [None]:
# Modifying 'LotFrontage'
np.sum(test['LotFrontage'].isnull())

In [None]:
test.loc[test['LotFrontage'].isnull()]

In [None]:
# Missing 'LotFrontage' values are replaced by 0
test['LotFrontage'] = test['LotFrontage'].fillna(0)
# test['LotFrontage']

In [None]:
np.sum(test['LotFrontage'].isnull())

In [None]:
np.sum(test['LotFrontage']==0)

In [None]:
# Modifying 'Alley'
np.sum(test['Alley'].isnull())

In [None]:
# Missing 'Alley' values become the explicit category 'NA'
test['Alley'] = test['Alley'].fillna('NA')
# test['Alley']

In [None]:
np.sum(test['Alley'].isnull())

In [None]:
np.sum(test['Alley']=='NA')

In [None]:
# Modifying'BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1',
#'BsmtFinType2','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','BsmtFullBath','BsmtHalfBath'
np.sum(test['TotalBsmtSF']==0)

In [None]:
# Rows with TotalBsmtSF == 0: categorical basement attributes -> 'NA',
# numeric basement attributes -> 0 (same convention as the garage columns).
# The string 'NA' must not be written into the numeric columns: it turns them
# into object columns, and the later numeric coercion + mean imputation would
# then give these rows the average basement size instead of 0.
# 'TotalBsmtSF' itself is already 0 for the selected rows.
no_basement = test['TotalBsmtSF'] == 0
bsmt_categorical = ['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2']
bsmt_numeric = ['BsmtFinSF2','BsmtUnfSF','BsmtFullBath','BsmtHalfBath']
test.loc[no_basement, bsmt_categorical] = 'NA'
test.loc[no_basement, bsmt_numeric] = 0
# test[['TotalBsmtSF','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','BsmtFullBath','BsmtHalfBath']]

In [None]:
np.sum(test['BsmtCond'].isnull())

In [None]:
np.sum(test['BsmtExposure'].isnull())

In [None]:
test.loc[test['BsmtExposure'].isnull()]

In [None]:
np.sum(test['BsmtQual'].isnull())

In [None]:
np.sum(test['BsmtFinType1'].isnull())

In [None]:
np.sum(test['BsmtFinType2'].isnull())

In [None]:
test.loc[test['BsmtFinType2'].isnull()]

In [None]:
np.sum(test['BsmtFinSF1'].isnull())

In [None]:
test.loc[test['BsmtFinSF1'].isnull()]

In [None]:
np.sum(test['BsmtFinSF2'].isnull())

In [None]:
test.loc[test['BsmtFinSF2'].isnull()]

In [None]:
np.sum(test['BsmtUnfSF'].isnull())

In [None]:
test.loc[test['BsmtUnfSF'].isnull()]

In [None]:
np.sum(test['TotalBsmtSF'].isnull())

In [None]:
test.loc[test['TotalBsmtSF'].isnull()]

In [None]:
np.sum(test['BsmtFullBath'].isnull())

In [None]:
# Rows whose 'BsmtFullBath' is missing (matches the count in the previous cell;
# 'BsmtHalfBath' is inspected separately in the cells that follow)
test.loc[test['BsmtFullBath'].isnull()]

In [None]:
np.sum(test['BsmtHalfBath'].isnull())

In [None]:
test.loc[test['BsmtHalfBath'].isnull()]

In [None]:
# Check Missing Value in 'Utilities'
test.loc[test['Utilities'].isnull()]

In [None]:
# Check Missing Value in 'MSZoning'
test.loc[test['MSZoning'].isnull()]

In [None]:
# Check Missing Value in 'Exterior1st'
test.loc[test['Exterior1st'].isnull()]

In [None]:
# Check Missing Value in 'Exterior2nd'
test.loc[test['Exterior2nd'].isnull()]

In [None]:
# Step 2: Determine the extent of MVs after modifying
# Summary of MVs in each column after modifying: count and percentage of rows.
# DataFrame.sum() is used instead of np.sum(DataFrame): the column-wise
# behaviour of the numpy call is deprecated in pandas, and a scalar result
# would break the DataFrame construction below.
mvs_summary = pd.DataFrame({'freq': test.isnull().sum()})
mvs_summary['pct'] = (mvs_summary['freq'] / test.shape[0] * 100).round(1)
mvs_summary.sort_values(by='pct', ascending=False)

In [None]:
# Summary of MVs for each case (row) after modifying
# The feature columns are selected by name ('Id' and the two helper columns
# excluded), so the percentage denominator does not depend on how many helper
# columns the frame holds when this cell runs.
mv_feature_cols = test.columns.difference(['Id', 'mvs', 'mvs_pct'])
test['mvs'] = test[mv_feature_cols].isnull().sum(axis=1)
test['mvs_pct'] = (test['mvs'] / len(mv_feature_cols) * 100).round(1)
test.sort_values(by='mvs', ascending=False)


In [None]:
test.info()

In [None]:
test['mvs_pct'].max()

In [None]:
# Step 3: Diagnose the randomness of the missing-value process
# The percentage of missing values is low, both per column and per record, and the data set is large.
# It is therefore acceptable to use a simple imputation strategy without first testing in detail
# whether the values are missing at random (MAR).
# Such an imputation is not expected to change the overall results noticeably.

In [None]:
# Step 4: Select the imputation method
# Missing values per column. DataFrame.sum() is used instead of
# np.sum(DataFrame), whose column-wise behaviour is deprecated in pandas.
test.isnull().sum()

In [None]:
print(test[['LotFrontage','MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 
                     'BsmtFullBath', 'BsmtHalfBath', 'Fireplaces', 'GarageCars', 'GarageArea']].dtypes)


In [None]:
# Method: mean substitution
# Substitute missing values in numeric columns with the mean of each column.
float_column=['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
              'BsmtFullBath', 'BsmtHalfBath','GarageCars', 'GarageArea','GarageYrBlt']

for col in float_column:
    # Force the column to numeric (non-numeric entries become NaN), then
    # fill every NaN with the mean of the remaining values of that column.
    numeric_values = pd.to_numeric(test[col], errors='coerce')
    test[col] = numeric_values.fillna(numeric_values.mean())

# Remaining missing values per column. DataFrame.sum() is used instead of
# np.sum(DataFrame), whose column-wise behaviour is deprecated in pandas.
test.isnull().sum()

In [None]:
# Convert the float columns to int64 (fractional parts are truncated by the cast)
int_columns = ['LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
               'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageCars',
               'GarageArea', 'GarageYrBlt']
for int_col in int_columns:
    test[int_col] = test[int_col].astype('int64')
test.info()

In [None]:
# Method: mode substitution
# Substitute missing values in categorical columns with the mode of each column.
categorical_columns=['MSZoning','Utilities','Exterior1st','Exterior2nd','MasVnrType','BsmtQual','BsmtCond','KitchenQual',
                       'BsmtExposure','BsmtFinType1','BsmtFinType2','GarageQual','PoolQC', 'SaleType', 'GarageCond','Functional','GarageFinish']

for cat_col in categorical_columns:
    # mode() returns the most frequent values sorted; take the first one
    mode_value = test[cat_col].mode()[0]
    test[cat_col] = test[cat_col].fillna(mode_value)

print(test.shape)
# Remaining missing values per column. DataFrame.sum() is used instead of
# np.sum(DataFrame), whose column-wise behaviour is deprecated in pandas.
test.isnull().sum()

In [None]:
# Remove the helper columns added for the missing-value analysis
helper_columns = ['mvs', 'mvs_pct']
test.drop(helper_columns, axis=1, inplace=True)
test.info()

### Convert ordinal variables into numeric (Label Encoding)

In [None]:
# Convert 'LotShape' into a numeric variable.
# An explicit map() is used instead of replace(): it yields a numeric column
# without relying on replace()'s deprecated silent downcast, and a label that
# is not in the mapping becomes NaN, so the null check below can detect it.
test['cnvrt_LotShape'] = test['LotShape'].map({'Reg': 4, 'IR1': 3, 'IR2': 2, 'IR3': 1})
# Check the results
print(np.sum(test['cnvrt_LotShape'].isnull()))
test['cnvrt_LotShape'].describe()
# test['cnvrt_LotShape']

In [None]:
# Convert 'LandSlope' into a numeric variable.
# An explicit map() is used instead of replace(): it yields a numeric column
# without relying on replace()'s deprecated silent downcast, and a label that
# is not in the mapping becomes NaN, so the null check below can detect it.
test['cnvrt_LandSlope'] = test['LandSlope'].map({'Gtl': 3, 'Mod': 2, 'Sev': 1})
# Check the results
print(np.sum(test['cnvrt_LandSlope'].isnull()))
test['cnvrt_LandSlope'].describe()
# test['cnvrt_LandSlope']

In [None]:
# Convert 'ExterQual' into a numeric variable.
# An explicit map() is used instead of replace(): it yields a numeric column
# without relying on replace()'s deprecated silent downcast, and a label that
# is not in the mapping becomes NaN, so the null check below can detect it.
test['cnvrt_ExterQual'] = test['ExterQual'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1})
# Check the results
print(np.sum(test['cnvrt_ExterQual'].isnull()))
test['cnvrt_ExterQual'].describe()
# test['cnvrt_ExterQual']

In [None]:
# Convert 'ExterCond' into a numeric variable.
# An explicit map() is used instead of replace(): it yields a numeric column
# without relying on replace()'s deprecated silent downcast, and a label that
# is not in the mapping becomes NaN, so the null check below can detect it.
test['cnvrt_ExterCond'] = test['ExterCond'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1})
# Check the results
print(np.sum(test['cnvrt_ExterCond'].isnull()))
test['cnvrt_ExterCond'].describe()
# test['cnvrt_ExterCond']

In [None]:
# Convert 'BsmtQual' into a numeric variable ('NA' = no basement is the lowest level).
# An explicit map() is used instead of replace(): it yields a numeric column
# without relying on replace()'s deprecated silent downcast, and a label that
# is not in the mapping becomes NaN, so the null check below can detect it.
test['cnvrt_BsmtQual'] = test['BsmtQual'].map(
    {'Ex': 6, 'Gd': 5, 'TA': 4, 'Fa': 3, 'Po': 2, 'NA': 1})
# Check the results
print(np.sum(test['cnvrt_BsmtQual'].isnull()))
test['cnvrt_BsmtQual'].describe()
# test['cnvrt_BsmtQual']

In [None]:
# Convert 'BsmtCond' into a numeric variable ('NA' is the lowest level).
# An explicit map() is used instead of replace(): it yields a numeric column
# without relying on replace()'s deprecated silent downcast, and a label that
# is not in the mapping becomes NaN, so the null check below can detect it.
test['cnvrt_BsmtCond'] = test['BsmtCond'].map(
    {'Ex': 6, 'Gd': 5, 'TA': 4, 'Fa': 3, 'Po': 2, 'NA': 1})
# Check the results
print(np.sum(test['cnvrt_BsmtCond'].isnull()))
test['cnvrt_BsmtCond'].describe()
# test['cnvrt_BsmtCond']

In [None]:
# Convert 'BsmtExposure' into a numeric variable ('NA' is the lowest level).
# An explicit map() is used instead of replace(): it yields a numeric column
# without relying on replace()'s deprecated silent downcast, and a label that
# is not in the mapping becomes NaN, so the null check below can detect it.
test['cnvrt_BsmtExposure'] = test['BsmtExposure'].map(
    {'Gd': 5, 'Av': 4, 'Mn': 3, 'No': 2, 'NA': 1})
# Check the results
print(np.sum(test['cnvrt_BsmtExposure'].isnull()))
test['cnvrt_BsmtExposure'].describe()
# test['cnvrt_BsmtExposure']

In [None]:
# Convert 'BsmtFinType1' into a numeric variable ('NA' is the lowest level).
# An explicit map() is used instead of replace(): it yields a numeric column
# without relying on replace()'s deprecated silent downcast, and a label that
# is not in the mapping becomes NaN, so the null check below can detect it.
test['cnvrt_BsmtFinType1'] = test['BsmtFinType1'].map(
    {'GLQ': 7, 'ALQ': 6, 'BLQ': 5, 'Rec': 4, 'LwQ': 3, 'Unf': 2, 'NA': 1})
# Check the results
print(np.sum(test['cnvrt_BsmtFinType1'].isnull()))
test['cnvrt_BsmtFinType1'].describe()
# test['cnvrt_BsmtFinType1']

In [None]:
# Convert 'BsmtFinType2' into a numeric variable ('NA' is the lowest level).
# An explicit map() is used instead of replace(): it yields a numeric column
# without relying on replace()'s deprecated silent downcast, and a label that
# is not in the mapping becomes NaN, so the null check below can detect it.
test['cnvrt_BsmtFinType2'] = test['BsmtFinType2'].map(
    {'GLQ': 7, 'ALQ': 6, 'BLQ': 5, 'Rec': 4, 'LwQ': 3, 'Unf': 2, 'NA': 1})
# Check the results
print(np.sum(test['cnvrt_BsmtFinType2'].isnull()))
test['cnvrt_BsmtFinType2'].describe()
# test['cnvrt_BsmtFinType2']

In [None]:
# Convert 'HeatingQC' into a numeric variable.
# An explicit map() is used instead of replace(): it yields a numeric column
# without relying on replace()'s deprecated silent downcast, and a label that
# is not in the mapping becomes NaN, so the null check below can detect it.
test['cnvrt_HeatingQC'] = test['HeatingQC'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1})
# Check the results
print(np.sum(test['cnvrt_HeatingQC'].isnull()))
test['cnvrt_HeatingQC'].describe()
# test['cnvrt_HeatingQC']

In [None]:
# Convert 'KitchenQual' into a numeric variable.
# An explicit map() is used instead of replace(): it yields a numeric column
# without relying on replace()'s deprecated silent downcast, and a label that
# is not in the mapping becomes NaN, so the null check below can detect it.
test['cnvrt_KitchenQual'] = test['KitchenQual'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1})
# Check the results
print(np.sum(test['cnvrt_KitchenQual'].isnull()))
test['cnvrt_KitchenQual'].describe()
# test['cnvrt_KitchenQual']

In [None]:
# Convert 'Functional' into a numeric variable.
# An explicit map() is used instead of replace(): it yields a numeric column
# without relying on replace()'s deprecated silent downcast, and a label that
# is not in the mapping becomes NaN, so the null check below can detect it.
test['cnvrt_Functional'] = test['Functional'].map(
    {'Typ': 8, 'Min1': 7, 'Min2': 6, 'Mod': 5, 'Maj1': 4, 'Maj2': 3, 'Sev': 2, 'Sal': 1})

# Check the results
print(np.sum(test['cnvrt_Functional'].isnull()))
test['cnvrt_Functional'].describe()
# test['cnvrt_Functional']

In [None]:
# Convert 'FireplaceQu' into a numeric variable ('NA' is the lowest level).
# An explicit map() is used instead of replace(): it yields a numeric column
# without relying on replace()'s deprecated silent downcast, and a label that
# is not in the mapping becomes NaN, so the null check below can detect it.
test['cnvrt_FireplaceQu'] = test['FireplaceQu'].map(
    {'Ex': 6, 'Gd': 5, 'TA': 4, 'Fa': 3, 'Po': 2, 'NA': 1})

# Check the results
print(np.sum(test['cnvrt_FireplaceQu'].isnull()))
test['cnvrt_FireplaceQu'].describe()
# test['cnvrt_FireplaceQu']

In [None]:
# Convert 'GarageFinish' into a numeric variable ('NA' is the lowest level).
# An explicit map() is used instead of replace(): it yields a numeric column
# without relying on replace()'s deprecated silent downcast, and a label that
# is not in the mapping becomes NaN, so the null check below can detect it.
test['cnvrt_GarageFinish'] = test['GarageFinish'].map({'Fin': 4, 'RFn': 3, 'Unf': 2, 'NA': 1})

# Check the results
print(np.sum(test['cnvrt_GarageFinish'].isnull()))
test['cnvrt_GarageFinish'].describe()
# test['cnvrt_GarageFinish']

In [None]:
# Convert 'GarageQual' into a numeric variable ('NA' is the lowest level).
# An explicit map() is used instead of replace(): it yields a numeric column
# without relying on replace()'s deprecated silent downcast, and a label that
# is not in the mapping becomes NaN, so the null check below can detect it.
test['cnvrt_GarageQual'] = test['GarageQual'].map(
    {'Ex': 6, 'Gd': 5, 'TA': 4, 'Fa': 3, 'Po': 2, 'NA': 1})

# Check the results
print(np.sum(test['cnvrt_GarageQual'].isnull()))
test['cnvrt_GarageQual'].describe()
# test['cnvrt_GarageQual']

In [None]:
# Convert 'GarageCond' into a numeric variable ('NA' is the lowest level).
# An explicit map() is used instead of replace(): it yields a numeric column
# without relying on replace()'s deprecated silent downcast, and a label that
# is not in the mapping becomes NaN, so the null check below can detect it.
test['cnvrt_GarageCond'] = test['GarageCond'].map(
    {'Ex': 6, 'Gd': 5, 'TA': 4, 'Fa': 3, 'Po': 2, 'NA': 1})

# Check the results
print(np.sum(test['cnvrt_GarageCond'].isnull()))
test['cnvrt_GarageCond'].describe()
# test['cnvrt_GarageCond']

In [None]:
# Convert 'PavedDrive' into a numeric variable.
# An explicit map() is used instead of replace(): it yields a numeric column
# without relying on replace()'s deprecated silent downcast, and a label that
# is not in the mapping becomes NaN, so the null check below can detect it.
test['cnvrt_PavedDrive'] = test['PavedDrive'].map({'Y': 3, 'P': 2, 'N': 1})

# Check the results
print(np.sum(test['cnvrt_PavedDrive'].isnull()))
test['cnvrt_PavedDrive'].describe()
# test['cnvrt_PavedDrive']

In [None]:
# Convert 'PoolQC' into a numeric variable ('NA' is the lowest level).
# An explicit map() is used instead of replace(): it yields a numeric column
# without relying on replace()'s deprecated silent downcast, and a label that
# is not in the mapping becomes NaN, so the null check below can detect it.
test['cnvrt_PoolQC'] = test['PoolQC'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'NA': 1})

# Check the results
print(np.sum(test['cnvrt_PoolQC'].isnull()))
test['cnvrt_PoolQC'].describe()
# test['cnvrt_PoolQC']

In [None]:
# Convert 'Fence' into a numeric variable ('NA' is the lowest level).
# An explicit map() is used instead of replace(): it yields a numeric column
# without relying on replace()'s deprecated silent downcast, and a label that
# is not in the mapping becomes NaN, so the null check below can detect it.
test['cnvrt_Fence'] = test['Fence'].map(
    {'GdPrv': 5, 'MnPrv': 4, 'GdWo': 3, 'MnWw': 2, 'NA': 1})

# Check the results
print(np.sum(test['cnvrt_Fence'].isnull()))
test['cnvrt_Fence'].describe()
# test['cnvrt_Fence']

In [None]:
# Missing values per column after encoding. DataFrame.sum() is used instead of
# np.sum(DataFrame), whose column-wise behaviour is deprecated in pandas.
test.isnull().sum()

In [None]:
test.info()

In [None]:
# 'MSSubClass' is a code, not a quantity: store it as string so it is dummy-encoded
test['MSSubClass'] = test['MSSubClass'].astype(str)

# Nominal (unordered) categorical variables to be dummy-encoded
nominal_columns = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley', 'LandContour',
    'Utilities', 'LotConfig', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'Heating', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'CentralAir',
    'Electrical', 'GarageType', 'MiscFeature', 'SaleType', 'SaleCondition',
]

# Create the dummy variables, then cast them so they contain only 0 and 1
dummy_vars_te = pd.get_dummies(test[nominal_columns])
dummy_vars_te = dummy_vars_te.astype(int)

print(dummy_vars_te.head(2))
dummy_vars_te.info()

In [None]:
##Read data from file
# data=pd.read_csv('train.csv')
# data.head(5)

In [None]:
## Train data
# train=data
# train.info()
# print(train.head(4))

In [None]:
# ##Split Train data into test and train for validation (named trainv & testv)
# from sklearn.model_selection import train_test_split
# trainv, testv = train_test_split(data, test_size = 0.3, random_state = 1234)
print(trainv.shape)
# print(testv.shape)

In [None]:
# #Create dummy variables for categorical variables
# dummy_vars_trv=pd.get_dummies(trainv[['MSSubClass','MSZoning','Street','Alley','LandContour',
#                                    'Utilities','LotConfig','Neighborhood','Condition1', 'Condition2',
#                                    'BldgType','HouseStyle','RoofStyle','Heating','RoofMatl',
#                                    'Exterior1st','Exterior2nd','MasVnrType','Foundation','CentralAir',
#                                    'Electrical','GarageType','MiscFeature','SaleType','SaleCondition']])
                                   
# # dummy_vars containing only 0 and 1                                 
# dummy_vars_trv=dummy_vars_trv.astype(int)

# print(dummy_vars_trv.head(2))
# dummy_vars_trv.info()

In [None]:
dummy_vars_trv.shape

In [None]:
dummy_vars_te.shape

In [None]:
test.shape
test.info()

In [None]:
# Dummy columns that exist for the test data but not for the training data
missing_in_train = set(dummy_vars_te.columns).difference(dummy_vars_trv.columns)
print(missing_in_train)


In [None]:
# Give the test dummies exactly the columns (and column order) of dummy_vars_trv:
# columns that only exist in dummy_vars_trv are added filled with 0,
# columns that only exist in dummy_vars_te are dropped.
dummy_vars_te=dummy_vars_te.reindex(columns=dummy_vars_trv.columns, fill_value=0)

In [None]:
print(dummy_vars_trv.shape)
print(trainv.shape)


print(dummy_vars_te.shape)
print(test.shape)


In [None]:
X_trainv.shape

In [None]:
X_trainv

In [None]:
test.shape

In [None]:
# Define the feature matrix for the test data
# Take the first 99 columns of test by position.
# NOTE(review): 99 is hard-coded — presumably the original columns plus the cnvrt_* columns; confirm after any change to test.
X_X=test.iloc[:,list(range(0,99))]
print(X_X.columns)
# Append the dummy variables column-wise
X_test=pd.concat([X_X,dummy_vars_te], axis = 1)
# X_trainv.info()
# Add the constant (intercept) column
X_test=sm.add_constant(X_test)
X_test.head(2)

In [None]:
missing_in_train=set(dummy_vars_te.columns)-set(dummy_vars_trv.columns) 
print(missing_in_train)

In [None]:
# Re-align the test dummies with the columns of dummy_vars_trv (missing columns filled with 0).
# NOTE(review): X_test was already built from dummy_vars_te in an earlier cell, so this
# only rebinds dummy_vars_te and does not change X_test.
dummy_vars_te=dummy_vars_te.reindex(columns=dummy_vars_trv.columns, fill_value=0)

In [None]:
# Remove column 'Id'
X_test.drop('Id', axis=1, inplace=True)
X_test.head()

In [None]:
# Drop every column that is still of object dtype
object_columns = X_test.select_dtypes(include=['object']).columns
X_test = X_test.drop(columns=object_columns)
# Check the data types of the cleaned DataFrame
print(X_test.dtypes)

In [None]:
X_trainv.shape

In [None]:
X_test.shape

In [None]:
# Convert float64 and int32 columns to int64
columns_to_cast = X_test.select_dtypes(include=['float64', 'int32']).columns
X_test = X_test.astype(dict.fromkeys(columns_to_cast, 'int64'))

# Check the data types of the columns to verify the conversion
print(X_test.dtypes)

### Prediction of Test data on Model 1- Linear Regression with Box-Cox Transformation and t-test

In [None]:
# Keep only the columns of X_trainv_current, in that order; columns absent
# from X_test are added filled with 0
X_test_significant = X_test.reindex(columns=X_trainv_current.columns, fill_value=0)

# Predict using the final model
test_pred = model_final.predict(X_test_significant)

# Inverse transformation of predicted values
lm_prices = boxcox.inverse_transform(test_pred.values.reshape(-1, 1)).ravel()
test['pred_lm'] = pd.Series(lm_prices, index=test_pred.index)
test['pred_lm']

### Prediction of Test data on Model 2- Forward Selection with Box-Cox Transformation

In [None]:
# Predict on test - model 2 (forward-selection model stored at index 105)
fwd_best_model = fwd_models.loc[105, 'model']
pred_fwd = fwd_best_model.predict(X_test[fwd_best_model.model.exog_names])

# Inverse transformation of predicted values
fwd_prices = boxcox.inverse_transform(pred_fwd.values.reshape(-1, 1)).ravel()
test['pred_fwd'] = pd.Series(fwd_prices, index=test.index)
test['pred_fwd']

### Prediction of Test data on Model 3- Backward Elimination with Box-Cox Transformation 

In [None]:
# Predict on test - model 3 (backward-elimination model stored at index 120)
bwd_best_model = bwd_models.loc[120, 'model']
pred_bwd = bwd_best_model.predict(X_test[bwd_best_model.model.exog_names])

# Inverse transformation of predicted values
bwd_prices = boxcox.inverse_transform(pred_bwd.values.reshape(-1, 1)).ravel()
test['pred_bwd'] = pd.Series(bwd_prices, index=test.index)
test['pred_bwd']

### Prediction of Test data on Model 4- Ridge Regression with Box-Cox Transformation 

In [None]:
# Scale test data set
X_test_scaled = scaler.transform(X_test)

# Predict on test - model 4
pred_ridge = model_4.predict(X_test_scaled)

# Inverse transformation of predicted values.
# The predictions are forced into a single column first, like in the other
# prediction cells: inverse_transform needs 2-D input, and the reshape works
# whether predict() returned shape (n,) or (n, 1).
ridge_prices = boxcox.inverse_transform(np.asarray(pred_ridge).reshape(-1, 1)).reshape(-1)
test['pred_ridge'] = pd.Series(ridge_prices, index=test.index)
test['pred_ridge']

### Prediction of Test data on Model 5- Lasso Regression with Box-Cox Transformation 

In [None]:
# Predict on test - model 5 (uses the scaled test features)
pred_lasso = model_5.predict(X_test_scaled)

# Inverse transformation of predicted values
lasso_prices = boxcox.inverse_transform(pred_lasso.reshape(-1, 1)).ravel()
test['pred_lasso'] = pd.Series(lasso_prices, index=test.index)
test['pred_lasso']

### Prediction of Test data on Model 6- Decision Tree with Box-Cox Transformation 

In [None]:
# Prediction using model 6 (uses the unscaled test features)
pred_tree = model_6.predict(X_test)

# Inverse transformation of predicted values
tree_prices = boxcox.inverse_transform(pred_tree.reshape(-1, 1)).ravel()
test['pred_tree'] = pd.Series(tree_prices, index=test.index)
test['pred_tree']

### Prediction of Test data on Model 7- Random Forest with Box-Cox Transformation 

In [None]:
# common_columns = X_train.columns.intersection(X_test.columns)
# X_train = X_train[common_columns]
# X_test= X_test[common_columns]

In [None]:
# Prediction using model 7 (uses the unscaled test features)
pred_rf = model_7.predict(X_test)

# Inverse transformation of predicted values
rf_prices = boxcox.inverse_transform(pred_rf.reshape(-1, 1)).ravel()
test['pred_rf'] = pd.Series(rf_prices, index=test.index)
test['pred_rf']

### Prediction of Test data on Model 8- PCR with Box-Cox Transformation 

In [None]:
# Predict on test - model 8

# Scale data, then transform it into the PCA dimensions
X_test_scaled = scaler.transform(X_test)
X_test_pca = pca_model.transform(X_test_scaled)

# Predict on the retained components
pred_pcr = model_8.predict(X_test_pca[:, :components_number])

# Inverse transformation of predicted values
pcr_prices = boxcox.inverse_transform(pred_pcr.reshape(-1, 1)).ravel()
test['pred_pcr'] = pd.Series(pcr_prices, index=test.index)
test['pred_pcr']