Author: Sai Swaroop


In [82]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import plotly.express as px

In [2]:
# Load the training set from 'train.csv' (a relative path, so it is resolved
# against the kernel's current working directory) into a DataFrame.
data = pd.read_csv('train.csv')

In [3]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# (rows, columns) of the raw frame.
data.shape

(1460, 81)

In [5]:
# Per-column non-null counts and dtypes; used to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [6]:
# Names of the columns that contain at least one missing value.
columns_with_nulls = data.columns[data.isna().any(axis=0)]

# Restrict the frame to those columns and count the missing entries in each.
data_with_nulls = data.loc[:, columns_with_nulls]
null_counts = data_with_nulls.isna().sum()

# Each heading is followed by its object on the next line.
print("Columns with Null Values:", columns_with_nulls, sep="\n")
print("\nNull Value Counts:", null_counts, sep="\n")


Columns with Null Values:
Index(['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
       'MiscFeature'],
      dtype='object')

Null Value Counts:
LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


In [7]:
# Remove the six columns with the highest null counts in the summary printed
# above (690 to 1453 missing out of 1460 rows), plus the `Id` column.
# The result is rebound instead of mutated in place, and `errors='ignore'`
# makes the cell safe to re-run: a second execution is a no-op rather than a
# KeyError for columns that are already gone.
data = data.drop(
    columns=['Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature', 'Id'],
    errors='ignore',
)

In [9]:
# Missing-value count for every remaining column.
null_values = data.isna().sum()

# Show only the columns that still have something missing.
print("Columns with Null Values:")
print(null_values.loc[null_values.gt(0)])

Columns with Null Values:
LotFrontage     259
MasVnrArea        8
BsmtQual         37
BsmtCond         37
BsmtExposure     38
BsmtFinType1     37
BsmtFinType2     38
Electrical        1
GarageType       81
GarageYrBlt      81
GarageFinish     81
GarageQual       81
GarageCond       81
dtype: int64


In [10]:
# From the columns that still contain nulls, keep the names of those stored
# as int64/float64; these are the candidates for mean imputation.
numerical_columns = (
    data[['LotFrontage', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
          'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'GarageType',
          'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond']]
    .select_dtypes(include=['int64', 'float64'])
    .columns
)

print("Numerical Columns:", numerical_columns, sep="\n")

Numerical Columns:
Index(['LotFrontage', 'MasVnrArea', 'GarageYrBlt'], dtype='object')


In [11]:
# Numeric columns whose missing entries are replaced by the column mean.
columns_to_impute_mean = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

# Fill each column with its own mean and assign the result back to the frame.
# `data[col].fillna(..., inplace=True)` operates on a temporary Series: it
# triggers a chained-assignment warning in pandas 2.x and leaves `data`
# untouched once Copy-on-Write is active, so the explicit assignment is the
# form that is guaranteed to update `data`.
# `.mean()` returns one value per column; `fillna` matches them by column name.
data[columns_to_impute_mean] = data[columns_to_impute_mean].fillna(
    data[columns_to_impute_mean].mean()
)

In [12]:
# From the columns that had nulls, keep the names of those stored as
# object dtype; these are the candidates for mode imputation.
categorical_columns = (
    data[['LotFrontage', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
          'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'GarageType',
          'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond']]
    .select_dtypes(include=['object'])
    .columns
)

print("Categorical Columns:", categorical_columns, sep="\n")

Categorical Columns:
Index(['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Electrical', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'],
      dtype='object')


In [13]:
# Categorical columns whose missing entries are replaced by the most frequent value.
columns_to_impute_mode = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                          'Electrical', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']

# Iterate over the list declared in this cell (it was previously defined but
# never used), so the cell no longer depends on a variable from another cell.
for column in columns_to_impute_mode:
    # `.mode()` may return several values on ties; the first one is used.
    mode_value = data[column].mode()[0]
    # Assign back instead of `fillna(..., inplace=True)` on `data[column]`:
    # the in-place form acts on a temporary Series and does not update `data`
    # under pandas Copy-on-Write.
    data[column] = data[column].fillna(mode_value)

In [16]:
# Boolean Series indexed by column name: True where a column still has nulls.
missing_values = data.isnull().any()

# Sanity check after imputation: report the clean case, otherwise list offenders.
if not missing_values.any():
    print("No Missing Values in the DataFrame")
else:
    print("Columns with Missing Values:")
    print(missing_values[missing_values].index)


No Missing Values in the DataFrame


In [56]:
# Every object-dtype column left in the frame; these get one-hot encoded next.
categorical_columns_indata = data.select_dtypes(include='object').columns
print("Categorical Columns:", categorical_columns_indata, sep="\n")

Categorical Columns:
Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
       'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
       'SaleType', 'SaleCondition'],
      dtype='object')


In [57]:
# Build indicator columns for every categorical column. drop_first=True omits
# the first level of each feature, so k categories become k-1 indicators.
one_hot_encoded = pd.get_dummies(data.loc[:, categorical_columns_indata], drop_first=True)

# Replace the raw categorical columns with their indicators: the remaining
# (non-object) columns come first and the indicator columns are appended.
data_encoded = pd.concat(
    [data.drop(columns=categorical_columns_indata), one_hot_encoded],
    axis=1,
)

# data_encoded now holds the one-hot encoded data; `data` itself is unchanged.

In [58]:
# Display the encoded frame (rendered truncated by the notebook).
data_encoded

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,False,False,False,False,True,False,False,False,True,False
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,False,False,False,False,True,False,False,False,True,False
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,False,False,False,False,True,False,False,False,True,False
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,False,False,False,False,True,False,False,False,False,False
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,False,False,False,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,62.0,7917,6,5,1999,2000,0.0,0,0,...,False,False,False,False,True,False,False,False,True,False
1456,20,85.0,13175,6,6,1978,1988,119.0,790,163,...,False,False,False,False,True,False,False,False,True,False
1457,70,66.0,9042,7,9,1941,2006,0.0,275,0,...,False,False,False,False,True,False,False,False,True,False
1458,20,68.0,9717,5,6,1950,1996,0.0,49,1029,...,False,False,False,False,True,False,False,False,True,False


In [60]:
# Standardise every int64/float64 column of the encoded frame; the boolean
# indicator columns are not selected and are therefore left as they are.
# NOTE(review): 'SalePrice' is still part of data_encoded at this point (it is
# only split off into `y` later), so the target appears to be standardised as
# well, which would make the MSE values below unit-less rather than in price
# units. Confirm this is intended.
# NOTE(review): the scaler is fit on all rows before the train/test split, so
# test-set statistics feed into the scaling.
num_columns = data_encoded.select_dtypes(include=['int64', 'float64'])

scaler = StandardScaler()
scaler.fit(num_columns)
data_scaled = scaler.transform(num_columns)

# Whole-column assignment replaces the selected columns with the scaled floats.
data_encoded[num_columns.columns] = data_scaled

In [61]:
# Display the frame again, now with the numeric columns standardised.
data_encoded

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0.073375,-0.229372,-0.207142,0.651479,-0.517200,1.050994,0.878668,0.511418,0.575425,-0.288653,...,False,False,False,False,True,False,False,False,True,False
1,-0.872563,0.451936,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.574410,1.171992,-0.288653,...,False,False,False,False,True,False,False,False,True,False
2,0.073375,-0.093110,0.073480,0.651479,-0.517200,0.984752,0.830215,0.323060,0.092907,-0.288653,...,False,False,False,False,True,False,False,False,True,False
3,0.309859,-0.456474,-0.096897,0.651479,-0.517200,-1.863632,-0.720298,-0.574410,-0.499274,-0.288653,...,False,False,False,False,True,False,False,False,False,False
4,0.073375,0.633618,0.375148,1.374795,-0.517200,0.951632,0.733308,1.364570,0.463568,-0.288653,...,False,False,False,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.073375,-0.365633,-0.260560,-0.071836,-0.517200,0.918511,0.733308,-0.574410,-0.973018,-0.288653,...,False,False,False,False,True,False,False,False,True,False
1456,-0.872563,0.679039,0.266407,-0.071836,0.381743,0.222975,0.151865,0.084843,0.759659,0.722112,...,False,False,False,False,True,False,False,False,True,False
1457,0.309859,-0.183951,-0.147810,0.651479,3.078570,-1.002492,1.024029,-0.574410,-0.369871,-0.288653,...,False,False,False,False,True,False,False,False,True,False
1458,-0.872563,-0.093110,-0.080160,-0.795151,0.381743,-0.704406,0.539493,-0.574410,-0.865548,6.092188,...,False,False,False,False,True,False,False,False,True,False


In [62]:
# Target is the 'SalePrice' column; the features are every other column.
y = data_encoded['SalePrice']
X = data_encoded.drop(columns='SalePrice')

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [63]:
# 1. Linear Regression
# Ordinary least squares on all encoded features, scored on the held-out split.
linear_model = LinearRegression().fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)
mse_linear = mean_squared_error(y_true=y_test, y_pred=y_pred_linear)

In [64]:
# Report the test-set MSE of the linear model.
# NOTE(review): the recorded value (~2.8e17) is many orders of magnitude above
# the other models' errors on the same split; this looks like a numerically
# unstable fit (possibly collinear indicator columns) and is worth investigating.
print("Mean Squared Error (Linear Regression):", mse_linear)

Mean Squared Error (Linear Regression): 2.7778480512531786e+17


In [65]:
# 2. Multiple Linear Regression
# NOTE(review): this fits the same estimator on the same X_train/y_train as
# model 1, so it reproduces that model's predictions and MSE.
multiple_linear_model = LinearRegression().fit(X_train, y_train)
y_pred_multiple_linear = multiple_linear_model.predict(X_test)
mse_multiple_linear = mean_squared_error(y_true=y_test, y_pred=y_pred_multiple_linear)

In [66]:
# Report the test-set MSE of the second linear model.
print("Mean Squared Error (Multiple Linear Regression):", mse_multiple_linear)

Mean Squared Error (Multiple Linear Regression): 2.7778480512531786e+17


In [67]:
# 3. Polynomial Regression
# Learn the degree-2 feature expansion from the training split only, then
# apply the same expansion to both splits.
poly_features = PolynomialFeatures(degree=2).fit(X_train)
X_train_poly = poly_features.transform(X_train)
X_test_poly = poly_features.transform(X_test)

# Linear model on the expanded features, scored on the held-out split.
poly_model = LinearRegression().fit(X_train_poly, y_train)
y_pred_poly = poly_model.predict(X_test_poly)
mse_poly = mean_squared_error(y_true=y_test, y_pred=y_pred_poly)

In [68]:
# Report the test-set MSE of the degree-2 polynomial model.
print("Mean Squared Error (Polynomial Regression):", mse_poly)

Mean Squared Error (Polynomial Regression): 0.18083808539375143


In [69]:
# 4. Decision Tree Regression
# Single tree with default settings; random_state is pinned for repeatability.
decision_tree_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
y_pred_decision_tree = decision_tree_model.predict(X_test)
mse_decision_tree = mean_squared_error(y_true=y_test, y_pred=y_pred_decision_tree)

In [70]:
# Report the test-set MSE of the decision tree.
print("Mean Squared Error (Decision Tree Regression):", mse_decision_tree)

Mean Squared Error (Decision Tree Regression): 0.2758251930699741


In [71]:
# 5. Random Forest Regression
# Ensemble of 100 trees; random_state is pinned for repeatability.
random_forest_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
y_pred_random_forest = random_forest_model.predict(X_test)
mse_random_forest = mean_squared_error(y_true=y_test, y_pred=y_pred_random_forest)

In [72]:
# Report the test-set MSE of the random forest trained on all features.
print("Mean Squared Error (Random Forest Regression):", mse_random_forest)

Mean Squared Error (Random Forest Regression): 0.1293596202398942


### Finding the importance of each of the features and visualizing them

In [73]:
# Importance scores from the fitted forest, one per training column.
feature_importances = random_forest_model.feature_importances_

# Pair each score with its column name and rank from most to least important.
feature_importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

In [87]:
# Horizontal bar chart with one bar per feature. `update_layout` returns the
# figure, so construction and layout are chained into a single expression.
fig = px.bar(
    feature_importance_df,
    x='Importance',
    y='Feature',
    orientation='h',
    title='Feature Importances',
).update_layout(
    xaxis_title='Importance',
    yaxis_title='Feature',
    # Order the bars by their total value instead of by row order.
    yaxis_categoryorder='total ascending',
    width=1000,
    height=600,
)

fig.show()


In [122]:
# Number of top-ranked features to keep.
k = 14
# feature_importance_df is sorted by descending importance, so the first k
# rows name the k most important features.
selected_features = feature_importance_df['Feature'].iloc[:k]

# Restrict both splits to the selected columns.
X_train_selected, X_test_selected = X_train[selected_features], X_test[selected_features]

# Same forest configuration as before, trained on the reduced feature set.
new_rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train_selected, y_train)

y_pred_new_rf = new_rf_model.predict(X_test_selected)

mse_new_rf = mean_squared_error(y_true=y_test, y_pred=y_pred_new_rf)
print("Mean Squared Error (New Random Forest Model):", mse_new_rf)

Mean Squared Error (New Random Forest Model): 0.12807867503419


In [123]:
# Side-by-side comparison: forest on all features vs. forest on the top-k
# features, both scored on the same held-out split.
print("Mean Squared Error (Original Random Forest Model):", mse_random_forest)
print("Mean Squared Error (New Random Forest Model with Selected Features):", mse_new_rf)

Mean Squared Error (Original Random Forest Model): 0.1293596202398942
Mean Squared Error (New Random Forest Model with Selected Features): 0.12807867503419
