In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read train.csv from the working directory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=5)

In [None]:
# Data exploration - information about the data.
# A notebook cell only echoes its final expression, so the shape is printed
# explicitly; df.info() writes its own report to stdout.
print(df.shape)
df.info()
df.describe()

In [None]:
# Checking the numerical and categorical columns.
# Only the final expression of a cell is echoed, so the numerical column
# index is printed explicitly instead of being silently discarded.
print(df.select_dtypes(include=['int64','float64']).columns)
df.select_dtypes(include=['object']).columns

In [None]:
# Dealing with missing values.
# The per-column null counts are printed explicitly (a bare expression in the
# middle of a cell is never shown); the cell result is the set of columns
# that contain at least one null.
print(df.isnull().sum())
df.columns[df.isnull().any()]

In [None]:
# Share of null entries in each column, expressed in percent.
null_percent = df.isnull().mean() * 100
null_percent

In [None]:
# Columns whose null share is above 50% are the candidates for removal.
cols_to_drop = null_percent.loc[null_percent.gt(50)].index
cols_to_drop

In [None]:
# Remove four columns by name (the list is hard-coded here, not read from
# cols_to_drop), then preview the resulting frame.
df = df.drop(['Alley', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)
df.head()

In [None]:
# Numerical columns: replace each null with that column's mean.
for column in ('LotFrontage', 'MasVnrArea', 'GarageYrBlt'):
    df[column] = df[column].fillna(df[column].mean())

In [None]:
# Categorical columns: replace each null with that column's most frequent value.
# mode() can return several values on ties; the first one is used.
for column in (
    'MasVnrType',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'Electrical',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
):
    most_frequent = df[column].mode()[0]
    df[column] = df[column].fillna(most_frequent)

In [None]:
# Distribution of the SalePrice column.
# sns.displot is a figure-level function: it always builds its own figure, so
# a preceding plt.figure(figsize=...) only produces an empty extra figure.
# The 20x10 size is requested through height/aspect instead
# (width = height * aspect).
sns.displot(df['SalePrice'], height=10, aspect=2)
plt.show()

In [None]:
# Correlation Matrix: every column except SalePrice, previewed.
df_2 = df.drop('SalePrice', axis=1)
df_2.head()

In [None]:
# Bar chart of each feature's correlation with SalePrice.
# df_2 still contains object (string) columns at this point; restricting to
# numeric columns first keeps corrwith working on pandas versions that no
# longer drop non-numeric columns silently.
df_2.select_dtypes(include='number').corrwith(df['SalePrice']).plot.bar(
    figsize=(20,10),title='Correlation',grid=True
)

In [None]:
# Pairwise correlation matrix of the numeric columns.
# The frame still holds object (string) columns here, which DataFrame.corr
# rejects on recent pandas versions, so they are filtered out explicitly.
corr=df.select_dtypes(include='number').corr()
corr

In [None]:
# Annotated heatmap of the correlation matrix on a 20x10 canvas.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(20, 10))
sns.heatmap(data=corr, annot=True, cmap='coolwarm', ax=heatmap_ax)

In [None]:
# Selecting the columns with high correlation: |corr with SalePrice| > 0.5.
# Object (string) columns are excluded before calling corr(), which rejects
# them on recent pandas versions. SalePrice itself is part of the result
# because its self-correlation is 1.
high_corr=df.select_dtypes(include='number').corr()
high_corr_features=high_corr.index[abs(high_corr['SalePrice'])>0.5]
high_corr_features

In [None]:
# Dealing with the Categorical Values
# The categorical column names are printed explicitly (a bare expression in
# the middle of a cell is never shown); the columns are then one-hot encoded,
# dropping the first level of each, and the new frame shape is the cell result.
print(df.select_dtypes(include=['object']).columns)
df=pd.get_dummies(df,drop_first=True)
df.shape

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; SalePrice is the target and every
# other column is a feature. random_state pins the shuffle.
X_train,X_test,Y_train,Y_test=train_test_split(df.drop(columns='SalePrice'),df['SalePrice'],
                                               test_size=0.2,random_state=0)
# Only a cell's final expression is echoed, so the first three shapes are
# printed explicitly; the last one is still shown as the cell result.
print(X_train.shape, X_test.shape, Y_train.shape)
Y_test.shape

(292,)

In [None]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
# The scaler's statistics are learned from the training features only and
# then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Building the model
# Method 1. Linear Regression
from sklearn.linear_model import LinearRegression
# fit() returns the estimator itself, so construction and training are chained.
clf_1 = LinearRegression().fit(X_train_scaled, Y_train)
y_pred1 = clf_1.predict(X_test_scaled)

In [None]:
from sklearn.metrics import r2_score
# R^2 of the linear model on the held-out split.
r2_m1 = r2_score(y_true=Y_test, y_pred=y_pred1)
r2_m1

In [None]:
# Method 2. Random Forest Regression
from sklearn.ensemble import RandomForestRegressor
# A fixed seed (the same value used for the train/test split) makes the
# forest, and every score derived from it, reproducible across runs.
clf_2=RandomForestRegressor(random_state=0)
clf_2.fit(X_train_scaled,Y_train)
y_pred2=clf_2.predict(X_test_scaled)

In [None]:
from sklearn.metrics import r2_score
# R^2 of the random forest on the held-out split.
r2_m2 = r2_score(y_true=Y_test, y_pred=y_pred2)
r2_m2

0.834066109174527

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the random forest on the training split;
# one score per fold.
cv_1 = cross_val_score(estimator=clf_2, X=X_train_scaled, y=Y_train, cv=10)
cv_1

array([0.90374208, 0.89545875, 0.67442919, 0.85287802, 0.89391675,
       0.87998997, 0.83595025, 0.87559656, 0.8766221 , 0.90526656])

In [None]:
# cross_val_score was called without a scoring argument, so each fold score
# is the regressor's default score (R^2) as a fraction. It is neither an
# accuracy nor a percentage, so it is reported as R^2 without a "%" sign.
print(f"Mean CV R^2 is {np.mean(cv_1):.4f}")
print(f"Std of CV R^2 is {np.std(cv_1):.4f}")

Accuracy is  0.8593850239055796 %
Deviation is  0.06507772377327081 %


In [None]:
# Hyperparameter Tuning
from sklearn.model_selection import RandomizedSearchCV
# Search space for the random forest hyperparameter search.
parameters={
    'n_estimators':[200,400,600,800,1000,1200,1400,1600,1800,2000],
    'max_depth':[10,20,30,40,50,60,70,80,90,100,None],
    'min_samples_split':[2,5,10],
    'min_samples_leaf':[1,2,4],
    # 1.0 means "use all features", which is what the old 'auto' option meant
    # for a regressor; recent scikit-learn releases reject 'auto'.
    'max_features':[1.0,'sqrt'],
    'bootstrap':[True,False]
}

In [None]:
# Randomized search over `parameters`: 10 sampled candidates, 10 folds each.
# Candidates are ranked by R^2: this is a regression problem, and 'roc_auc'
# is a classification metric that cannot score continuous targets, so it
# gives no usable ranking. random_state pins which candidates are sampled.
random_search=RandomizedSearchCV(estimator=clf_2,param_distributions=parameters,
                                 n_iter=10,scoring='r2',n_jobs=-1,cv=10,verbose=3,
                                 random_state=0)
random_search.fit(X_train_scaled,Y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits




In [None]:
# Show the estimator that the randomized search selected as best.
random_search.best_estimator_

In [None]:
# Building the final model
# The hyperparameters are read from the finished search rather than
# transcribed by hand from an earlier run, so this cell always matches what
# the search actually selected. The seed makes the fit reproducible.
clf_final=RandomForestRegressor(random_state=0, **random_search.best_params_)
clf_final.fit(X_train_scaled,Y_train)
y_pred_final=clf_final.predict(X_test_scaled)

In [None]:
# R^2 of the final model on the held-out split.
r2_mf = r2_score(y_true=Y_test, y_pred=y_pred_final)
r2_mf

0.8313058787152489

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the final model on the training split;
# one score per fold.
cv_final = cross_val_score(estimator=clf_final, X=X_train_scaled, y=Y_train, cv=10)
cv_final

array([0.90670664, 0.86629118, 0.74467811, 0.83949858, 0.89376049,
       0.87818702, 0.79051486, 0.87373403, 0.89619313, 0.89755711])

In [None]:
# cross_val_score was called without a scoring argument, so each fold score
# is the regressor's default score (R^2) as a fraction. It is neither an
# accuracy nor a percentage, so it is reported as R^2 without a "%" sign.
print(f"Mean CV R^2 is {np.mean(cv_final):.4f}")
print(f"Std of CV R^2 is {np.std(cv_final):.4f}")

Accuracy is  0.8587121154476535 %
Deviation is  0.05013160369174489 %
