<a href="https://colab.research.google.com/github/Sid-2862/Kaggle_competition/blob/main/House_Prices_Advanced_Regression_Techniques.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# House Prices - Advanced Regression Techniques

# importing all the necessary libraries

In [32]:
# to avoid unnecessary warning
from warnings import filterwarnings
filterwarnings("ignore")

# read the file, numpy library
import pandas as pd
import numpy as np

# Data preprocessing and data cleaning
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# feature selection
from sklearn.feature_selection import SequentialFeatureSelector

# train test split
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV,cross_val_score

# models required
from sklearn.linear_model import LinearRegression,Ridge,Lasso

# Metrics
from sklearn.metrics import root_mean_squared_error,mean_squared_error,r2_score,mean_absolute_error

# Reading the data & performing quality checks

In [33]:
url = "https://raw.githubusercontent.com/Sid-2862/Kaggle_competition/refs/heads/main/trainhp.csv"

df = pd.read_csv(url,na_values=("","NA"),keep_default_na=False)
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [35]:
df.shape

(1460, 81)

In [36]:
# checking for missing values
s=df.isna().sum()
s[s>0]

Unnamed: 0,0
LotFrontage,259
Alley,1369
MasVnrType,8
MasVnrArea,8
BsmtQual,37
BsmtCond,37
BsmtExposure,38
BsmtFinType1,37
BsmtFinType2,38
Electrical,1


In [37]:
cnt = df["Id"].count()

In [38]:
# finding the columns/features which have missing data more than 50%
b = (s/cnt)*100
b[b>50]

Unnamed: 0,0
Alley,93.767123
PoolQC,99.520548
Fence,80.753425
MiscFeature,96.30137


# There are 4 features who have missing data more then 50%. Dropping the features would be better way of handling this

In [39]:
b[b>50].index

Index(['Alley', 'PoolQC', 'Fence', 'MiscFeature'], dtype='object')

In [40]:
# converting the above index details into a list
fea_tobe_removed = list(b[b>50].index)
fea_tobe_removed

['Alley', 'PoolQC', 'Fence', 'MiscFeature']

In [41]:
df = df.drop(columns=fea_tobe_removed)
df.shape

(1460, 77)

In [42]:
# checking for duplicated values
df.duplicated().sum()

np.int64(0)

# Separating x & y featues


In [43]:
x = df.drop(columns=["Id","SalePrice"])
y = df[["SalePrice"]]

In [44]:
x.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,61,0,0,0,0,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,42,0,0,0,0,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,35,272,0,0,0,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,84,0,0,0,0,0,12,2008,WD,Normal


In [45]:
y.head()

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000


# Feature Engineering
Feature Selection

Data Preprocessing Data Cleaning

In [46]:
cat = list(x.columns[x.dtypes=="object"])
con = list(x.columns[x.dtypes!="object"])

In [47]:
cat

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [48]:
con

['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [49]:
# creating pipelines

cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant",fill_value="Not Available"),
    OrdinalEncoder()
    )

con_pipe = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler()
)

In [50]:

pre = ColumnTransformer(
    [("cat",cat_pipe,cat),("con",con_pipe,con)]).set_output(
        transform="pandas"
    )

In [51]:
pre

In [52]:

x_pre = pre.fit_transform(x)
x_pre.head()

Unnamed: 0,cat__MSZoning,cat__Street,cat__LotShape,cat__LandContour,cat__Utilities,cat__LotConfig,cat__LandSlope,cat__Neighborhood,cat__Condition1,cat__Condition2,...,con__GarageArea,con__WoodDeckSF,con__OpenPorchSF,con__EnclosedPorch,con__3SsnPorch,con__ScreenPorch,con__PoolArea,con__MiscVal,con__MoSold,con__YrSold
0,3.0,1.0,3.0,3.0,0.0,4.0,0.0,5.0,2.0,2.0,...,0.351,-0.752176,0.216503,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,0.138777
1,3.0,1.0,3.0,3.0,0.0,2.0,0.0,24.0,1.0,2.0,...,-0.060731,1.626195,-0.704483,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-0.48911,-0.614439
2,3.0,1.0,0.0,3.0,0.0,4.0,0.0,5.0,2.0,2.0,...,0.631726,-0.752176,-0.070361,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,0.990891,0.138777
3,3.0,1.0,0.0,3.0,0.0,0.0,0.0,6.0,2.0,2.0,...,0.790804,-0.752176,-0.176048,4.092524,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,-1.367655
4,3.0,1.0,0.0,3.0,0.0,2.0,0.0,15.0,2.0,2.0,...,1.698485,0.780197,0.56376,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,2.100892,0.138777


In [53]:

model1 = LinearRegression()
sel = SequentialFeatureSelector(model1,direction="backward",n_features_to_select="auto"
                                )
sel.fit(x_pre,y)

In [54]:
imp_cols = sel.get_feature_names_out()
imp_cols

array(['cat__Street', 'cat__Utilities', 'cat__Neighborhood',
       'cat__BldgType', 'cat__HouseStyle', 'cat__RoofMatl',
       'cat__Exterior1st', 'cat__ExterQual', 'cat__BsmtQual',
       'cat__BsmtCond', 'cat__BsmtExposure', 'cat__HeatingQC',
       'cat__KitchenQual', 'cat__Functional', 'cat__FireplaceQu',
       'cat__GarageCond', 'con__MSSubClass', 'con__LotArea',
       'con__OverallQual', 'con__OverallCond', 'con__YearBuilt',
       'con__MasVnrArea', 'con__BsmtFinSF1', 'con__BsmtFinSF2',
       'con__BsmtUnfSF', 'con__TotalBsmtSF', 'con__1stFlrSF',
       'con__LowQualFinSF', 'con__GrLivArea', 'con__BsmtFullBath',
       'con__KitchenAbvGr', 'con__TotRmsAbvGrd', 'con__Fireplaces',
       'con__GarageCars', 'con__WoodDeckSF', 'con__ScreenPorch',
       'con__PoolArea', 'con__YrSold'], dtype=object)

In [55]:
sel_cols = []
for i in imp_cols:
    fea = i.split("_")[-1]
    sel_cols.append(fea)

In [56]:

sel_cols

['Street',
 'Utilities',
 'Neighborhood',
 'BldgType',
 'HouseStyle',
 'RoofMatl',
 'Exterior1st',
 'ExterQual',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'HeatingQC',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageCond',
 'MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'WoodDeckSF',
 'ScreenPorch',
 'PoolArea',
 'YrSold']

In [57]:

x_sel = x[sel_cols]
x_sel.head()

Unnamed: 0,Street,Utilities,Neighborhood,BldgType,HouseStyle,RoofMatl,Exterior1st,ExterQual,BsmtQual,BsmtCond,...,GrLivArea,BsmtFullBath,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,WoodDeckSF,ScreenPorch,PoolArea,YrSold
0,Pave,AllPub,CollgCr,1Fam,2Story,CompShg,VinylSd,Gd,Gd,TA,...,1710,1,1,8,0,2,0,0,0,2008
1,Pave,AllPub,Veenker,1Fam,1Story,CompShg,MetalSd,TA,Gd,TA,...,1262,0,1,6,1,2,298,0,0,2007
2,Pave,AllPub,CollgCr,1Fam,2Story,CompShg,VinylSd,Gd,Gd,TA,...,1786,1,1,6,1,2,0,0,0,2008
3,Pave,AllPub,Crawfor,1Fam,2Story,CompShg,Wd Sdng,TA,TA,Gd,...,1717,1,1,7,1,3,0,0,0,2006
4,Pave,AllPub,NoRidge,1Fam,2Story,CompShg,VinylSd,Gd,Gd,TA,...,2198,1,1,9,1,3,192,0,0,2008


In [58]:

x_sel.shape

(1460, 38)

# Data Preprocessing and Data Cleaning for final model building

In [59]:

cat_sel = list(x_sel.columns[x_sel.dtypes=="object"])
con_sel = list(x_sel.columns[x_sel.dtypes!="object"])

In [60]:
cat_pipe1 = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore",sparse_output=False)
)

In [61]:
con_pipe1 = make_pipeline(SimpleImputer(strategy="median"),StandardScaler())

In [62]:
pre1= ColumnTransformer([
    ("cat",cat_pipe1,cat_sel),
    ("con",con_pipe1,con_sel)
]).set_output(transform="pandas")


In [63]:
pre1

In [64]:

x_pre1 = pre1.fit_transform(x_sel)
x_pre1.head()

Unnamed: 0,cat__Street_Grvl,cat__Street_Pave,cat__Utilities_AllPub,cat__Utilities_NoSeWa,cat__Neighborhood_Blmngtn,cat__Neighborhood_Blueste,cat__Neighborhood_BrDale,cat__Neighborhood_BrkSide,cat__Neighborhood_ClearCr,cat__Neighborhood_CollgCr,...,con__GrLivArea,con__BsmtFullBath,con__KitchenAbvGr,con__TotRmsAbvGrd,con__Fireplaces,con__GarageCars,con__WoodDeckSF,con__ScreenPorch,con__PoolArea,con__YrSold
0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.370333,1.10781,-0.211454,0.91221,-0.951226,0.311725,-0.752176,-0.270208,-0.068692,0.138777
1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.482512,-0.819964,-0.211454,-0.318683,0.600495,0.311725,1.626195,-0.270208,-0.068692,-0.614439
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.515013,1.10781,-0.211454,-0.318683,0.600495,0.311725,-0.752176,-0.270208,-0.068692,0.138777
3,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.383659,1.10781,-0.211454,0.296763,0.600495,1.650307,-0.752176,-0.270208,-0.068692,-1.367655
4,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.299326,1.10781,-0.211454,1.527656,0.600495,1.650307,0.780197,-0.270208,-0.068692,0.138777


# Split the data into training and testing

In [65]:
xtrain ,xtest ,ytrain ,ytest = train_test_split(
    x_pre1,y,train_size=0.80, random_state=21
)

In [66]:
xtrain.head()

Unnamed: 0,cat__Street_Grvl,cat__Street_Pave,cat__Utilities_AllPub,cat__Utilities_NoSeWa,cat__Neighborhood_Blmngtn,cat__Neighborhood_Blueste,cat__Neighborhood_BrDale,cat__Neighborhood_BrkSide,cat__Neighborhood_ClearCr,cat__Neighborhood_CollgCr,...,con__GrLivArea,con__BsmtFullBath,con__KitchenAbvGr,con__TotRmsAbvGrd,con__Fireplaces,con__GarageCars,con__WoodDeckSF,con__ScreenPorch,con__PoolArea,con__YrSold
710,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,-1.497169,-0.819964,-0.211454,-0.93413,-0.951226,-2.36544,-0.752176,-0.270208,-0.068692,0.138777
1098,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,-0.364484,-0.819964,-0.211454,-0.318683,-0.951226,-1.026858,-0.752176,-0.270208,-0.068692,0.891994
1286,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.35687,1.10781,-0.211454,-0.318683,2.152216,0.311725,-0.752176,-0.270208,-0.068692,1.64521
992,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.638751,-0.819964,-0.211454,0.296763,0.600495,0.311725,1.155309,-0.270208,-0.068692,-0.614439
631,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.073361,-0.819964,-0.211454,-0.318683,0.600495,0.311725,0.492877,-0.270208,-0.068692,-0.614439


In [67]:
xtest.head()

Unnamed: 0,cat__Street_Grvl,cat__Street_Pave,cat__Utilities_AllPub,cat__Utilities_NoSeWa,cat__Neighborhood_Blmngtn,cat__Neighborhood_Blueste,cat__Neighborhood_BrDale,cat__Neighborhood_BrkSide,cat__Neighborhood_ClearCr,cat__Neighborhood_CollgCr,...,con__GrLivArea,con__BsmtFullBath,con__KitchenAbvGr,con__TotRmsAbvGrd,con__Fireplaces,con__GarageCars,con__WoodDeckSF,con__ScreenPorch,con__PoolArea,con__YrSold
880,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.809944,1.10781,-0.211454,-0.93413,-0.951226,0.311725,-0.752176,-0.270208,-0.068692,-0.614439
605,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.895747,-0.819964,-0.211454,0.296763,3.703938,0.311725,-0.752176,3.120637,-0.068692,0.891994
1166,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.339875,-0.819964,-0.211454,0.296763,-0.951226,1.650307,0.524802,-0.270208,-0.068692,1.64521
216,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-0.151273,1.10781,-0.211454,0.91221,-0.951226,0.311725,0.357198,-0.270208,-0.068692,0.138777
970,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.615769,-0.819964,-0.211454,-0.318683,-0.951226,-2.36544,-0.752176,-0.270208,-0.068692,-1.367655


In [68]:
ytrain.head()

Unnamed: 0,SalePrice
710,52000
1098,128000
1286,143000
992,187000
631,209500


In [69]:
ytest.head()

Unnamed: 0,SalePrice
880,157000
605,205000
1166,245350
216,210000
970,135000


In [70]:
model1.fit(xtrain,ytrain)

In [71]:
model1.score(xtrain,ytrain)

0.9234136079791515

In [72]:

model1.score(xtest,ytest)

0.819965503535727

In [73]:

model2 = Ridge(alpha=1)
model2.fit(xtrain,ytrain)

In [74]:
model2.score(xtrain,ytrain)

0.9104516274272434

In [75]:
model2.score(xtest,ytest)


0.8380363539935944


# Hyperparameter tuning

In [76]:

params = {"alpha": np.arange(start=0.1,stop=100,step=0.1)}
params

{'alpha': array([ 0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ,  1.1,
         1.2,  1.3,  1.4,  1.5,  1.6,  1.7,  1.8,  1.9,  2. ,  2.1,  2.2,
         2.3,  2.4,  2.5,  2.6,  2.7,  2.8,  2.9,  3. ,  3.1,  3.2,  3.3,
         3.4,  3.5,  3.6,  3.7,  3.8,  3.9,  4. ,  4.1,  4.2,  4.3,  4.4,
         4.5,  4.6,  4.7,  4.8,  4.9,  5. ,  5.1,  5.2,  5.3,  5.4,  5.5,
         5.6,  5.7,  5.8,  5.9,  6. ,  6.1,  6.2,  6.3,  6.4,  6.5,  6.6,
         6.7,  6.8,  6.9,  7. ,  7.1,  7.2,  7.3,  7.4,  7.5,  7.6,  7.7,
         7.8,  7.9,  8. ,  8.1,  8.2,  8.3,  8.4,  8.5,  8.6,  8.7,  8.8,
         8.9,  9. ,  9.1,  9.2,  9.3,  9.4,  9.5,  9.6,  9.7,  9.8,  9.9,
        10. , 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11. ,
        11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12. , 12.1,
        12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13. , 13.1, 13.2,
        13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.9, 14. , 14.1, 14.2, 14.3,
        14.4, 14.5, 14.6, 14.

# Ridge

In [77]:

base_model = Ridge()
gscv = GridSearchCV(estimator=base_model,param_grid=params,cv=5,scoring="r2")
gscv.fit(xtrain,ytrain)

In [78]:
gscv.best_params_

{'alpha': np.float64(7.8)}

In [79]:
gscv.best_score_

np.float64(0.8385565411255833)

In [80]:

best_ridge = gscv.best_estimator_

In [81]:
best_ridge.fit(xtrain,ytrain)

In [82]:
best_ridge.score(xtrain,ytrain)

0.8907842562806176

In [83]:
best_ridge.score(xtest,ytest)

0.836752497101852

# Lasso

In [84]:
base_model1 = Lasso()
gscv2 = GridSearchCV(estimator=base_model1,param_grid=params,scoring="r2",cv=5)
gscv2.fit(xtrain,ytrain)

In [85]:

gscv2.best_params_

{'alpha': np.float64(92.3)}

In [86]:

gscv2.best_score_

np.float64(0.8495421487884242)

In [87]:
best_lasso = gscv2.best_estimator_

In [88]:

best_lasso.fit(xtrain,ytrain)

In [89]:
best_lasso.score(xtrain,ytrain)

0.9162010187824222

In [90]:

best_lasso.score(xtest,ytest)

0.8259038641595365


# Model Evaluation

In [91]:
ypreds_train = best_ridge.predict(xtrain)
ypreds_test = best_ridge.predict(xtest)

In [92]:
mse1 = mean_squared_error(ytrain, ypreds_train)
mae1 = mean_absolute_error(ytrain, ypreds_train)
r2_1 = r2_score(ytrain, ypreds_train)
rmse1 = mse1 ** (1 / 2)
print(f"Traning Scores")
print(f"Mean Squared error: {mse1}")
print(f"Mean Absolute error: {mae1}")
print(f"Root Mean Squared error: {rmse1}")
print(f"R2 squared error: {r2_1}")

Traning Scores
Mean Squared error: 675928103.6414275
Mean Absolute error: 15909.191410711408
Root Mean Squared error: 25998.617340955414
R2 squared error: 0.8907842562806176


In [93]:

mse = mean_squared_error(ytest, ypreds_test)
mae = mean_absolute_error(ytest, ypreds_test)
r2 = r2_score(ytest, ypreds_test)
rmse = mse ** (1 / 2)
print(f"Testing Scores")
print(f"Mean Squared error: {mse}")
print(f"Mean Absolute error: {mae}")
print(f"Root Mean Squared error: {rmse}")
print(f"R2 squared error: {r2}")

Testing Scores
Mean Squared error: 1106379257.600668
Mean Absolute error: 17216.67459953093
Root Mean Squared error: 33262.279801611134
R2 squared error: 0.836752497101852


# Out of sample predictions: Considering Ridge model for final model predictions

In [95]:
url1 = "https://raw.githubusercontent.com/Sid-2862/Kaggle_competition/refs/heads/main/testhp.csv"
xnew = pd.read_csv(url1)
xnew.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [96]:
pre1

In [97]:

xnew_pre = pre1.transform(xnew)
xnew_pre.head()

Unnamed: 0,cat__Street_Grvl,cat__Street_Pave,cat__Utilities_AllPub,cat__Utilities_NoSeWa,cat__Neighborhood_Blmngtn,cat__Neighborhood_Blueste,cat__Neighborhood_BrDale,cat__Neighborhood_BrkSide,cat__Neighborhood_ClearCr,cat__Neighborhood_CollgCr,...,con__GrLivArea,con__BsmtFullBath,con__KitchenAbvGr,con__TotRmsAbvGrd,con__Fireplaces,con__GarageCars,con__WoodDeckSF,con__ScreenPorch,con__PoolArea,con__YrSold
0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.179256,-0.819964,-0.211454,-0.93413,-0.951226,-1.026858,0.365179,1.882709,-0.068692,1.64521
1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.354966,-0.819964,-0.211454,-0.318683,-0.951226,-1.026858,2.3844,-0.270208,-0.068692,1.64521
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.216136,-0.819964,-0.211454,-0.318683,0.600495,0.311725,0.939819,-0.270208,-0.068692,1.64521
3,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.168544,-0.819964,-0.211454,0.296763,0.600495,0.311725,2.121024,-0.270208,-0.068692,1.64521
4,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.448246,-0.819964,-0.211454,-0.93413,-0.951226,0.311725,-0.752176,2.313293,-0.068692,1.64521


In [98]:
ypreds = best_ridge.predict(xnew_pre)
ypreds = ypreds.round(2)
ypreds

array([118828.39, 152355.85, 166195.92, ..., 175094.68, 102462.55,
       222892.77])

In [99]:
submission = xnew[["Id"]]
submission.head()

Unnamed: 0,Id
0,1461
1,1462
2,1463
3,1464
4,1465


In [100]:
submission["Sale Price Predictions"] = ypreds
submission.head()

Unnamed: 0,Id,Sale Price Predictions
0,1461,118828.39
1,1462,152355.85
2,1463,166195.92
3,1464,182573.4
4,1465,195666.94


In [101]:
submission.to_csv("SalesPrice Predicted results.csv",index=False)