# Regression Project

In [138]:
from warnings import filterwarnings
filterwarnings('ignore')

In [139]:
import pandas as pd

In [140]:
url = (
    "https://raw.githubusercontent.com/Sid-2862/datasets/refs/heads/main/training_set.csv")
df = pd.read_csv(url,na_values=["","NA"],keep_default_na=False)
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [141]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

## Checking for missing values

In [142]:
df.isna().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [143]:
df.duplicated().sum()

np.int64(0)

# Seperate x & y features

In [144]:
x = df.drop(columns=["Id","SalePrice"])
y = df[["SalePrice"]]

In [145]:
cat = list(x.columns[x.dtypes=="object"])
con = list(x.columns[x.dtypes!="object"])

In [146]:
cat

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [147]:
con

['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

## Data Preprocessing & Data Cleaning

In [148]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler,OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [149]:
num_pipe = make_pipeline(SimpleImputer(strategy="constant"),StandardScaler())

In [150]:
cat_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder()
)

In [151]:
pre = ColumnTransformer([("con",num_pipe,con),("cat",cat_pipe,cat)]).set_output(
    transform="pandas"
)

In [152]:
pre

In [153]:
x_pre = pre.fit_transform(x)
x_pre.head()

Unnamed: 0,con__MSSubClass,con__LotFrontage,con__LotArea,con__OverallQual,con__OverallCond,con__YearBuilt,con__YearRemodAdd,con__MasVnrArea,con__BsmtFinSF1,con__BsmtFinSF2,...,cat__GarageType,cat__GarageFinish,cat__GarageQual,cat__GarageCond,cat__PavedDrive,cat__PoolQC,cat__Fence,cat__MiscFeature,cat__SaleType,cat__SaleCondition
0,0.073375,0.212877,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.514104,0.575425,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
1,-0.872563,0.645747,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.57075,1.171992,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
2,0.073375,0.299451,0.07348,0.651479,-0.5172,0.984752,0.830215,0.325915,0.092907,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
3,0.309859,0.068587,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.57075,-0.499274,-0.288653,...,5.0,2.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,0.0
4,0.073375,0.761179,0.375148,1.374795,-0.5172,0.951632,0.733308,1.366489,0.463568,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0


## Feature Selection => Forward Selection

In [154]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

In [155]:
base_model = LinearRegression()
for_sel = SequentialFeatureSelector(
    base_model,direction="forward",n_features_to_select=10
)
for_sel.fit(x_pre,y)

In [156]:
imp_cols = for_sel.get_feature_names_out()
imp_cols

array(['con__MSSubClass', 'con__OverallQual', 'con__BsmtFinSF1',
       'con__GrLivArea', 'con__Fireplaces', 'con__GarageCars',
       'cat__ExterQual', 'cat__BsmtQual', 'cat__BsmtExposure',
       'cat__KitchenQual'], dtype=object)

In [157]:
imp_cols[0]

'con__MSSubClass'

In [158]:
imp_cols[0].split("_")

['con', '', 'MSSubClass']

In [159]:
imp_cols[0].split("_")[-1]

'MSSubClass'

In [160]:
imp_features = []
for i in imp_cols:
    fea = i.split("_")[-1]
    imp_features.append(fea)

In [161]:
imp_features

['MSSubClass',
 'OverallQual',
 'BsmtFinSF1',
 'GrLivArea',
 'Fireplaces',
 'GarageCars',
 'ExterQual',
 'BsmtQual',
 'BsmtExposure',
 'KitchenQual']

In [162]:
x_sel = x[imp_features]
x_sel.head(2)

Unnamed: 0,MSSubClass,OverallQual,BsmtFinSF1,GrLivArea,Fireplaces,GarageCars,ExterQual,BsmtQual,BsmtExposure,KitchenQual
0,60,7,706,1710,0,2,Gd,Gd,No,Gd
1,20,6,978,1262,1,2,TA,Gd,Gd,TA


## Data Preprocessing 2 Stage

In [163]:
x_sel_cat = list(x_sel.columns[x_sel.dtypes=="object"])
x_sel_con = list(x_sel.columns[x_sel.dtypes!="object"])

In [164]:
cat_sel_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore",sparse_output=False),
)

In [165]:
con_sel_pipe = make_pipeline(SimpleImputer(strategy="median"),StandardScaler())

In [166]:
pre_sel = ColumnTransformer(
    [("con",con_sel_pipe,x_sel_con),("cat",cat_sel_pipe,x_sel_cat)]
).set_output(transform="pandas")

In [167]:
pre_sel

In [168]:
x_sel_pre = pre_sel.fit_transform(x_sel)
x_sel_pre.head(2)

Unnamed: 0,con__MSSubClass,con__OverallQual,con__BsmtFinSF1,con__GrLivArea,con__Fireplaces,con__GarageCars,cat__ExterQual_Ex,cat__ExterQual_Fa,cat__ExterQual_Gd,cat__ExterQual_TA,...,cat__BsmtQual_Gd,cat__BsmtQual_TA,cat__BsmtExposure_Av,cat__BsmtExposure_Gd,cat__BsmtExposure_Mn,cat__BsmtExposure_No,cat__KitchenQual_Ex,cat__KitchenQual_Fa,cat__KitchenQual_Gd,cat__KitchenQual_TA
0,0.073375,0.651479,0.575425,0.370333,-0.951226,0.311725,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,-0.872563,-0.071836,1.171992,-0.482512,0.600495,0.311725,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


## split data into training and testing

In [169]:
from sklearn.model_selection import train_test_split
xtrain , xtest , ytrain , ytest = train_test_split(
    x_sel_pre , y , train_size= 0.8, random_state=21
)

In [170]:
xtrain.head()

Unnamed: 0,con__MSSubClass,con__OverallQual,con__BsmtFinSF1,con__GrLivArea,con__Fireplaces,con__GarageCars,cat__ExterQual_Ex,cat__ExterQual_Fa,cat__ExterQual_Gd,cat__ExterQual_TA,...,cat__BsmtQual_Gd,cat__BsmtQual_TA,cat__BsmtExposure_Av,cat__BsmtExposure_Gd,cat__BsmtExposure_Mn,cat__BsmtExposure_No,cat__KitchenQual_Ex,cat__KitchenQual_Fa,cat__KitchenQual_Gd,cat__KitchenQual_TA
710,-0.636078,-2.241782,-0.973018,-1.497169,-0.951226,-2.36544,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1098,-0.163109,-1.518467,0.500854,-0.364484,-0.951226,-1.026858,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1286,-0.872563,-0.071836,0.274948,-0.35687,2.152216,0.311725,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
992,0.073375,-0.071836,0.20257,0.638751,0.600495,0.311725,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
631,1.492282,1.374795,-0.92038,0.073361,0.600495,0.311725,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [171]:
xtest.head()

Unnamed: 0,con__MSSubClass,con__OverallQual,con__BsmtFinSF1,con__GrLivArea,con__Fireplaces,con__GarageCars,cat__ExterQual_Ex,cat__ExterQual_Fa,cat__ExterQual_Gd,cat__ExterQual_TA,...,cat__BsmtQual_Gd,cat__BsmtQual_TA,cat__BsmtExposure_Av,cat__BsmtExposure_Gd,cat__BsmtExposure_Mn,cat__BsmtExposure_No,cat__KitchenQual_Ex,cat__KitchenQual_Fa,cat__KitchenQual_Gd,cat__KitchenQual_TA
880,-0.872563,-0.795151,1.176379,-0.809944,-0.951226,0.311725,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
605,0.073375,0.651479,0.022723,0.895747,3.703938,0.311725,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1166,-0.872563,1.374795,-0.973018,0.339875,-0.951226,1.650307,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
216,-0.872563,0.651479,1.101808,-0.151273,-0.951226,0.311725,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
970,-0.163109,-1.518467,-0.973018,-0.615769,-0.951226,-2.36544,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [172]:
ytrain.head()

Unnamed: 0,SalePrice
710,52000
1098,128000
1286,143000
992,187000
631,209500


In [173]:
ytest.head()

Unnamed: 0,SalePrice
880,157000
605,205000
1166,245350
216,210000
970,135000


## Build the Model 

### Ridge

In [174]:
from sklearn.linear_model import Ridge , Lasso

In [175]:
base_model = Ridge(alpha=1)
base_model.fit(xtrain,ytrain)

## Evaluation

In [176]:
base_model.score(xtrain,ytrain)

0.839368537159981

In [177]:
base_model.score(xtest,ytest)

0.7944371431346918

## Hyperparameter tuning

In [178]:
import numpy as np
params = {"alpha":np.arange(start=0.1,stop=100,step=0.1)}
params

{'alpha': array([ 0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ,  1.1,
         1.2,  1.3,  1.4,  1.5,  1.6,  1.7,  1.8,  1.9,  2. ,  2.1,  2.2,
         2.3,  2.4,  2.5,  2.6,  2.7,  2.8,  2.9,  3. ,  3.1,  3.2,  3.3,
         3.4,  3.5,  3.6,  3.7,  3.8,  3.9,  4. ,  4.1,  4.2,  4.3,  4.4,
         4.5,  4.6,  4.7,  4.8,  4.9,  5. ,  5.1,  5.2,  5.3,  5.4,  5.5,
         5.6,  5.7,  5.8,  5.9,  6. ,  6.1,  6.2,  6.3,  6.4,  6.5,  6.6,
         6.7,  6.8,  6.9,  7. ,  7.1,  7.2,  7.3,  7.4,  7.5,  7.6,  7.7,
         7.8,  7.9,  8. ,  8.1,  8.2,  8.3,  8.4,  8.5,  8.6,  8.7,  8.8,
         8.9,  9. ,  9.1,  9.2,  9.3,  9.4,  9.5,  9.6,  9.7,  9.8,  9.9,
        10. , 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11. ,
        11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12. , 12.1,
        12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13. , 13.1, 13.2,
        13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.9, 14. , 14.1, 14.2, 14.3,
        14.4, 14.5, 14.6, 14.

In [179]:
from sklearn.model_selection import GridSearchCV

In [180]:
model1 = Ridge()
gscv = GridSearchCV(estimator=model1,param_grid=params,cv=5,scoring="r2")
gscv.fit(xtrain,ytrain)

In [181]:
gscv.scoring

'r2'

In [182]:
gscv.best_params_

{'alpha': np.float64(16.3)}

In [183]:
gscv.best_score_

np.float64(0.8207561186155103)

In [184]:
best_ridge = gscv.best_estimator_

In [185]:
best_ridge.fit(xtrain,ytrain)

In [186]:
best_ridge.score(xtrain,ytrain)

0.8381317071817362

In [187]:
best_ridge.score(xtest,ytest)

0.7971245281733258

In [188]:
from sklearn.model_selection import cross_val_score

In [189]:
scores = cross_val_score(best_ridge,xtrain,ytrain,cv=5,scoring="r2")

In [190]:
scores

array([0.63672531, 0.83892538, 0.86709148, 0.86344607, 0.89759236])

In [191]:
score = scores.mean()
score

np.float64(0.8207561186155103)

### Lasso

In [192]:
base_model2 = Lasso(alpha=1)
base_model2.fit(xtrain,ytrain)


In [193]:
base_model2.score(xtrain,ytrain)

0.8393788939011393

In [194]:
base_model2.score(xtest,ytest)

0.7941055583221581

In [195]:
model2 = Lasso()
gscv2 = GridSearchCV(estimator=model2,param_grid=params,cv=5,scoring="r2")

In [196]:
gscv2.fit(xtrain,ytrain)

In [197]:
gscv2.best_params_

{'alpha': np.float64(26.400000000000002)}

In [198]:
gscv2.best_score_

np.float64(0.8189563109443521)

## Ridge model is perfroming better than lasso model by 83%, Lets Consider Ridge Model For final predictions

In [199]:
url2 = ("https://raw.githubusercontent.com/Sid-2862/datasets/refs/heads/main/testing_set.csv")
xnew = pd.read_csv(url2,na_values=["","NA"],keep_default_na=False )

In [200]:
xnew.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [201]:
xnew_pre = pre.transform(xnew)
xnew_pre.head()

Unnamed: 0,con__MSSubClass,con__LotFrontage,con__LotArea,con__OverallQual,con__OverallCond,con__YearBuilt,con__YearRemodAdd,con__MasVnrArea,con__BsmtFinSF1,con__BsmtFinSF2,...,cat__GarageType,cat__GarageFinish,cat__GarageQual,cat__GarageCond,cat__PavedDrive,cat__PoolQC,cat__Fence,cat__MiscFeature,cat__SaleType,cat__SaleCondition
0,-0.872563,0.645747,0.110763,-0.795151,0.381743,-0.340077,-1.15638,-0.57075,0.053428,0.604293,...,1.0,2.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
1,-0.872563,0.674605,0.37585,-0.071836,0.381743,-0.43944,-1.30174,0.027027,1.051363,-0.288653,...,1.0,2.0,4.0,4.0,2.0,2.0,2.0,0.0,8.0,4.0
2,0.073375,0.472599,0.332053,-0.795151,-0.5172,0.852269,0.6364,-0.57075,0.761852,-0.288653,...,1.0,0.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
3,0.073375,0.588031,-0.054002,-0.071836,0.381743,0.88539,0.6364,-0.460051,0.347326,-0.288653,...,1.0,0.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
4,1.492282,-0.421999,-0.552407,1.374795,-0.5172,0.686666,0.345679,-0.57075,-0.39619,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0


In [202]:
prices_predicted = best_ridge.predict(xnew_pre)
prices_predicted_predicted = prices_predicted_predicted.round(2)
prices_predicted_predicted

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- cat__Alley
- cat__BldgType
- cat__BsmtCond
- cat__BsmtExposure
- cat__BsmtFinType1
- ...
Feature names seen at fit time, yet now missing:
- cat__BsmtExposure_Av
- cat__BsmtExposure_Gd
- cat__BsmtExposure_Mn
- cat__BsmtExposure_No
- cat__BsmtQual_Ex
- ...
