## HOUSE PRICE DATA

## Project Objective
The objective of this project is to build a regression model to predict house prices 
based on various features such as location, size, and other property-related attributes.


## FILE READINGS

In [104]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [105]:
import warnings
warnings.filterwarnings("ignore")

In [106]:
path = r"D:\ML\training_set.csv"

In [107]:
df = pd.read_csv(path)

## Basic Data Quality Checks

In [108]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [109]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [110]:
n = df.isnull().sum()
n[n>0]

LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [111]:
df.duplicated().sum()

np.int64(0)

## SEPRATION OF X & Y

In [112]:
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [113]:
x = df.drop(['Id', 'SalePrice'], axis=1)
y = df['SalePrice']

In [114]:
x.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [115]:
y.head()

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

## CATEGORICAL & NUMERICAL DATA SEPERATION

In [116]:
cat = x.columns[x.dtypes == 'object']
num = x.columns[x.dtypes != 'object']

In [117]:
print(cat)
print("----------------")
print(num)

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')
----------------
Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'Bedr

## CREATE PIPELINE

In [118]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [119]:
num_pipeline1 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),   
    ('scaler', StandardScaler())                        
])


cat_pipeline1 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant' , fill_value ='Not_Available')),
    ('encoder', OrdinalEncoder())      
])

In [120]:
## Combine all Pipelines
pre =  ColumnTransformer([    ('num1' ,  num_pipeline1 , num  ) ,
                    ('cat1' , cat_pipeline1 , cat) ])

In [121]:
pre

0,1,2
,transformers,"[('num1', ...), ('cat1', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'Not_Available'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,


## APPLY TO DATA

In [122]:
x_pre = pre.fit_transform(x)

In [123]:
x_pre

array([[ 0.07337496, -0.22937175, -0.20714171, ...,  1.        ,
         8.        ,  4.        ],
       [-0.87256276,  0.4519361 , -0.09188637, ...,  1.        ,
         8.        ,  4.        ],
       [ 0.07337496, -0.09311018,  0.07347998, ...,  1.        ,
         8.        ,  4.        ],
       ...,
       [ 0.30985939, -0.18395123, -0.14781027, ...,  3.        ,
         8.        ,  4.        ],
       [-0.87256276, -0.09311018, -0.08016039, ...,  1.        ,
         8.        ,  4.        ],
       [-0.87256276,  0.22483348, -0.05811155, ...,  1.        ,
         8.        ,  4.        ]], shape=(1460, 79))

In [124]:
x_pre = pd.DataFrame(x_pre , columns = x.columns)

In [125]:
x_pre.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,0.073375,-0.229372,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.511418,0.575425,-0.288653,...,1.0,2.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,4.0
1,-0.872563,0.451936,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.57441,1.171992,-0.288653,...,1.0,2.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,4.0
2,0.073375,-0.09311,0.07348,0.651479,-0.5172,0.984752,0.830215,0.32306,0.092907,-0.288653,...,1.0,2.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,4.0
3,0.309859,-0.456474,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.57441,-0.499274,-0.288653,...,5.0,3.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,0.0
4,0.073375,0.633618,0.375148,1.374795,-0.5172,0.951632,0.733308,1.36457,0.463568,-0.288653,...,1.0,2.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,4.0


## FEATURE SELECTION METHOD

In [126]:
from sklearn.feature_selection   import SequentialFeatureSelector
from sklearn.linear_model  import LinearRegression
model  =  LinearRegression()
sel  =  SequentialFeatureSelector(estimator  =  model  ,  
                                n_jobs  = -1  ,
                                 cv  =  10  ,  
                                direction  =  'backward' )
 


In [127]:
sel.fit(x_pre , y)

0,1,2
,estimator,LinearRegression()
,n_features_to_select,'auto'
,tol,
,direction,'backward'
,scoring,
,cv,10
,n_jobs,-1

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [128]:
features =sel.get_feature_names_out()
features

array(['MSSubClass', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LandContour', 'LandSlope', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'RoofMatl', 'Exterior2nd',
       'MasVnrArea', 'ExterCond', 'BsmtExposure', 'BsmtFinType1',
       'BsmtFinSF2', 'TotalBsmtSF', 'Electrical', 'LowQualFinSF',
       'BsmtHalfBath', 'FullBath', 'BedroomAbvGr', 'KitchenAbvGr',
       'TotRmsAbvGrd', 'Functional', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCond', 'OpenPorchSF', 'EnclosedPorch',
       '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscFeature', 'YrSold',
       'SaleCondition'], dtype=object)

In [129]:
a  =  ['Street', 'Utilities', 'Neighborhood', 'BldgType', 'HouseStyle',
       'RoofMatl', 'Exterior1st', 'MasVnrType', 'ExterQual', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'HeatingQC', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish',
       'PavedDrive', 'MiscFeature', 'SaleCondition', 'MSSubClass',
       'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'MasVnrArea',
       'BsmtUnfSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
       'BsmtFullBath', 'KitchenAbvGr', 'Fireplaces', 'GarageCars',
       'WoodDeckSF', 'ScreenPorch', 'PoolArea', 'YrSold']
b  =  ['MSSubClass', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LandContour', 'LandSlope', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'MasVnrArea', 'ExterCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF',
       'Heating', 'Electrical', 'LowQualFinSF', 'BedroomAbvGr',
       'KitchenAbvGr', 'TotRmsAbvGrd', 'Functional', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCond', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'MiscFeature',
       'YrSold', 'SaleCondition']
c =  []
for i in  a : 
    if  i  not  in b  :  
        c.append(i)

In [130]:
c

['Utilities',
 'Neighborhood',
 'MasVnrType',
 'ExterQual',
 'BsmtQual',
 'BsmtCond',
 'HeatingQC',
 'KitchenQual',
 'FireplaceQu',
 'PavedDrive',
 'OverallCond',
 'YearBuilt',
 'BsmtUnfSF',
 '1stFlrSF',
 '2ndFlrSF',
 'GrLivArea',
 'BsmtFullBath',
 'Fireplaces',
 'GarageCars',
 'WoodDeckSF',
 'PoolArea']

In [131]:
len (features)

40

In [132]:
x_final=  x[features]

In [133]:
x_final

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,Alley,LandContour,LandSlope,Condition1,Condition2,BldgType,...,GarageFinish,GarageCond,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscFeature,YrSold,SaleCondition
0,60,65.0,8450,Pave,,Lvl,Gtl,Norm,Norm,1Fam,...,RFn,TA,61,0,0,0,0,,2008,Normal
1,20,80.0,9600,Pave,,Lvl,Gtl,Feedr,Norm,1Fam,...,RFn,TA,0,0,0,0,0,,2007,Normal
2,60,68.0,11250,Pave,,Lvl,Gtl,Norm,Norm,1Fam,...,RFn,TA,42,0,0,0,0,,2008,Normal
3,70,60.0,9550,Pave,,Lvl,Gtl,Norm,Norm,1Fam,...,Unf,TA,35,272,0,0,0,,2006,Abnorml
4,60,84.0,14260,Pave,,Lvl,Gtl,Norm,Norm,1Fam,...,RFn,TA,84,0,0,0,0,,2008,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,62.0,7917,Pave,,Lvl,Gtl,Norm,Norm,1Fam,...,RFn,TA,40,0,0,0,0,,2007,Normal
1456,20,85.0,13175,Pave,,Lvl,Gtl,Norm,Norm,1Fam,...,Unf,TA,0,0,0,0,0,,2010,Normal
1457,70,66.0,9042,Pave,,Lvl,Gtl,Norm,Norm,1Fam,...,RFn,TA,60,0,0,0,0,Shed,2010,Normal
1458,20,68.0,9717,Pave,,Lvl,Gtl,Norm,Norm,1Fam,...,Unf,TA,0,112,0,0,0,,2010,Normal


In [134]:
from sklearn.pipeline import Pipeline

## 

## CREATE PIPELINE FOR THIS DATA

In [135]:
num_pipeline2 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])


cat_pipeline3 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Not_Available')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])


In [136]:
x_final.columns

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LandContour', 'LandSlope', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'RoofMatl', 'Exterior2nd', 'MasVnrArea',
       'ExterCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF2',
       'TotalBsmtSF', 'Electrical', 'LowQualFinSF', 'BsmtHalfBath', 'FullBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Functional',
       'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCond',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscFeature', 'YrSold', 'SaleCondition'],
      dtype='object')

In [137]:
n = x_final.isnull().sum()

In [138]:
num2 = x_final.columns[x_final.dtypes != 'object']
cat3 = x_final.columns[x_final.dtypes == 'object']


In [139]:
print(num2)
print()
print(cat3)
print()



Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'MasVnrArea',
       'BsmtFinSF2', 'TotalBsmtSF', 'LowQualFinSF', 'BsmtHalfBath', 'FullBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageYrBlt',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'YrSold'],
      dtype='object')

Index(['Street', 'Alley', 'LandContour', 'LandSlope', 'Condition1',
       'Condition2', 'BldgType', 'HouseStyle', 'RoofMatl', 'Exterior2nd',
       'ExterCond', 'BsmtExposure', 'BsmtFinType1', 'Electrical', 'Functional',
       'GarageType', 'GarageFinish', 'GarageCond', 'MiscFeature',
       'SaleCondition'],
      dtype='object')



In [140]:
pre2 =  ColumnTransformer([    ('num2' ,  num_pipeline2 , num2  ) , 
                    ('cat3' , cat_pipeline3 , cat3) ,  
                    ])


In [141]:
pre2

0,1,2
,transformers,"[('num2', ...), ('cat3', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'Not_Available'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [142]:
x_final.columns

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LandContour', 'LandSlope', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'RoofMatl', 'Exterior2nd', 'MasVnrArea',
       'ExterCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF2',
       'TotalBsmtSF', 'Electrical', 'LowQualFinSF', 'BsmtHalfBath', 'FullBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Functional',
       'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCond',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscFeature', 'YrSold', 'SaleCondition'],
      dtype='object')

In [143]:
x_scaled  =  pre2.fit_transform(x_final)

In [144]:
x_scaled  =  x_scaled.toarray()

In [145]:
x_scaled  =  pd.DataFrame(x_scaled , columns  =  pre2.get_feature_names_out())

In [146]:

x_scaled.head()

Unnamed: 0,num2__MSSubClass,num2__LotFrontage,num2__LotArea,num2__OverallQual,num2__MasVnrArea,num2__BsmtFinSF2,num2__TotalBsmtSF,num2__LowQualFinSF,num2__BsmtHalfBath,num2__FullBath,...,cat3__MiscFeature_Not_Available,cat3__MiscFeature_Othr,cat3__MiscFeature_Shed,cat3__MiscFeature_TenC,cat3__SaleCondition_Abnorml,cat3__SaleCondition_AdjLand,cat3__SaleCondition_Alloca,cat3__SaleCondition_Family,cat3__SaleCondition_Normal,cat3__SaleCondition_Partial
0,0.073375,-0.220875,-0.207142,0.651479,0.514104,-0.288653,-0.459303,-0.120242,-0.241061,0.789741,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.872563,0.46032,-0.091886,-0.071836,-0.57075,-0.288653,0.466465,-0.120242,3.948809,0.789741,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.073375,-0.084636,0.07348,0.651479,0.325915,-0.288653,-0.313369,-0.120242,-0.241061,0.789741,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.309859,-0.44794,-0.096897,0.651479,-0.57075,-0.288653,-0.687324,-0.120242,-0.241061,-1.026041,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.073375,0.641972,0.375148,1.374795,1.366489,-0.288653,0.19968,-0.120242,-0.241061,0.789741,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## Train-Test Split

## Model Building
Data is split into training and testing sets to evaluate model performance.


In [147]:
from sklearn.model_selection  import train_test_split

In [148]:
x_train,  x_test  ,  y_train ,  y_test  =  train_test_split(x_scaled  ,  y  , 
                         test_size  =  0.2  , 
                         random_state  =  42)

In [149]:
print(x_train.shape  ,  y_train.shape) 
print(x_test.shape ,  y_test.shape  )

(1168, 144) (1168,)
(292, 144) (292,)


In [150]:

from  sklearn.linear_model import LinearRegression

In [151]:
linreg  =  LinearRegression()

In [152]:

linreg.fit(x_train ,  y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


## Model Evaluation

In [153]:
linreg.score(x_train ,  y_train)

0.8628084132703864

In [154]:
linreg.score(x_test  ,  y_test)

0.8096887896707345

## since testing accuracy is too less this is the scenario of overfitting


##  use  Ridge and lasso regression

In [155]:

from sklearn.linear_model import Ridge ,  Lasso 
from  sklearn.model_selection  import GridSearchCV

In [156]:

ridge  =  Ridge()
alpha  =  np.arange(start  =  0.5  , stop = 50  , step=  0.5)
para  =  {'alpha' :  alpha}
ridge_cv  =  GridSearchCV(ridge  ,  cv  =  5  , param_grid =  para ,  n_jobs  =  -1)

In [157]:
ridge.fit(x_train ,  y_train)

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [158]:
ridge.score(x_train ,  y_train)

0.8451252488384995

In [159]:
ridge.score(x_test  , y_test)

0.8101974237602794

In [160]:
ridge_cv.fit(x_train  ,  y_train)

0,1,2
,estimator,Ridge()
,param_grid,"{'alpha': array([ 0.5, ..., 49. , 49.5])}"
,scoring,
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,np.float64(49.5)
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [161]:
ridge_cv.best_params_

{'alpha': np.float64(49.5)}

In [162]:
ridge_cv.score(x_train , y_train)

0.8009809302932275

In [163]:
ridge_cv.score(x_test , y_test)

0.8082781457521532

## lasso regression

In [164]:
lasso  =  Lasso()
lasso.fit(x_train   , y_train)

0,1,2
,alpha,1.0
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [165]:

lasso.score(x_train,  y_train)

0.8628024047860922

In [166]:
lasso.score(x_test  ,  y_test)

0.8189998360893711

In [167]:
lasso  =  Lasso()
para  = {'alpha' :  np.arange(start  =  0.5  , stop  = 100  , step  =  0.5 )}
lasso_cv   = GridSearchCV(lasso  , param_grid  = para , cv  =  5  , n_jobs  =  -1 )

In [168]:

lasso_cv.fit(x_train ,  y_train)

0,1,2
,estimator,Lasso()
,param_grid,"{'alpha': array([ 0.5, ...  99.5])}"
,scoring,
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,np.float64(99.5)
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [169]:
lasso_cv.score(x_train ,  y_train)

0.8480691707397264

In [170]:

lasso_cv.score(x_test  ,  y_test)

0.8221911333673323

## Model Evaluation

In [171]:
from sklearn.metrics import (mean_absolute_error,
                             mean_absolute_percentage_error,
                             mean_squared_error,
                             root_mean_squared_error,
                             r2_score)
from  sklearn.model_selection import cross_val_score

In [172]:
def evaluate(model  ,  x ,  y):
    pred  =  model.predict(x)
    cv =  cross_val_score(model  ,  x  , y  ,  cv =  10 )
    MSE  =  mean_squared_error(y ,  pred)
    RMSE  = root_mean_squared_error(y ,  pred)
    MAE  = mean_absolute_error(y , pred)
    MAPE  =  mean_absolute_percentage_error(y , pred)
    r2 =  r2_score(y, pred)
 
    print(f'cv ,  {cv.mean()}'  )
    print(f'MSE -->  {MSE}')
    print(f'RMSE -->  {RMSE}')
    print(f'MAE -->  {MAE}')
    print(f'MAPE -->  {MAPE}')
    print(f'r2 -->  {r2}')

## Model Evaluation

-linear

In [173]:
## Training performance
evaluate(linreg , x_train , y_train)

cv ,  0.6784836446619551
MSE -->  818286130.7294347
RMSE -->  28605.701017969037
MAE -->  20234.520067544057
MAPE -->  0.12172279798137538
r2 -->  0.8628084132703864


In [174]:
evaluate(linreg , x_test , y_test)

cv ,  0.5657958342924865
MSE -->  1459749064.110066
RMSE -->  38206.662561784506
MAE -->  25781.706821223295
MAPE -->  0.15937651886534507
r2 -->  0.8096887896707345


In [175]:
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.01)
lasso.fit(x_train, y_train)

0,1,2
,alpha,0.01
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [176]:
## Evaluate Lasso
evaluate(lasso , x_train , y_train)

cv ,  0.6837745175753173
MSE -->  818286134.1252172
RMSE -->  28605.70107732403
MAE -->  20234.592505446006
MAPE -->  0.12172288063311376
r2 -->  0.8628084127010588


In [177]:
evaluate(lasso , x_test , y_test)

cv ,  0.5653043019574804
MSE -->  1389927961.7846298
RMSE -->  37281.73764438334
MAE -->  25479.810595360486
MAPE -->  0.15788424182396835
r2 -->  0.8187915449433869


In [178]:
evaluate(lasso_cv , x_train , y_train)

cv ,  0.73255698020845
MSE -->  906199085.3632116
RMSE -->  30103.140789014218
MAE -->  21118.58819092149
MAPE -->  0.1257733333740923
r2 -->  0.8480691707397264


In [179]:
evaluate(lasso_cv , x_test , y_test)

cv ,  0.6819453374203306
MSE -->  1363852009.6027915
RMSE -->  36930.36703856044
MAE -->  24550.343997081512
MAPE -->  0.1511177796263217
r2 -->  0.8221911333673323


## Ridge

In [180]:
evaluate(ridge , x_train, y_train)

cv ,  0.7273237198543606
MSE -->  923758255.8571893
RMSE -->  30393.39164781037
MAE -->  21026.652051185374
MAPE -->  0.12517175409184259
r2 -->  0.8451252488384995


In [181]:
evaluate(ridge , x_test , y_test)

cv ,  0.7178612108982904
MSE -->  1455847674.7231612
RMSE -->  38155.57200099563
MAE -->  25464.14059317802
MAPE -->  0.15640875564099432
r2 -->  0.8101974237602794


## Ridge grid serch cv

In [182]:
evaluate(ridge_cv , x_train, y_train)

cv ,  0.7516989434284161
MSE -->  1187059267.8011014
RMSE -->  34453.72647190869
MAE -->  22374.701248102163
MAPE -->  0.1308370243958335
r2 -->  0.8009809302932275


In [183]:
evaluate(ridge_cv , x_test , y_test)

cv ,  0.7976032249943416
MSE -->  1470569163.12777
RMSE -->  38348.00077093681
MAE -->  24217.47845552031
MAPE -->  0.14372217956792593
r2 -->  0.8082781457521532


-   amongst all  above  models lasso  grid search  cv  giving   - good  result  we will refer  this  model  for  future      prediction

In [184]:
lasso_cv

0,1,2
,estimator,Lasso()
,param_grid,"{'alpha': array([ 0.5, ...  99.5])}"
,scoring,
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,np.float64(99.5)
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


## WORK ON UNSEEN DATA

In [185]:
path = r"D:\ML\testing_set.csv"

In [186]:
sample  =  pd.read_csv(path)

In [187]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [188]:
pre2

0,1,2
,transformers,"[('num2', ...), ('cat3', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'Not_Available'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [189]:
sample_scaled = pre2.fit_transform(sample)

In [190]:
sample_scaled

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 58360 stored elements and shape (1459, 134)>

In [None]:
sample_scaled =  sample_scaled.toarray()

In [192]:
sample_scaled

array([[-0.87471081,  0.56732969,  0.36392912, ...,  0.        ,
         1.        ,  0.        ],
       [-0.87471081,  0.61596272,  0.89786065, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.06135085,  0.27553157,  0.80964587, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.87471081,  4.45797141,  2.05514965, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.64638939, -0.30806469,  0.12552719, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.06135085,  0.27553157, -0.03879049, ...,  0.        ,
         1.        ,  0.        ]], shape=(1459, 134))

In [193]:
sample_scaled =  pd.DataFrame(sample_scaled ,  columns  =  pre2.get_feature_names_out())

In [194]:
sample_scaled

Unnamed: 0,num2__MSSubClass,num2__LotFrontage,num2__LotArea,num2__OverallQual,num2__MasVnrArea,num2__BsmtFinSF2,num2__TotalBsmtSF,num2__LowQualFinSF,num2__BsmtHalfBath,num2__FullBath,...,cat3__MiscFeature_Gar2,cat3__MiscFeature_Not_Available,cat3__MiscFeature_Othr,cat3__MiscFeature_Shed,cat3__SaleCondition_Abnorml,cat3__SaleCondition_AdjLand,cat3__SaleCondition_Alloca,cat3__SaleCondition_Family,cat3__SaleCondition_Normal,cat3__SaleCondition_Partial
0,-0.874711,0.567330,0.363929,-0.751101,-0.563316,0.517537,-0.370716,-0.080483,-0.258160,-1.028720,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.874711,0.615963,0.897861,-0.054877,0.047057,-0.297689,0.639230,-0.080483,-0.258160,-1.028720,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.061351,0.275532,0.809646,-0.751101,-0.563316,-0.297689,-0.266784,-0.080483,-0.258160,0.773083,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.061351,0.470064,0.032064,-0.054877,-0.450284,-0.297689,-0.271303,-0.080483,-0.258160,0.773083,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.465443,-1.232092,-0.971808,1.337571,-0.563316,-0.297689,0.528520,-0.080483,-0.258160,0.773083,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2.401505,-2.302019,-1.591330,-1.447325,-0.563316,-0.297689,-1.129871,-0.080483,-0.258160,-1.028720,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1455,2.401505,-2.302019,-1.599808,-1.447325,-0.563316,-0.297689,-1.129871,-0.080483,-0.258160,-1.028720,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1456,-0.874711,4.457971,2.055150,-0.751101,-0.563316,-0.297689,0.401995,-0.080483,-0.258160,-1.028720,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1457,0.646389,-0.308065,0.125527,-0.751101,-0.563316,-0.297689,-0.302935,-0.080483,3.706631,-1.028720,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [195]:
sample.shape

(1459, 80)

In [196]:
x_train.shape

(1168, 144)

In [197]:
x_test.shape

(292, 144)

In [198]:
sample_scaled.shape

(1459, 134)

- Use ordinal encoder because unseen data contain less columns


## Conclusion
This project helped me understand:
- Data preprocessing techniques
- Regression algorithms
- Model evaluation metrics
