In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score

In [2]:
df = pd.read_csv('house_data.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
missing_values = df.isnull().sum()
per_missing = (df.isnull().sum()/len(df)) * 100
total_missing = pd.concat([missing_values, per_missing], axis= 1)
total_missing.columns=['Missing', 'Percentage']
total_missing.sort_values('Percentage', ascending=False,inplace=True)
total_missing.head(20)

Unnamed: 0,Missing,Percentage
PoolQC,1453,99.520548
MiscFeature,1406,96.30137
Alley,1369,93.767123
Fence,1179,80.753425
MasVnrType,872,59.726027
FireplaceQu,690,47.260274
LotFrontage,259,17.739726
GarageQual,81,5.547945
GarageFinish,81,5.547945
GarageType,81,5.547945


In [4]:
df.drop(['PoolQC', 'MiscFeature', 'Alley', 'Fence'], axis =1, inplace= True)
numerical = df.select_dtypes(include=['int', 'float'])
categorical = df.select_dtypes(include=['object', 'category'])

for x in numerical:
    df[x].fillna(np.mean(df[x]),inplace =True)
    
for x in categorical:
    df[x].fillna(df[x].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[x].fillna(np.mean(df[x]),inplace =True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[x].fillna(df[x].mode()[0], inplace=True)


In [5]:
encoder = LabelEncoder()
categorical = df.select_dtypes(include=['object', 'category'])
for x in categorical:
    df[x] = encoder.fit_transform(df[x])
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,3,65.0,8450,1,3,3,0,4,...,0,0,0,0,0,2,2008,8,4,208500
1,2,20,3,80.0,9600,1,3,3,0,2,...,0,0,0,0,0,5,2007,8,4,181500
2,3,60,3,68.0,11250,1,0,3,0,4,...,0,0,0,0,0,9,2008,8,4,223500
3,4,70,3,60.0,9550,1,0,3,0,0,...,272,0,0,0,0,2,2006,8,0,140000
4,5,60,3,84.0,14260,1,0,3,0,2,...,0,0,0,0,0,12,2008,8,4,250000


In [6]:
x = df.drop(['Id','SalePrice'],axis =1)
y = df['SalePrice']
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2)

In [7]:
model1 = LinearRegression()
model1.fit(xtrain,ytrain)

In [8]:
pred1 = model1.predict(xtest)
combined1 = {'Observed':ytest, 'Predicted':pred1}
df2 = pd.DataFrame(combined1)
df2

Unnamed: 0,Observed,Predicted
160,162500,172548.623957
759,290000,293542.392454
474,251000,256940.669927
1123,118000,84219.163112
1024,287000,387876.077881
...,...,...
1210,189000,210216.248307
983,255900,254375.018045
229,192500,195475.028681
1137,94000,89835.098809


In [9]:
print(mean_absolute_error(ytest,pred1))

18802.467859207747


In [10]:
print(mean_absolute_percentage_error(ytest,pred1))

0.10504149065352206


In [11]:
print(r2_score(ytest,pred1))

0.8678047552856656


In [12]:
model2 = DecisionTreeRegressor()
model2.fit(xtrain,ytrain)

In [13]:
pred2 = model2.predict(xtest)
combined2 = {'Observed':ytest, 'Predicted':pred2}
df3 = pd.DataFrame(combined2)
df3

Unnamed: 0,Observed,Predicted
160,162500,165000.0
759,290000,372500.0
474,251000,236500.0
1123,118000,105900.0
1024,287000,325624.0
...,...,...
1210,189000,274970.0
983,255900,290000.0
229,192500,175900.0
1137,94000,84500.0


In [14]:
print(mean_absolute_error(ytest,pred2))
print(mean_absolute_percentage_error(ytest,pred2))
print(r2_score(ytest,pred2))

26670.59931506849
0.14786684527679758
0.7018227942148343


In [15]:
model3 = RandomForestRegressor()
model3.fit(xtrain,ytrain)
pred3 = model3.predict(xtest)
combined3 = {'Observed':ytest, 'Predicted':pred3}
df4 = pd.DataFrame(combined3)
df4

Unnamed: 0,Observed,Predicted
160,162500,166492.82
759,290000,315111.03
474,251000,257818.83
1123,118000,99878.50
1024,287000,362495.74
...,...,...
1210,189000,187871.45
983,255900,261831.83
229,192500,199324.21
1137,94000,89966.05


In [16]:
print(mean_absolute_error(ytest,pred3))
print(mean_absolute_percentage_error(ytest,pred3))
print(r2_score(ytest,pred3))

17141.17832191781
0.1004647720458285
0.8869398344507878


In [17]:
from sklearn.model_selection import GridSearchCV
param_gridq1 = {
    'fit_intercept':[True,False],
    'copy_X':[True,False],
    'n_jobs':[None, 1, 10, 100, 200, 500, -1],
    'positive':[True,False]
}
Lin_Grid_search = GridSearchCV(estimator=model1,param_grid=param_gridq1,cv=5)
Lin_Grid_search.fit(xtrain,ytrain)
print('Best parameters found:', Lin_Grid_search.best_params_)
print('Best cross- validation score:', Lin_Grid_search.best_score_)


Best parameters found: {'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False}
Best cross- validation score: 0.7633395752393093


56 fits failed out of a total of 280.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
56 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\user\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\user\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\user\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\linear_model\_base.py", line 647, in fit
    self.coef_ = optimize.nnls(X, y)[

In [18]:
param_grid2 = {
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    'splitter': ['best','random'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['auto', 'sqrt', 'log2', None],
    'max_leaf_nodes': [None, 10, 20, 50]}
tree_Grid_search = GridSearchCV(estimator=model2,param_grid=param_grid2,cv=5)
tree_Grid_search.fit(xtrain,ytrain)
print('Best parameters found:', tree_Grid_search.best_params_)
print('Best cross- validation score:', tree_Grid_search.best_score_)

7200 fits failed out of a total of 28800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
7200 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\user\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\user\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\user\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\user\AppData\Local\Programs\Python\Python312\L

Best parameters found: {'criterion': 'poisson', 'max_depth': 15, 'max_features': None, 'max_leaf_nodes': 20, 'min_samples_leaf': 1, 'min_samples_split': 10, 'splitter': 'best'}
Best cross- validation score: 0.7858570822275327
