In [49]:
import pandas as pd
import numpy as np 
import seaborn as sns
import sklearn
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.model_selection import KFold


In [33]:
df = pd.read_csv(r'C:\Users\visha\OneDrive\Scaler Academy\Datasets\Housing.csv')
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [34]:
# defining the map function
def binary_map(x):
    return x.map({'yes': 1, "no": 0})

In [35]:
# applying the function to the housing variables list
category =  ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
df[category] = df[category].apply(binary_map)
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished


In [36]:
df.furnishingstatus.value_counts()

semi-furnished    227
unfurnished       178
furnished         140
Name: furnishingstatus, dtype: int64

In [37]:
# also, drop the first column of the resulting df (since n-1 dummy vars suffice)
f_status = pd.get_dummies(df['furnishingstatus'], drop_first = True)
f_status.head()

Unnamed: 0,semi-furnished,unfurnished
0,0,0
1,0,0
2,1,0
3,0,0
4,0,0


In [38]:
# concat the dummy variable df with the main df
new_df = pd.concat([df, f_status], axis = 1)
new_df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished,0,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished,1,0
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished,0,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished,0,0


In [39]:
# 'furnishingstatus' since we alreday have the dummy vars
new_df.drop(['furnishingstatus'], axis = 1, inplace = True)
new_df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1,0
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0,0


In [40]:
# train-test 70-30 split
df_train, df_test = train_test_split(new_df, 
                                     train_size = 0.7, 
                                     test_size = 0.3, 
                                     random_state = 100)

In [41]:
sc = StandardScaler()
numeric_vars = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking','price']
df_train[numeric_vars] = sc.fit_transform(df_train[numeric_vars])
df_test[numeric_vars] = sc.fit_transform(df_test[numeric_vars])

In [42]:
df_train.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
359,-0.575831,-0.736734,0.092755,-0.575844,-0.911674,1,0,0,0,0,0.318635,0,0,1
19,2.254239,0.632894,0.092755,1.533738,0.219752,1,0,0,0,1,0.318635,1,1,0
159,0.386778,-0.955291,0.092755,1.533738,-0.911674,1,1,1,0,1,-0.848672,0,0,0
35,1.828458,0.914591,0.092755,1.533738,2.482604,1,0,0,0,1,1.485941,0,0,0
28,2.003961,1.37599,2.811204,1.533738,0.219752,1,0,1,1,0,1.485941,0,0,1


In [43]:
df_test.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
265,-0.195156,-0.978244,-0.058038,-0.565752,0.23637,1,0,0,0,0,-0.711709,1,1,0
54,1.292163,0.321253,-0.058038,1.218544,0.23637,1,1,0,0,1,0.443937,0,1,0
171,0.232316,2.099313,-0.058038,-0.565752,-0.975026,1,0,0,0,0,0.443937,1,1,0
244,-0.120967,0.038029,-0.058038,-0.565752,0.23637,1,1,1,0,0,-0.711709,1,1,0
268,-0.205755,-0.116078,1.301706,-0.565752,0.23637,1,0,0,0,1,-0.711709,0,1,0


In [44]:
y_train = df_train.pop('price')
X_train = df_train

y_test = df_test.pop('price')
X_test = df_test

In [53]:
lm = LinearRegression()
lm.fit(X_train, y_train)

#eleminate Non usefull variables using rfe
rfe = RFE(lm, n_features_to_select=10)             
rfe = rfe.fit(X_train, y_train)

# evaluate the model using r2 score
y_pred = rfe.predict(X_test)
r2 = sklearn.metrics.r2_score(y_test, y_pred)
print(r2)

0.6544464733320661


In [47]:
#False means variable not include for model building
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

[('area', True, 1),
 ('bedrooms', False, 3),
 ('bathrooms', True, 1),
 ('stories', True, 1),
 ('mainroad', True, 1),
 ('guestroom', True, 1),
 ('basement', True, 1),
 ('hotwaterheating', True, 1),
 ('airconditioning', True, 1),
 ('parking', False, 2),
 ('prefarea', True, 1),
 ('semi-furnished', False, 4),
 ('unfurnished', True, 1)]

In [54]:
lm = LinearRegression()
lm.fit(X_train, y_train)

rfe = RFE(lm, n_features_to_select=12)             
rfe = rfe.fit(X_train, y_train)

# predict prices of X_test
y_pred = rfe.predict(X_test)
r2 = sklearn.metrics.r2_score(y_test, y_pred)
print(r2)

0.6761921009777221


In [55]:
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

[('area', True, 1),
 ('bedrooms', True, 1),
 ('bathrooms', True, 1),
 ('stories', True, 1),
 ('mainroad', True, 1),
 ('guestroom', True, 1),
 ('basement', True, 1),
 ('hotwaterheating', True, 1),
 ('airconditioning', True, 1),
 ('parking', True, 1),
 ('prefarea', True, 1),
 ('semi-furnished', False, 2),
 ('unfurnished', True, 1)]

In [56]:
cross_val_score(rfe ,X_train, y_train, cv = 5 )

array([0.67916446, 0.6937411 , 0.67618229, 0.61320101, 0.59720674])

In [56]:
folds = KFold(n_splits = 5, shuffle = True, random_state = 100)

In [57]:
hyper_params = [{'n_features_to_select': list(range(6, 12))}]


# step-3: perform grid search
# 3.1 specify model
lm = LinearRegression()
lm.fit(X_train, y_train)
rfe = RFE(lm)             

# 3.2 call GridSearchCV()
model_cv = GridSearchCV(estimator = rfe, 
                        param_grid = hyper_params, 
                        scoring= 'r2', 
                        cv = folds, 
                        verbose = 1,
                        return_train_score=True)      

# fit the model
model_cv.fit(X_train, y_train) 

Fitting 5 folds for each of 6 candidates, totalling 30 fits


GridSearchCV(cv=KFold(n_splits=5, random_state=100, shuffle=True),
             estimator=RFE(estimator=LinearRegression()),
             param_grid=[{'n_features_to_select': [6, 7, 8, 9, 10, 11]}],
             return_train_score=True, scoring='r2', verbose=1)

In [58]:
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_features_to_select,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.009227,0.000828,0.002403,0.000377,6,{'n_features_to_select': 6},0.51696,0.641059,0.566658,0.525232,...,0.531121,0.076571,6,0.633174,0.609031,0.60848,0.626895,0.570131,0.609542,0.021974
1,0.007105,0.000742,0.002103,0.000488,7,{'n_features_to_select': 7},0.515062,0.65392,0.561929,0.534141,...,0.561229,0.048709,5,0.64055,0.62002,0.614256,0.6386,0.632325,0.62915,0.010339
2,0.005962,0.000876,0.0019,0.000801,8,{'n_features_to_select': 8},0.55956,0.649045,0.585581,0.553356,...,0.581107,0.035789,4,0.671152,0.623158,0.665395,0.643505,0.666843,0.65401,0.018168
3,0.005401,0.000491,0.002304,0.0004,9,{'n_features_to_select': 9},0.593407,0.715944,0.594966,0.609818,...,0.619107,0.049252,3,0.676434,0.653627,0.67603,0.670737,0.671307,0.669627,0.008335
4,0.00546,0.000537,0.002016,2.8e-05,10,{'n_features_to_select': 10},0.598021,0.718125,0.598331,0.621352,...,0.628562,0.045576,2,0.68069,0.656889,0.679401,0.673486,0.680071,0.674107,0.008986
5,0.005011,0.000551,0.0022,0.0004,11,{'n_features_to_select': 11},0.599566,0.714858,0.613227,0.634492,...,0.637298,0.040483,1,0.693364,0.670373,0.686756,0.684109,0.68187,0.683294,0.007524


In [63]:
hyper_params = [{'n_features_to_select': list(range(6, 12))}]

In [64]:
hyper_params

[{'n_features_to_select': [6, 7, 8, 9, 10, 11]}]

In [78]:
en = ElasticNet(alpha = 0.01)
en.fit(X_train, y_train)
rfe = RFE(en) 

In [77]:
folds

KFold(n_splits=5, random_state=100, shuffle=True)

In [79]:
# 3.2 call GridSearchCV()
model_cv = GridSearchCV(estimator = rfe, 
                        param_grid = hyper_params, 
                        scoring= 'r2', 
                        cv = folds, 
                        verbose = 1,
                        return_train_score=True)      

# fit the model
model_cv.fit(X_train, y_train) 

Fitting 5 folds for each of 6 candidates, totalling 30 fits


GridSearchCV(cv=KFold(n_splits=5, random_state=100, shuffle=True),
             estimator=RFE(estimator=ElasticNet(alpha=0.01)),
             param_grid=[{'n_features_to_select': [6, 7, 8, 9, 10, 11]}],
             return_train_score=True, scoring='r2', verbose=1)

In [76]:
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_features_to_select,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.007977,0.000837,0.001902,0.000199,6,{'n_features_to_select': 6},0.516124,0.60784,0.482331,0.603608,...,0.562153,0.052525,6,0.612856,0.590735,0.574872,0.605682,0.598883,0.596606,0.013103
1,0.006103,0.000807,0.001518,0.00055,7,{'n_features_to_select': 7},0.522624,0.608441,0.495313,0.606026,...,0.566428,0.047788,5,0.617788,0.594306,0.582767,0.613743,0.609385,0.603598,0.0131
2,0.005209,0.00075,0.0014,0.000489,8,{'n_features_to_select': 8},0.52899,0.612701,0.494976,0.60562,...,0.568261,0.047391,4,0.618614,0.59783,0.588115,0.616545,0.612154,0.606652,0.01177
3,0.004647,0.000372,0.001716,0.000623,9,{'n_features_to_select': 9},0.532831,0.616145,0.494958,0.610806,...,0.571135,0.048493,3,0.620159,0.599393,0.590797,0.618985,0.61262,0.608391,0.011481
4,0.004309,0.000253,0.0019,0.000199,10,{'n_features_to_select': 10},0.533705,0.616458,0.497768,0.610806,...,0.571948,0.04754,2,0.621367,0.600037,0.594789,0.618985,0.612661,0.609568,0.010457
5,0.003807,0.000516,0.001405,0.000588,11,{'n_features_to_select': 11},0.533995,0.616458,0.497768,0.610806,...,0.572006,0.047494,1,0.62196,0.600037,0.594789,0.618985,0.612661,0.609686,0.010592


In [87]:
hyper_params = [{'alpha': [0.01, 0.05, 0.1, 0.5, 1]}]

In [88]:
en = Ridge()


In [89]:
# 3.2 call GridSearchCV()
model_cv = GridSearchCV(estimator = en, 
                        param_grid = hyper_params, 
                        scoring= 'r2', 
                        cv = folds, 
                        verbose = 1,
                        return_train_score=True) 

In [90]:
model_cv.fit(X_train, y_train) 

Fitting 5 folds for each of 5 candidates, totalling 25 fits


GridSearchCV(cv=KFold(n_splits=5, random_state=100, shuffle=True),
             estimator=Ridge(),
             param_grid=[{'alpha': [0.01, 0.05, 0.1, 0.5, 1]}],
             return_train_score=True, scoring='r2', verbose=1)

In [91]:
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.002991,0.0003,0.001706,0.000399,0.01,{'alpha': 0.01},0.599309,0.713072,0.613245,0.627437,...,0.634877,0.040216,5,0.694368,0.672234,0.688467,0.686242,0.683746,0.685011,0.007292
1,0.002413,0.000476,0.001705,0.000395,0.05,{'alpha': 0.05},0.599324,0.713054,0.613196,0.627622,...,0.634924,0.040198,4,0.694367,0.672234,0.688467,0.686242,0.683745,0.685011,0.007292
2,0.001992,0.000293,0.0015,0.000628,0.1,{'alpha': 0.1},0.599341,0.713032,0.613135,0.627849,...,0.634983,0.040176,3,0.694367,0.672233,0.688466,0.686241,0.683745,0.68501,0.007292
3,0.002002,2e-06,0.001499,0.000446,0.5,{'alpha': 0.5},0.599465,0.712843,0.612645,0.629558,...,0.635416,0.040013,2,0.694344,0.672211,0.68844,0.686209,0.683722,0.684986,0.007291
4,0.001595,0.000482,0.001424,0.000214,1.0,{'alpha': 1},0.599582,0.712584,0.612029,0.631448,...,0.635878,0.039841,1,0.694281,0.672149,0.688366,0.68612,0.683656,0.684914,0.007289


In [92]:
model_cv.best_estimator_

Ridge(alpha=1)

In [93]:
y_hat = model_cv.predict(X_test)