In [2]:
import pandas as pd
import numpy as np 
import seaborn as sns
import sklearn
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.model_selection import KFold


In [3]:
df = pd.read_csv(r'C:\Users\visha\OneDrive\Scaler Academy\Datasets\Housing.csv')
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [4]:
# defining the map function
def binary_map(x):
    return x.map({'yes': 1, "no": 0})

In [5]:
# applying the function to the housing variables list
category =  ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
df[category] = df[category].apply(binary_map)
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished


In [6]:
df.furnishingstatus.value_counts()

semi-furnished    227
unfurnished       178
furnished         140
Name: furnishingstatus, dtype: int64

In [7]:
# also, drop the first column of the resulting df (since n-1 dummy vars suffice)
f_status = pd.get_dummies(df['furnishingstatus'], drop_first = True)
f_status.head()

Unnamed: 0,semi-furnished,unfurnished
0,0,0
1,0,0
2,1,0
3,0,0
4,0,0


In [8]:
# concat the dummy variable df with the main df
new_df = pd.concat([df, f_status], axis = 1)
new_df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished,0,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished,1,0
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished,0,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished,0,0


In [9]:
# 'furnishingstatus' since we alreday have the dummy vars
new_df.drop(['furnishingstatus'], axis = 1, inplace = True)
new_df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1,0
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0,0


In [10]:
# train-test 70-30 split
df_train, df_test = train_test_split(new_df, 
                                     train_size = 0.7, 
                                     test_size = 0.3, 
                                     random_state = 100)

In [11]:
sc = StandardScaler()
numeric_vars = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking','price']
df_train[numeric_vars] = sc.fit_transform(df_train[numeric_vars])
df_test[numeric_vars] = sc.fit_transform(df_test[numeric_vars])

In [12]:
df_train.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
359,-0.575831,-0.736734,0.092755,-0.575844,-0.911674,1,0,0,0,0,0.318635,0,0,1
19,2.254239,0.632894,0.092755,1.533738,0.219752,1,0,0,0,1,0.318635,1,1,0
159,0.386778,-0.955291,0.092755,1.533738,-0.911674,1,1,1,0,1,-0.848672,0,0,0
35,1.828458,0.914591,0.092755,1.533738,2.482604,1,0,0,0,1,1.485941,0,0,0
28,2.003961,1.37599,2.811204,1.533738,0.219752,1,0,1,1,0,1.485941,0,0,1


In [13]:
df_test.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
265,-0.195156,-0.978244,-0.058038,-0.565752,0.23637,1,0,0,0,0,-0.711709,1,1,0
54,1.292163,0.321253,-0.058038,1.218544,0.23637,1,1,0,0,1,0.443937,0,1,0
171,0.232316,2.099313,-0.058038,-0.565752,-0.975026,1,0,0,0,0,0.443937,1,1,0
244,-0.120967,0.038029,-0.058038,-0.565752,0.23637,1,1,1,0,0,-0.711709,1,1,0
268,-0.205755,-0.116078,1.301706,-0.565752,0.23637,1,0,0,0,1,-0.711709,0,1,0


In [14]:
y_train = df_train.pop('price')
X_train = df_train

y_test = df_test.pop('price')
X_test = df_test

In [15]:
lm = LinearRegression()
lm.fit(X_train, y_train)

#eleminate Non usefull variables using rfe
rfe = RFE(lm, n_features_to_select=10)             
rfe = rfe.fit(X_train, y_train)

# evaluate the model using r2 score
y_pred = rfe.predict(X_test)
r2 = sklearn.metrics.r2_score(y_test, y_pred)
print(r2)

0.6544464733320661


In [16]:
#False means variable not include for model building
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

[('area', True, 1),
 ('bedrooms', False, 3),
 ('bathrooms', True, 1),
 ('stories', True, 1),
 ('mainroad', True, 1),
 ('guestroom', True, 1),
 ('basement', True, 1),
 ('hotwaterheating', True, 1),
 ('airconditioning', True, 1),
 ('parking', False, 2),
 ('prefarea', True, 1),
 ('semi-furnished', False, 4),
 ('unfurnished', True, 1)]

In [17]:
lm = LinearRegression()
lm.fit(X_train, y_train)

rfe = RFE(lm, n_features_to_select=12)             
rfe = rfe.fit(X_train, y_train)

# predict prices of X_test
y_pred = rfe.predict(X_test)
r2 = sklearn.metrics.r2_score(y_test, y_pred)
print(r2)

0.6761921009777221


In [18]:
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

[('area', True, 1),
 ('bedrooms', True, 1),
 ('bathrooms', True, 1),
 ('stories', True, 1),
 ('mainroad', True, 1),
 ('guestroom', True, 1),
 ('basement', True, 1),
 ('hotwaterheating', True, 1),
 ('airconditioning', True, 1),
 ('parking', True, 1),
 ('prefarea', True, 1),
 ('semi-furnished', False, 2),
 ('unfurnished', True, 1)]

In [19]:
folds = KFold(n_splits = 5, shuffle = True, random_state = 100)
cross_val_score(rfe ,X_train, y_train, cv = folds )

array([0.59603846, 0.71341684, 0.61359263, 0.6312411 , 0.62269298])