In [115]:
# import dependencies
import pandas as pd
import seaborn as sns

In [116]:
# Load the housing dataset; the relative path means the CSV is looked up in
# the notebook's current working directory.
df = pd.read_csv('Housing_new.csv')
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad_yes,guestroom_yes,basement_yes,hotwaterheating_yes,airconditioning_yes,prefarea_yes,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,3,2,1,0,0,0,1,1,0,0
1,12250000,8960,4,4,4,3,1,0,0,0,1,0,0,0
2,12250000,9960,3,2,2,2,1,0,1,0,0,1,1,0
3,12215000,7500,4,2,2,3,1,0,1,0,1,1,0,0
4,11410000,7420,4,1,2,2,1,1,1,0,1,0,0,0


In [117]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df.describe()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad_yes,guestroom_yes,basement_yes,hotwaterheating_yes,airconditioning_yes,prefarea_yes,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
count,80545.0,80545.0,80545.0,80545.0,80545.0,80545.0,80545.0,80545.0,80545.0,80545.0,80545.0,80545.0,80545.0,80545.0
mean,305232.0,16358.644795,2.998063,1.730821,1.731107,1.981166,0.501633,0.496455,0.500106,0.49792,0.49504,0.501757,0.002818,0.00221
std,399040.9,942.133777,1.411365,0.772322,0.77185,1.413513,0.5,0.499991,0.500003,0.499999,0.499979,0.5,0.053013,0.046958
min,274838.0,1650.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,274838.0,16435.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,274838.0,16435.0,3.0,2.0,2.0,2.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
75%,274838.0,16435.0,4.0,2.0,2.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
max,13300000.0,16435.0,6.0,4.0,4.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [118]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(80545, 14)

In [119]:
# Display a heatmap of the pairwise column correlations (currently commented out)
#sns.heatmap(df.corr(), annot=True)

In [120]:
# import standard scaler
from sklearn.preprocessing import StandardScaler
# NOTE(review): this standalone scaler is not referenced by any of the cells
# shown after this one (each pipeline builds its own StandardScaler) —
# confirm nothing else uses it before removing.
scaler = StandardScaler()

# preprocess categorical data
# one-hot encoding; drop_first=True keeps k-1 dummy columns per categorical column
# NOTE(review): the loaded CSV already contains encoded columns such as
# `mainroad_yes`, so this call is presumably a no-op on this file — verify.
# Rebinding `df` here also means re-running the cell operates on its own output.
df = pd.get_dummies(df, drop_first=True)
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad_yes,guestroom_yes,basement_yes,hotwaterheating_yes,airconditioning_yes,prefarea_yes,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,3,2,1,0,0,0,1,1,0,0
1,12250000,8960,4,4,4,3,1,0,0,0,1,0,0,0
2,12250000,9960,3,2,2,2,1,0,1,0,0,1,1,0
3,12215000,7500,4,2,2,3,1,0,1,0,1,1,0,0
4,11410000,7420,4,1,2,2,1,1,1,0,1,0,0,0


In [121]:
df.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking',
       'mainroad_yes', 'guestroom_yes', 'basement_yes', 'hotwaterheating_yes',
       'airconditioning_yes', 'prefarea_yes',
       'furnishingstatus_semi-furnished', 'furnishingstatus_unfurnished'],
      dtype='object')

In [122]:
# Target is the `price` column; the feature matrix is every remaining column.
y = df['price']
X = df.drop(columns='price')


In [123]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


def _scaled_pipeline(step_name, estimator):
    """Build a two-step pipeline: standardize the features, then fit `estimator`.

    `step_name` names the estimator step and therefore becomes the prefix of
    its hyperparameter grid keys (e.g. 'ridge' -> 'ridge__alpha').
    """
    return Pipeline([('scaler', StandardScaler()), (step_name, estimator)])


# Estimators with default settings; the grid search below tunes them.
ridge, lasso, elas, linear = Ridge(), Lasso(), ElasticNet(), LinearRegression()

# One pipeline per estimator, each with its own scaler instance.
ridge_pipe = _scaled_pipeline('ridge', ridge)
lasso_pipe = _scaled_pipeline('lasso', lasso)
elas_pipe = _scaled_pipeline('elas', elas)
linear_pipe = _scaled_pipeline('linear', linear)

# Search grids, keyed as '<pipeline step>__<parameter>'.
ridge_params = {'ridge__alpha': [0.1, 0.5, 1.0, 1.5]}
lasso_params = {'lasso__alpha': [0.1, 0.5, 1.0, 1.5]}
elas_params = {
    'elas__alpha': [0.1, 0.5, 1.0, 1.5],
    'elas__l1_ratio': [0.5, 0.7, 0.9],
}
linear_params = {'linear__fit_intercept': [True, False]}

# Display name -> (pipeline, grid); insertion order is the training order.
models = {
    'Ridge Regression': (ridge_pipe, ridge_params),
    'Lasso Regression': (lasso_pipe, lasso_params),
    'Elastic Net Regression': (elas_pipe, elas_params),
    'Linear Regression': (linear_pipe, linear_params),
}

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fitted grid-search objects, keyed by the same display names as `models`.
fitted_models = {}

# Run a 5-fold grid search per model on the training split only.
for name, (pipeline, grid) in models.items():
    print(f'Training {name}...')
    search = GridSearchCV(pipeline, grid, cv=5, verbose=1)
    # fit() returns the fitted search object itself.
    fitted_models[name] = search.fit(X_train, y_train)
    print(f'{name} trained successfully.')


Training Ridge Regression...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Ridge Regression trained successfully.
Training Lasso Regression...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Lasso Regression trained successfully.
Training Elastic Net Regression...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Elastic Net Regression trained successfully.
Training Linear Regression...
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Linear Regression trained successfully.


In [124]:
# import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score every fitted model on the held-out test split.
# Rows are collected first and the frame is built once, rather than enlarging
# an empty DataFrame row by row with .loc (which re-allocates on every insert
# and starts from columns that have no numeric dtype).
metric_records = []
for name, model in fitted_models.items():
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    metric_records.append({'MAE': mae, 'MSE': mse, 'R2': r2})

# One row per model (indexed by its display name), one column per metric.
model_metrics = pd.DataFrame(
    metric_records,
    index=list(fitted_models),
    columns=['MAE', 'MSE', 'R2'],
)

# display the model metrics dataframe
model_metrics

Unnamed: 0,MAE,MSE,R2
Ridge Regression,19708.471691,19617290000.0,0.816124
Lasso Regression,19705.763374,19618690000.0,0.816111
Elastic Net Regression,19522.140748,19016400000.0,0.821756
Linear Regression,19708.943138,19618860000.0,0.81611
