# Surprise Housing 

## Regression model using Advanced regression, Ridge and Lasso Regression and Regularization:

##### - Varun Mohite (Jan 2024 program - IIITB ML & AI)

### This assignment is a programming assignment wherein I have built an advanced linear regression model for the prediction of house sale prices for the company Surprise Housing.

### The solution is divided into the following sections:

1. Data understanding and exploration
2. Data cleaning and preparation
3. Model building and evaluation
4. Ridge, Lasso and Polynomial regression
5. Residual Analysis


In [2]:
# Importing necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline

In [3]:
# Read the training CSV for this assignment directly from its hosted location
DATA_URL = 'https://ml-course3-upgrad.s3.amazonaws.com/Assignment_+Advanced+Regression/train.csv'
data = pd.read_csv(DATA_URL)

In [4]:
# Data preprocessing, step 1: count the missing entries in every column
missing_per_column = data.isna().sum()
print(missing_per_column)

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64


In [7]:
# Filling missing values
# Impute numeric columns with their column mean. `numeric_only=True` matters because the
# frame presumably also holds non-numeric (text) columns: a mean is undefined for those,
# and recent pandas raises a TypeError (older releases only warn) when it is omitted.
# Columns without a numeric mean are not imputed by this step.
data = data.fillna(data.mean(numeric_only=True))

  data = data.fillna(data.mean())


In [None]:
# Plotting correlations on a heatmap

# Correlate the `data` frame loaded earlier in the notebook. Pearson correlation is only
# defined for numeric columns, so the frame is restricted to those before calling .corr().
corr_matrix = data.select_dtypes(include='number').corr()
plt.figure(figsize=(16,8))

sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Heatmap of All Variables')
plt.show()

In [10]:
# Splitting the data into features and target variable
# 'Id' looks like a row identifier rather than a predictor, so it is dropped together
# with the target — TODO confirm against the data dictionary.
# Non-numeric columns are one-hot encoded (as floats) because StandardScaler and the
# linear models used below only accept numeric input. A missing category value becomes
# an all-zero row across that column's indicator variables.
X = pd.get_dummies(data.drop(columns=['SalePrice', 'Id']), dtype=float)
y = data['SalePrice']

In [11]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Feature scaling: learn the scaling statistics on the training split only,
# then apply that same fitted transform to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Ridge Regression: 5-fold cross-validated grid search over the regularisation strength
ridge = Ridge()
ridge_params = dict(alpha=[0.01, 0.1, 1, 10, 100])
ridge_cv = GridSearchCV(estimator=ridge, param_grid=ridge_params, cv=5)
ridge_cv.fit(X_train_scaled, y_train)

In [None]:
# Take the estimator chosen by the grid search and predict the held-out test set
ridge_best = ridge_cv.best_estimator_
y_pred_ridge = ridge_best.predict(X_test_scaled)

In [None]:
# Ridge Regression performance on the test split: RMSE and R²
ridge_rmse = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
ridge_r2 = r2_score(y_test, y_pred_ridge)
print("Ridge Regression RMSE:", ridge_rmse)
print("Ridge Regression R²:", ridge_r2)

In [None]:
# Lasso Regression: 5-fold cross-validated grid search over the regularisation strength
lasso = Lasso()
lasso_params = dict(alpha=[0.01, 0.1, 1, 10, 100])
lasso_cv = GridSearchCV(estimator=lasso, param_grid=lasso_params, cv=5)
lasso_cv.fit(X_train_scaled, y_train)

In [None]:
# Take the estimator chosen by the grid search and predict the held-out test set
lasso_best = lasso_cv.best_estimator_
y_pred_lasso = lasso_best.predict(X_test_scaled)

In [None]:
# Lasso Regression performance on the test split: RMSE and R²
lasso_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
lasso_r2 = r2_score(y_test, y_pred_lasso)
print("Lasso Regression RMSE:", lasso_rmse)
print("Lasso Regression R²:", lasso_r2)

In [None]:
# Polynomial Regression
# The pipeline performs the degree-2 expansion itself, so it is fitted on the scaled
# features directly. Expanding by hand first and then again inside the pipeline would
# apply the expansion twice (an effective degree-4 feature set), which is not the
# intended model and makes the feature matrix explode in size.
poly_model = Pipeline([('poly', PolynomialFeatures(degree=2)),
                       ('linear', Ridge())])
poly_model.fit(X_train_scaled, y_train)
y_pred_poly = poly_model.predict(X_test_scaled)

In [None]:
# Polynomial Regression performance on the test split: RMSE and R²
poly_rmse = np.sqrt(mean_squared_error(y_test, y_pred_poly))
poly_r2 = r2_score(y_test, y_pred_poly)
print("Polynomial Regression RMSE:", poly_rmse)
print("Polynomial Regression R²:", poly_r2)


In [None]:
# Model Evaluation and Visualization
# Residuals for Ridge
# Residuals for Ridge: actual minus predicted on the test split, shown as a histogram with a KDE overlay
residuals_ridge = y_test - y_pred_ridge
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(residuals_ridge, kde=True, ax=ax)
ax.set_title("Residuals of Ridge Regression")
ax.set_xlabel("Residuals")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Residuals for Lasso: actual minus predicted on the test split,
# shown as a histogram with a KDE overlay
residuals_lasso = y_test - y_pred_lasso
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(residuals_lasso, kde=True, ax=ax)
ax.set_title("Residuals of Lasso Regression")
ax.set_xlabel("Residuals")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Residuals for Polynomial Regression: actual minus predicted on the test split,
# shown as a histogram with a KDE overlay
residuals_poly = y_test - y_pred_poly
plt.figure(figsize=(10, 6))
sns.histplot(residuals_poly, kde=True)
plt.title("Residuals of Polynomial Regression")
plt.xlabel("Residuals")
# Axis label restored to a properly terminated string, and the figure is rendered
# explicitly, matching the Ridge and Lasso residual cells.
plt.ylabel("Frequency")
plt.show()