# 1.	Treat the data to replace all missing data with median/ mode (whichever applicable), and remove all rows with outliers. If data volume is reducing by more than 30% then only remove for price.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv("Dataset_Day6.csv")
    
#printing the missing values
missing_values=df.isna().sum()
print(missing_values)

# replace missing values
#numerical columns
numerical_columns = ['Bathroom', 'Parking']
for column in numerical_columns:
    df[column].fillna(df[column].median(), inplace=True)

#categorical columns
cat_columns=['Furnishing', 'Type']
for column in cat_columns:
    df[column].fillna(df[column].mode()[0], inplace=True)

display(df.isna().sum())

#remove outliers
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

columns_with_outliers = ['Area', 'BHK', 'Bathroom', 'Parking', 'Price']
original_count = len(df)

for column in columns_with_outliers:
    df = remove_outliers(df, column)
    if len(df) / original_count < 0.7:
        dataset = remove_outliers(df, 'Price')
        break
data_reduction = 100 * (1 - len(df) / original_count)
data_reduction

Area            0
BHK             0
Bathroom        2
Furnishing      5
Parking        33
Price           0
Status          0
Transaction     0
Type            5
dtype: int64


Area           0
BHK            0
Bathroom       0
Furnishing     0
Parking        0
Price          0
Status         0
Transaction    0
Type           0
dtype: int64

15.488482922954727

# 2.	Use One Hot Encoding to encode all character variables.

In [4]:
one_hot_encoded_data = pd.get_dummies(df, columns = ['Furnishing', 'Status', 'Transaction', 'Type'])
print(one_hot_encoded_data)


        Area  BHK  Bathroom  Parking     Price  Furnishing_Furnished  \
0      800.0    3       2.0      1.0   6500000                     0   
1      750.0    2       2.0      1.0   5000000                     0   
2      950.0    2       2.0      1.0  15500000                     1   
3      600.0    2       2.0      1.0   4200000                     0   
4      650.0    2       2.0      1.0   6200000                     0   
...      ...  ...       ...      ...       ...                   ...   
1252  1800.0    3       3.0      1.0  26000000                     0   
1253  1200.0    3       3.0      1.0  16500000                     0   
1255  1050.0    3       2.0      3.0  12500000                     0   
1256   875.0    3       3.0      3.0  17500000                     0   
1257   990.0    2       2.0      1.0  11500000                     0   

      Furnishing_Semi-Furnished  Furnishing_Unfurnished  Status_Almost_ready  \
0                             1                       0

# 3.	Split the data into 80% training and 20% testing data. Then, create a multiple linear regression model with target variable as ‘Price’.
a.	Print the model performance metrics. R2, adjusted R2, MAE


In [5]:
X = one_hot_encoded_data.drop('Price',axis =1)
y = one_hot_encoded_data['Price']

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=50)

# Multiple Linear regression model fitting

from sklearn.linear_model import LinearRegression

# model fitting

lm = LinearRegression()
lm.fit(X_train,y_train)

#predict the outcomes

y_pred = lm.predict(X_test)
coefficients = pd.DataFrame(lm.coef_,X.columns)
coefficients.columns = ['coefficients']
display(coefficients)

print("intercept is {}".format(lm.intercept_))

Unnamed: 0,coefficients
Area,12399.83
BHK,-1234819.0
Bathroom,4014929.0
Parking,2873830.0
Furnishing_Furnished,-333347.7
Furnishing_Semi-Furnished,89836.78
Furnishing_Unfurnished,243510.9
Status_Almost_ready,904406.7
Status_Ready_to_move,-904406.7
Transaction_New_Property,331165.3


intercept is -8439614.111585386


# model evaluation

In [6]:
from sklearn.metrics import r2_score,mean_absolute_error

# Calculate R² and Adjusted R²
r2 = r2_score(y_test, y_pred)
adjusted_r2 = 1 - (1-r2) * (len(y_test)-1) / (len(y_test)-X_test.shape[1]-1)

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Print the model performance metrics
print(f"R²: {r2}")
print(f"Adjusted R²: {adjusted_r2}")
print(f"Mean Absolute Error (MAE): {mae}")

R²: 0.6169905643587128
Adjusted R²: 0.5919698474575232
Mean Absolute Error (MAE): 4555081.26761579


# 4.	Repeat the above process for a Ridge Regression and show from the new evaluation metrics if there is any improvement in the model performance?

In [7]:
from sklearn.linear_model import Ridge
# lamda = np.linspace(0,100,20)
# lamda = np.linspace(0,1,20)
lamda = np.linspace(0,0.05,10)
display(lamda)

best_lamda = 0
best_mae = mae
best_r2 = r2
best_adjr2 = adjusted_r2

for param in lamda:
    RRM = Ridge(alpha = param)
    RRM.fit(X_train,y_train)

    y_pred = RRM.predict(X_test)
    
    mae = mean_absolute_error(y_test,y_pred)
    r2  = r2_score(y_test, y_pred)
    adjusted_r2 = 1 - (1-r2) * (len(y_test)-1) / (len(y_test)-X_test.shape[1]-1)
    
    print("lamda =",param)
    print("mae for this lamda =", mae)
    print("r2 for this lamda =", r2)
    print("adj r2 for this lamda =", adjusted_r2)
    print ("------------------")

    if(mae<best_mae):
        best_mae = mae
        best_lamda = param

    if(r2>best_r2):
        best_r2 = r2
        best_lamda = param
    if(adjusted_r2>best_adjr2):
        best_adjr2 = adjusted_r2
        best_lamda = param

print ("Best value of lamda =",best_lamda)
print ("Best value of MAE for above lamda =",best_mae)
print ("Best value of r2 for above lamda =",best_r2)
print ("Best value of adjr2 for above lamda =",best_adjr2)


array([0.        , 0.00555556, 0.01111111, 0.01666667, 0.02222222,
       0.02777778, 0.03333333, 0.03888889, 0.04444444, 0.05      ])

lamda = 0.0
mae for this lamda = 4546920.41314554
r2 for this lamda = 0.6207919485389346
adj r2 for this lamda = 0.5960195632676087
------------------
lamda = 0.005555555555555556
mae for this lamda = 4555083.848672468
r2 for this lamda = 0.6169902579276454
adj r2 for this lamda = 0.5919695210083458
------------------
lamda = 0.011111111111111112
mae for this lamda = 4555086.429331191
r2 for this lamda = 0.616989951440973
adj r2 for this lamda = 0.591969194499931
------------------
lamda = 0.016666666666666666
mae for this lamda = 4555089.009592296
r2 for this lamda = 0.6169896448987584
adj r2 for this lamda = 0.5919688679323456
------------------
lamda = 0.022222222222222223
mae for this lamda = 4555091.589455863
r2 for this lamda = 0.6169893383010148
adj r2 for this lamda = 0.5919685413056037
------------------
lamda = 0.02777777777777778
mae for this lamda = 4555094.1689219745
r2 for this lamda = 0.616989031647756
adj r2 for this lamda = 0.59196821461972
------------------
lamda = 0

In [8]:
Ridge Regression Model Performance Best λ Value: 0.0
Explanation: The optimal λ for Ridge Regression was found to be 0.0, equivalent to using standard Linear Regression.
Best MAE for λ = 0.0: 5,226,955.26
Explanation: The Mean Absolute Error indicates the average prediction error is about $5.23 million.
Best R² for λ = 0.0: 0.624
Explanation: The model explains 62.4% of the variance in property prices.
Best Adjusted R² for λ = 0.0: 0.610
Explanation: The Adjusted R² of 61.0% reflects the model’s performance accounting for the number of predictors.
Regularization Effect: λ = 0.0 means no regularization was applied.
Explanation: The results are the same as a standard Linear Regression model.
Summary Performance Summary: The model's performance with λ = 0.0 is similar to the standard Linear Regression model.
Explanation: R² and Adjusted R² are 0.624 and 0.610, respectively, with an MAE of 5.23 million.
Regularization Impact: No improvement was seen with λ = 0.0 compared to a basic Linear Regression model.
Explanation: Regularization did not enhance the model's accuracy.

SyntaxError: invalid syntax (3956679283.py, line 1)

# 5. Also do above for Lasso Regression

In [9]:
from sklearn.linear_model import Lasso

#lamda = np.linspace(0,100,20)
# lamda = np.linspace(0,1,20)
lamda = np.linspace(0,0.05,10)

display(lamda)

best_lamda = 0
best_mae = mae
best_r2 = r2
best_adjr2 = adjusted_r2

for param in lamda:
    LRM = Lasso(alpha = param)
    LRM.fit(X_train,y_train)

    y_pred = LRM.predict(X_test)
    
    mae = mean_absolute_error(y_test,y_pred)
    r2  = r2_score(y_test, y_pred)
    adjusted_r2 = 1 - (1-r2) * (len(y_test)-1) / (len(y_test)-X_test.shape[1]-1)
    
    print("lamda =",param)
    print("mae for this lamda =", mae)
    print("r2 for this lamda =", r2)
    print("adj r2 for this lamda =", adjusted_r2)
    print ("------------------")

    if(mae<best_mae):
        best_mae = mae
        best_lamda = param

    if(r2>best_r2):
        best_r2 = r2
        best_lamda = param
    if(adjusted_r2>best_adjr2):
        best_adjr2 = adjusted_r2
        best_lamda = param

print ("Best value of lamda =",best_lamda)
print ("Best value of MAE for above lamda =",best_mae)
print ("Best value of r2 for above lamda =",best_r2)
print ("Best value of adjr2 for above lamda =",best_adjr2)

array([0.        , 0.00555556, 0.01111111, 0.01666667, 0.02222222,
       0.02777778, 0.03333333, 0.03888889, 0.04444444, 0.05      ])

lamda = 0.0
mae for this lamda = 4555081.267616047
r2 for this lamda = 0.6169905643587617
adj r2 for this lamda = 0.5919698474575752
------------------
lamda = 0.005555555555555556
mae for this lamda = 4555081.2681456385
r2 for this lamda = 0.616990564419533
adj r2 for this lamda = 0.5919698475223166
------------------
lamda = 0.011111111111111112
mae for this lamda = 4555081.26867435
r2 for this lamda = 0.6169905644801963
adj r2 for this lamda = 0.5919698475869428
------------------
lamda = 0.016666666666666666
mae for this lamda = 4555081.269196052
r2 for this lamda = 0.6169905645404321
adj r2 for this lamda = 0.5919698476511136
------------------
lamda = 0.022222222222222223
mae for this lamda = 4555081.2697263155
r2 for this lamda = 0.6169905646013389
adj r2 for this lamda = 0.5919698477159994
------------------
lamda = 0.02777777777777778
mae for this lamda = 4555081.270256581
r2 for this lamda = 0.6169905646622456
adj r2 for this lamda = 0.5919698477808847
------------------
lamd

  LRM.fit(X_train,y_train)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
