In [2]:
# Import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error,mean_squared_error , r2_score


In [6]:
#Loding the data
housing= pd.read_csv("/Users/sumanshrestha/Documents/AI Class Omdena/machine-learning-introduction-makaisuman/data/boston_housing.csv")

In [7]:
#Since CAT.MEDV is not required removing it
housing= housing.drop(columns=['CAT. MEDV'])

In [8]:
#checking the data information
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  LSTAT    506 non-null    float64
 12  MEDV     506 non-null    float64
dtypes: float64(10), int64(3)
memory usage: 51.5 KB


In [9]:
# Converting all the data to float
# we need to convert the CHAS, RAD and TAX

housing["CHAS"] =housing["CHAS"].astype(float)  # for CHAS
housing["RAD"] =housing["RAD"].astype(float) # for RAD
housing["TAX"] =housing["TAX"].astype(float)  #For TAX

In [10]:
# Creating new features Room Age ratio
housing["Room_Age_Ratio"] = housing["RM"] / housing["AGE"] 

In [11]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   CRIM            506 non-null    float64
 1   ZN              506 non-null    float64
 2   INDUS           506 non-null    float64
 3   CHAS            506 non-null    float64
 4   NOX             506 non-null    float64
 5   RM              506 non-null    float64
 6   AGE             506 non-null    float64
 7   DIS             506 non-null    float64
 8   RAD             506 non-null    float64
 9   TAX             506 non-null    float64
 10  PTRATIO         506 non-null    float64
 11  LSTAT           506 non-null    float64
 12  MEDV            506 non-null    float64
 13  Room_Age_Ratio  506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB


In [12]:
# Standarizing  the data
scaler = StandardScaler()
scaled_data= scaler.fit_transform(housing)
scaled_df = pd.DataFrame(scaled_data, columns=housing.columns)


In [13]:
# Split my data into the features (X) and target y
X = scaled_df.drop(columns = ['MEDV'])
y = scaled_df['MEDV']

In [14]:
# Split the data into the train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [15]:
# Train the model
model = LinearRegression() # Innitialize the model
model.fit(X_train, y_train)

In [16]:
# Prediction
y_pred = model.predict(X_test)

In [17]:
# Evaluate with MAE and MSE
mae = mean_absolute_error(y_true=y_test,y_pred=y_pred) 
mse = mean_squared_error(y_true=y_test,y_pred=y_pred) #default=True 
print("MAE:",mae) 
print("MSE:",mse) 

MAE: 0.34002097506753326
MSE: 0.26886714533924727


# Performing hyperparameter tuning

### Ridge Regression

In [18]:
#Using Ridge Regression (L2 Regularization)
ridgeModel = Ridge(alpha=1.0)
ridgeModel.fit(X_train, y_train)
ridge_pred = ridgeModel.predict(X_test)

In [19]:
# Evaluating Rigie Model
ridge_mse = mean_squared_error(y_test, ridge_pred)
ridge_r2 = r2_score(y_test, ridge_pred)
print("MSE:",ridge_mse) 
print("R²:",ridge_r2) 

MSE: 0.2693165820499785
R²: 0.6899711791433153


### Lasso Regression (L1 Regularization)

In [20]:
lassoModel = Lasso(alpha=0.1)
lassoModel.fit(X_train, y_train)
lasso_pred = lassoModel.predict(X_test)

In [21]:
# Evaluating the Lasso Regression Model
lasso_mse = mean_squared_error(y_test, lasso_pred)
lasso_r2 = r2_score(y_test, lasso_pred)
print("MSE:",lasso_mse) 
print("R²:",lasso_r2) 

MSE: 0.3218623760351635
R²: 0.6294821055548869


## Adding the another features Taxation impact

In [22]:
housing["Tax_Value_Ratio"] = housing["TAX"] / housing["MEDV"]

In [23]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CRIM             506 non-null    float64
 1   ZN               506 non-null    float64
 2   INDUS            506 non-null    float64
 3   CHAS             506 non-null    float64
 4   NOX              506 non-null    float64
 5   RM               506 non-null    float64
 6   AGE              506 non-null    float64
 7   DIS              506 non-null    float64
 8   RAD              506 non-null    float64
 9   TAX              506 non-null    float64
 10  PTRATIO          506 non-null    float64
 11  LSTAT            506 non-null    float64
 12  MEDV             506 non-null    float64
 13  Room_Age_Ratio   506 non-null    float64
 14  Tax_Value_Ratio  506 non-null    float64
dtypes: float64(15)
memory usage: 59.4 KB


In [24]:
# Standarizing  the data
scaler = StandardScaler()
scaled_data= scaler.fit_transform(housing)
scaled_df = pd.DataFrame(scaled_data, columns=housing.columns)


In [25]:
# Split my data into the features (X) and target y
X = scaled_df.drop(columns = ['MEDV'])
y = scaled_df['MEDV']

In [26]:
# Split the data into the train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [27]:
# Train the model
model = LinearRegression() # Innitialize the model
model.fit(X_train, y_train)

In [28]:
# Prediction
y_pred = model.predict(X_test)

In [29]:
# Evaluate with MAE and MSE
mae = mean_absolute_error(y_true=y_test,y_pred=y_pred) 
mse = mean_squared_error(y_true=y_test,y_pred=y_pred) #default=True 
print("MAE:",mae) 
print("MSE:",mse) 

MAE: 0.3271480932036904
MSE: 0.24354791402405307


# Performing hyperparameter tuning

In [30]:
#Using Ridge Regression (L2 Regularization)
ridgeModel = Ridge(alpha=1.0)
ridgeModel.fit(X_train, y_train)
ridge_pred = ridgeModel.predict(X_test)


In [31]:
# Evaluating Rigie Model
ridge_mse = mean_squared_error(y_test, ridge_pred)
ridge_r2 = r2_score(y_test, ridge_pred)
print("MSE:",ridge_mse) 
print("R²:",ridge_r2) 

MSE: 0.24280500607060704
R²: 0.7204904757175293


# Lasso Regression (L1 Regularization)

In [33]:
lassoModel = Lasso(alpha=0.1)
lassoModel.fit(X_train, y_train)
lasso_pred = lassoModel.predict(X_test)

In [34]:
# Evaluating the Lasso Regression Model
lasso_mse = mean_squared_error(y_test, lasso_pred)
lasso_r2 = r2_score(y_test, lasso_pred)
print("MSE:",lasso_mse) 
print("R²:",lasso_r2) 

MSE: 0.29873401166526753
R²: 0.6561067548035987


## Adding the another Distance-Weighted Accessibility and LSTAT Inverse:

In [35]:
# Distance-Weighted Accessibility
housing["Distance_Accessibility"] = housing["DIS"] / (housing["RAD"] + 1)  
# LSTAT Inverse
housing["LSTAT_Inverse"] = 1 / (housing["LSTAT"] + 0.01)  # Avoid division by zero

In [36]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   CRIM                    506 non-null    float64
 1   ZN                      506 non-null    float64
 2   INDUS                   506 non-null    float64
 3   CHAS                    506 non-null    float64
 4   NOX                     506 non-null    float64
 5   RM                      506 non-null    float64
 6   AGE                     506 non-null    float64
 7   DIS                     506 non-null    float64
 8   RAD                     506 non-null    float64
 9   TAX                     506 non-null    float64
 10  PTRATIO                 506 non-null    float64
 11  LSTAT                   506 non-null    float64
 12  MEDV                    506 non-null    float64
 13  Room_Age_Ratio          506 non-null    float64
 14  Tax_Value_Ratio         506 non-null    fl

In [37]:
# Standarizing  the data
scaler = StandardScaler()
scaled_data= scaler.fit_transform(housing)
scaled_df = pd.DataFrame(scaled_data, columns=housing.columns)

In [38]:
# Split my data into the features (X) and target y
X = scaled_df.drop(columns = ['MEDV'])
y = scaled_df['MEDV']

In [39]:
# Split the data into the train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [40]:
# Train the model
model = LinearRegression() # Innitialize the model
model.fit(X_train, y_train)

In [41]:
# Prediction
y_pred = model.predict(X_test)

In [42]:
# Evaluate with MAE and MSE
mae = mean_absolute_error(y_true=y_test,y_pred=y_pred) 
mse = mean_squared_error(y_true=y_test,y_pred=y_pred) #default=True 
print("MAE:",mae) 
print("MSE:",mse) 

MAE: 0.2590150332390734
MSE: 0.1749221643212919


### Ridge Regression

In [43]:
#Using Ridge Regression (L2 Regularization)
ridgeModel = Ridge(alpha=1.0)
ridgeModel.fit(X_train, y_train)
ridge_pred = ridgeModel.predict(X_test)
# Evaluating Rigie Model
ridge_mse = mean_squared_error(y_test, ridge_pred)
ridge_r2 = r2_score(y_test, ridge_pred)
print("MSE:",ridge_mse) 
print("R²:",ridge_r2) 

MSE: 0.17396449319303564
R²: 0.7997375197433674


### Lasso Regression (L1 Regularization)

In [44]:
lassoModel = Lasso(alpha=0.1)
lassoModel.fit(X_train, y_train)
lasso_pred = lassoModel.predict(X_test)
# Evaluating the Lasso Regression Model
lasso_mse = mean_squared_error(y_test, lasso_pred)
lasso_r2 = r2_score(y_test, lasso_pred)
print("MSE:",lasso_mse) 
print("R²:",lasso_r2) 

MSE: 0.2099852214201584
R²: 0.7582715846952924
