In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score, accuracy_score

In [17]:
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
df = pd.DataFrame(housing.data, columns=housing.feature_names)
df


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [19]:
df['MedHouseVal'] = housing.target

In [21]:
df.isna().sum()

MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64

In [23]:
# Feature Scaling using Standardisation
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
print("Dataset after Standard Scaling:")
df_scaled

Dataset after Standard Scaling:


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,2.344766,0.982143,0.628559,-0.153758,-0.974429,-0.049597,1.052548,-1.327835,2.129631
1,2.332238,-0.607019,0.327041,-0.263336,0.861439,-0.092512,1.043185,-1.322844,1.314156
2,1.782699,1.856182,1.155620,-0.049016,-0.820777,-0.025843,1.038503,-1.332827,1.258693
3,0.932968,1.856182,0.156966,-0.049833,-0.766028,-0.050329,1.038503,-1.337818,1.165100
4,-0.012881,1.856182,0.344711,-0.032906,-0.759847,-0.085616,1.038503,-1.337818,1.172900
...,...,...,...,...,...,...,...,...,...
20635,-1.216128,-0.289187,-0.155023,0.077354,-0.512592,-0.049110,1.801647,-0.758826,-1.115804
20636,-0.691593,-0.845393,0.276881,0.462365,-0.944405,0.005021,1.806329,-0.818722,-1.124470
20637,-1.142593,-0.924851,-0.090318,0.049414,-0.369537,-0.071735,1.778237,-0.823713,-0.992746
20638,-1.054583,-0.845393,-0.040211,0.158778,-0.604429,-0.091225,1.778237,-0.873626,-1.058608


# Regression Algorithm Implementation

In [26]:
# Implement the following regression algorithms:
# Linear Regression
# Decision Tree Regressor
# Random Forest Regressor
# Gradient Boosting Regressor
# Support Vector Regressor (SVR)
#  For each algorithm:
# Provide a brief explanation of how it works.
# Explain why it might be suitable for this dataset.


In [32]:
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score

In [36]:
#  Linear Regression

X = df.drop(columns=['MedHouseVal'])  # Features
y = df['MedHouseVal']  # Target
# splitting dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Linear Regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predict
y_pred = lr.predict(X_test)
print('Linear Regression RMSE:', mean_squared_error(y_test, y_pred))

Linear Regression RMSE: 0.5558915986952441


In [40]:
# DecisionTreeRegressor

from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
print('Decision Tree RMSE:', mean_squared_error(y_test, y_pred))

Decision Tree RMSE: 0.495235205629094


In [47]:
# Random Forest Regressor

from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
print("Random Forest RMSE:", mean_squared_error(y_test, y_pred)

Random Forest RMSE: 0.2553684927247781


In [49]:
#  GradientBoostingRegressor

from sklearn.ensemble import GradientBoostingRegressor

gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
gb.fit(X_train, y_train)

y_pred = gb.predict(X_test)
print("Gradient Boosting RMSE:", mean_squared_error(y_test, y_pred))


Gradient Boosting RMSE: 0.2939973248643864


In [51]:
# Support Vector Regressor (SVR)

from sklearn.svm import SVR

svr = SVR(kernel='rbf')  # Radial Basis Function (RBF) kernel for non-linearity
svr.fit(X_train, y_train)

y_pred = svr.predict(X_test)
print("SVR RMSE:", mean_squared_error(y_test, y_pred))

SVR RMSE: 1.3320115421348737


# Model Evaluation and Comparison

In [56]:
# Function to evaluate model performance
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f'{model_name} Performance:')
    print(f'Mean Squared Error (MSE):',mse)
    print(f'Mean Absolute Error (MAE):',mae)
    print(f'R-squared Score (R²):',r2)
    print('-' * 50)
    
    return mse, mae, r2

# Define models
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=42),
    'Random Forest Regressor': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    'Support Vector Regressor (SVR)': SVR(kernel='rbf')
}

# Evaluate all models
results = {}
for name, model in models.items():
    results[name] = evaluate_model(model, X_train, X_test, y_train, y_test, name)

# Identify best and worst models
best_model = max(results, key=lambda x: results[x][2])  # Highest R² score
worst_model = min(results, key=lambda x: results[x][2])  # Lowest R² score

print(f'Best-performing model: {best_model}')
print(f'Worst-performing model: {worst_model}')

Linear Regression Performance:
Mean Squared Error (MSE): 0.5558915986952441
Mean Absolute Error (MAE): 0.5332001304956573
R-squared Score (R²): 0.575787706032451
--------------------------------------------------
Decision Tree Regressor Performance:
Mean Squared Error (MSE): 0.495235205629094
Mean Absolute Error (MAE): 0.45467918846899225
R-squared Score (R²): 0.622075845135081
--------------------------------------------------
Random Forest Regressor Performance:
Mean Squared Error (MSE): 0.2553684927247781
Mean Absolute Error (MAE): 0.32754256845930246
R-squared Score (R²): 0.8051230593157366
--------------------------------------------------
Gradient Boosting Regressor Performance:
Mean Squared Error (MSE): 0.2939973248643864
Mean Absolute Error (MAE): 0.3716425690425596
R-squared Score (R²): 0.7756446042829697
--------------------------------------------------
Support Vector Regressor (SVR) Performance:
Mean Squared Error (MSE): 1.3320115421348737
Mean Absolute Error (MAE): 0.85995