<a href="https://colab.research.google.com/github/Nithyasreecp/Indian-crime-hotspot-mapping-and-prediction/blob/main/MLO1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Loads the California housing dataset,

Trains Linear Regression with 5 independent variables (Model 1),

Trains Linear Regression with 7 independent variables (Model 2),

Augments Model 1 with the 2 extra variables via a residual-regression trick (a second linear model is fit on Model 1's training residuals using only the new variables), so the original 5-var model is not re-trained,

Trains simple ANNs (MLP) for 5 vars and 7 vars,

Compares models by RMSE and R², prints coefficients, and plots actual vs predicted.

In [1]:
# Save this as cali_regression_comparison.py or paste into a Jupyter cell.
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neural_network import MLPRegressor
import matplotlib.pyplot as plt

In [2]:
# 1) Load dataset
# With as_frame=True, `data.frame` is a pandas DataFrame holding the feature
# columns together with the target column. Work on a copy so the object
# returned by the loader is left untouched.
data = fetch_california_housing(as_frame=True)
df = data.frame.copy()
# The target column is already called 'MedHouseVal', so no rename is needed;
# just fail early with a clear message if that name is ever missing.
assert 'MedHouseVal' in df.columns, "expected target column 'MedHouseVal' in the dataset frame"

In [3]:
# 2) Feature sets
# Columns offered by the dataset:
#   MedInc, HouseAge, AveRooms, AveBedrms, Population, AveOccup, Latitude, Longitude
target = 'MedHouseVal'

# Model 1: five predictors.
features_5 = [
    'MedInc',
    'HouseAge',
    'AveRooms',
    'Population',
    'Latitude',
]

# Model 2: the same five plus AveBedrms and AveOccup (seven in total).
features_7 = [
    'MedInc',
    'HouseAge',
    'AveRooms',
    'AveBedrms',
    'Population',
    'AveOccup',
    'Latitude',
]

In [4]:
# 3) Hold out 20% of the rows for testing.
# The split is done once on the 7-feature superset; the 5-feature model later
# slices columns from the same rows, so both models see identical splits.
y = df[target].values
X = df[features_7]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)


In [5]:
# 4) Standardise the inputs
# Each scaler is fitted on the training rows only and then reused, unchanged,
# to transform the test rows.

# -- 5-feature view
scaler5 = StandardScaler()
X5_train = scaler5.fit_transform(X_train[features_5])
X5_test = scaler5.transform(X_test[features_5])

# -- 7-feature view
scaler7 = StandardScaler()
X7_train = scaler7.fit_transform(X_train[features_7])
X7_test = scaler7.transform(X_test[features_7])

In [6]:
# 5) Model 1: ordinary least squares on the 5 scaled features
# `fit` returns the estimator itself, so construction and fitting can be chained.
lr1 = LinearRegression().fit(X5_train, y_train)
pred_lr1 = lr1.predict(X5_test)

In [7]:

# 6) Model 2: ordinary least squares on the 7 scaled features
# `fit` returns the estimator itself, so construction and fitting can be chained.
lr2 = LinearRegression().fit(X7_train, y_train)
pred_lr2 = lr2.predict(X7_test)

In [8]:
# 7) Metrics helper
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`.

    Computed as the square root of scikit-learn's `mean_squared_error`.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Start the comparison table with the two linear baselines, scored on the test
# set. Later cells append further rows to this same list.
results = [
    {
        'model': 'Linear_5vars',
        'features': features_5,
        'rmse': rmse(y_test, pred_lr1),
        'r2': r2_score(y_test, pred_lr1),
    },
    {
        'model': 'Linear_7vars',
        'features': features_7,
        'rmse': rmse(y_test, pred_lr2),
        'r2': r2_score(y_test, pred_lr2),
    },
]


In [9]:
# 8) Extend Model 1 with two extra predictors while leaving lr1 itself untouched.
new_vars = ['AveBedrms', 'AveOccup']

# The extra columns get their own scaler, fitted on the training rows only and
# then applied to both the training and the test rows.
scaler_new = StandardScaler().fit(X_train[new_vars])
X_new_train = scaler_new.transform(X_train[new_vars])
X_new_test = scaler_new.transform(X_test[new_vars])


In [10]:
# Training-set residuals of lr1: the part of the target the 5-feature model
# does not explain (actual minus predicted).
pred_lr1_train = lr1.predict(X5_train)
residuals_train = np.subtract(y_train, pred_lr1_train)

In [11]:
# Fit a small linear regressor that predicts lr1's training residuals from the
# two new variables only. lr1 itself is not refitted.
adj = LinearRegression()
adj.fit(X_new_train, residuals_train)

# Augmented test prediction = original lr1 prediction + predicted residual correction.
pred_adj_test = adj.predict(X_new_test)
pred_lr1_aug = pred_lr1 + pred_adj_test

# Remove any row left behind by an earlier run of this cell, so re-running it
# does not duplicate the entry in the comparison table.
aug_model_name = 'Linear_5vars_plus2_via_residuals'
results = [row for row in results if row['model'] != aug_model_name]
results.append({
    'model': aug_model_name,
    'features': features_5 + new_vars,
    'rmse': rmse(y_test, pred_lr1_aug),
    'r2': r2_score(y_test, pred_lr1_aug)
})


In [12]:
# 9) ANN models (use sklearn MLPRegressor)
# Both networks share one architecture (hidden layers of 64 and 32 units, ReLU)
# and a fixed random_state; they differ only in the feature set they train on.
mlp5 = MLPRegressor(hidden_layer_sizes=(64,32), activation='relu', max_iter=500, random_state=42)
mlp7 = MLPRegressor(hidden_layer_sizes=(64,32), activation='relu', max_iter=500, random_state=42)

mlp5.fit(X5_train, y_train)
pred_mlp5 = mlp5.predict(X5_test)

mlp7.fit(X7_train, y_train)
pred_mlp7 = mlp7.predict(X7_test)

# Remove MLP rows left behind by an earlier run of this cell, so re-running it
# does not duplicate entries in the comparison table.
mlp_model_names = ('MLP_5vars', 'MLP_7vars')
results = [row for row in results if row['model'] not in mlp_model_names]
results.append({
    'model': 'MLP_5vars',
    'features': features_5,
    'rmse': rmse(y_test, pred_mlp5),
    'r2': r2_score(y_test, pred_mlp5)
})
results.append({
    'model': 'MLP_7vars',
    'features': features_7,
    'rmse': rmse(y_test, pred_mlp7),
    'r2': r2_score(y_test, pred_mlp7)
})

In [13]:
# 10) Build the comparison table, ordered by test RMSE ascending (best model
#     first), with a fresh 0..n-1 index, then print it.
results_df = (
    pd.DataFrame(results)
    .sort_values(by='rmse')
    .reset_index(drop=True)
)
print("\n=== Comparison results (lower RMSE better) ===")
print(results_df.loc[:, ['model', 'features', 'rmse', 'r2']])


=== Comparison results (lower RMSE better) ===
                              model  \
0                         MLP_7vars   
1                         MLP_5vars   
2                      Linear_7vars   
3  Linear_5vars_plus2_via_residuals   
4                      Linear_5vars   

                                            features      rmse        r2  
0  [MedInc, HouseAge, AveRooms, AveBedrms, Popula...  0.605893  0.719854  
1  [MedInc, HouseAge, AveRooms, Population, Latit...  0.719227  0.605247  
2  [MedInc, HouseAge, AveRooms, AveBedrms, Popula...  0.798137  0.513875  
3  [MedInc, HouseAge, AveRooms, Population, Latit...  0.803563  0.507243  
4  [MedInc, HouseAge, AveRooms, Population, Latit...  0.805643  0.504689  
