In [3]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

# Load dataset
california = fetch_california_housing()

# Convert to DataFrame
df = pd.DataFrame(data=california.data, columns=california.feature_names)
df['MedHouseVal'] = california.target  # Add target column

print(df.head())


   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  
0    -122.23        4.526  
1    -122.22        3.585  
2    -122.24        3.521  
3    -122.25        3.413  
4    -122.25        3.422  


In [5]:
# Check for missing values
print(df.isnull().sum())


MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64


In [7]:
from sklearn.preprocessing import StandardScaler

# Create scaler
scaler = StandardScaler()

# Scale all features (excluding the target)
scaled_features = scaler.fit_transform(df.drop(columns=['MedHouseVal']))

# Convert back to DataFrame
df_scaled = pd.DataFrame(scaled_features, columns=california.feature_names)
df_scaled['MedHouseVal'] = df['MedHouseVal']  # Add target back

print(df_scaled.head())


     MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  2.344766  0.982143  0.628559  -0.153758   -0.974429 -0.049597  1.052548   
1  2.332238 -0.607019  0.327041  -0.263336    0.861439 -0.092512  1.043185   
2  1.782699  1.856182  1.155620  -0.049016   -0.820777 -0.025843  1.038503   
3  0.932968  1.856182  0.156966  -0.049833   -0.766028 -0.050329  1.038503   
4 -0.012881  1.856182  0.344711  -0.032906   -0.759847 -0.085616  1.038503   

   Longitude  MedHouseVal  
0  -1.327835        4.526  
1  -1.322844        3.585  
2  -1.332827        3.521  
3  -1.337818        3.413  
4  -1.337818        3.422  


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Features and target
X = df_scaled.drop(columns=['MedHouseVal'])
y = df_scaled['MedHouseVal']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)

print("Linear Regression R² Score:", r2_score(y_test, y_pred_lr))
print("Linear Regression RMSE:", mean_squared_error(y_test, y_pred_lr, squared=False))


Linear Regression R² Score: 0.575787706032451
Linear Regression RMSE: 0.7455813830127763




In [13]:
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)

print("Decision Tree R² Score:", r2_score(y_test, y_pred_dt))
print("Decision Tree RMSE:", mean_squared_error(y_test, y_pred_dt, squared=False))


Decision Tree R² Score: 0.6228111330554302
Decision Tree RMSE: 0.7030445773467542




In [15]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

print("Random Forest R² Score:", r2_score(y_test, y_pred_rf))
print("Random Forest RMSE:", mean_squared_error(y_test, y_pred_rf, squared=False))


Random Forest R² Score: 0.805024407701793
Random Forest RMSE: 0.5054678690929896




In [16]:
from sklearn.ensemble import GradientBoostingRegressor

gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
gb.fit(X_train, y_train)

y_pred_gb = gb.predict(X_test)

print("Gradient Boosting R² Score:", r2_score(y_test, y_pred_gb))
print("Gradient Boosting RMSE:", mean_squared_error(y_test, y_pred_gb, squared=False))


Gradient Boosting R² Score: 0.7756433164710084
Gradient Boosting RMSE: 0.5422167577867202




In [17]:
from sklearn.svm import SVR

svr = SVR(kernel='rbf')
svr.fit(X_train, y_train)

y_pred_svr = svr.predict(X_test)

print("SVR R² Score:", r2_score(y_test, y_pred_svr))
print("SVR RMSE:", mean_squared_error(y_test, y_pred_svr, squared=False))


SVR R² Score: 0.7289407597956462
SVR RMSE: 0.595985286730253




In [20]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Store models and predictions in a list
models = {
    'Linear Regression': y_pred_lr,
    'Decision Tree': y_pred_dt,
    'Random Forest': y_pred_rf,
    'Gradient Boosting': y_pred_gb,
    'SVR': y_pred_svr
}

# Initialize results list
results = []

# Calculate metrics
for name, preds in models.items():
    mse = mean_squared_error(y_test, preds)
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    results.append({'Model': name, 'MSE': mse, 'MAE': mae, 'R²': r2})

# Create and print DataFrame
results_df = pd.DataFrame(results).sort_values(by='R²', ascending=False)
print(results_df)


               Model       MSE       MAE        R²
2      Random Forest  0.255498  0.327613  0.805024
3  Gradient Boosting  0.293999  0.371650  0.775643
4                SVR  0.355198  0.397763  0.728941
1      Decision Tree  0.494272  0.453784  0.622811
0  Linear Regression  0.555892  0.533200  0.575788
