## Imports

In [1]:
import pandas as pd
import time
from tqdm import tqdm

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

## Load the Data

In this special case, the dataset is included as part of Scikit-Learn. Convenient.

In [2]:
california_housing = fetch_california_housing()

### Transform the Data

However, the data is not immediately loaded in a DataFrame format we are familiar with. So let's create a DataFrame with it. We also have to explicitly add the target variable to the DataFrame.

In [3]:
df = pd.DataFrame(california_housing.data, columns=california_housing.feature_names)
df['MEDV'] = california_housing.target

## Training & Test Split

In [5]:
X = df.drop('MEDV', axis=1)
y = df['MEDV']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Initial Model Training

In [6]:
model = LinearRegression()
model = model.fit(X_train, y_train)

### Make Predictions

In [7]:
y_pred = model.predict(X_test)

## Evaluation

In [8]:
eval_df = X_test.copy()
eval_df["MEDV_actual"] = y_test
eval_df["MEDV_predicted"] = y_pred
eval_df["error"] = abs(eval_df["MEDV_actual"] - eval_df["MEDV_predicted"])
eval_df.sort_values(by="error")

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MEDV_actual,MEDV_predicted,error
3470,3.0526,24.0,4.528395,1.098765,3151.0,3.890123,34.29,-118.46,1.83300,1.833108,0.000108
3913,3.5313,31.0,4.730120,1.012048,1494.0,3.600000,34.20,-118.55,2.11800,2.118376,0.000376
16755,3.1958,39.0,4.665169,1.083146,1738.0,3.905618,37.71,-122.46,2.32100,2.320323,0.000677
3550,6.8643,25.0,6.291572,0.933941,1166.0,2.656036,34.22,-118.64,3.33700,3.336308,0.000692
10207,3.7500,52.0,6.018957,1.308057,609.0,2.886256,33.88,-117.92,2.32500,2.325694,0.000694
...,...,...,...,...,...,...,...,...,...,...,...
459,1.1696,52.0,2.436000,0.944000,1349.0,5.396000,37.87,-122.25,5.00001,1.539815,3.460195
10454,2.3846,22.0,5.152866,1.146497,334.0,2.127389,33.48,-117.66,5.00001,1.503888,3.496122
13361,3.9696,25.0,10.352941,1.205882,99.0,2.911765,34.01,-117.61,5.00000,1.477226,3.522774
19542,1.7679,39.0,5.000000,0.888889,22.0,2.444444,37.63,-120.92,4.50000,0.917906,3.582094


#### mean_absolute_error

In [9]:
mean_absolute_error(y_test, y_pred)

0.5430877513145164

#### mean_squared_error

In [10]:
mean_squared_error(y_test, y_pred)

0.5590854012179496

#### r-squared

In [11]:
r2_score(y_test, y_pred)

0.5965357164450474

### Evaluating Different Regression Models

In [14]:
linear_regression = LinearRegression()
decision_tree_regression = DecisionTreeRegressor()
random_forest_regression = RandomForestRegressor()
support_vector_regression = SVR()

regressors = [
    linear_regression,
    decision_tree_regression,
    random_forest_regression,
    support_vector_regression
]

model_metrics = []
for regressor in tqdm(regressors):
    
    # Train the regressor
    start_time = time.time()
    trained_model = regressor.fit(X_train, y_train)
    end_training_time = time.time()
    training_time_elapsed = end_training_time - start_time
    
    # Apply trained regressor to test set
    start_time = time.time()
    predictions = trained_model.predict(X_test)
    prediction_time = time.time()
    prediction_time_elapsed = prediction_time - start_time
    
    # Measure model performance
    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    
    # Record model metrics
    model_metrics.append({
        "model": trained_model.__class__.__name__,
        "training_time": training_time_elapsed,
        "prediction_time": prediction_time_elapsed,
        "mse": mse,
        "mae": mae,
        "r2": r2
    })
    
# Print model metrics table
pd.DataFrame(model_metrics)

100%|██████████| 4/4 [00:24<00:00,  6.25s/it]


Unnamed: 0,model,training_time,prediction_time,mse,mae,r2
0,LinearRegression,0.01552,0.00496,0.559085,0.543088,0.596536
1,DecisionTreeRegressor,0.17167,0.001775,0.547092,0.470608,0.605191
2,RandomForestRegressor,10.485118,0.094279,0.270878,0.340364,0.804521
3,SVR,9.503671,4.710272,1.430769,0.889593,-0.032515
