# Regression

- ## RandomForestRegressor

In [29]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [30]:
# Load the California housing dataset. The returned object is dict-like, with
# entries such as 'data', 'target', 'feature_names' and 'DESCR'.
data = fetch_california_housing()
# Echo the whole object to inspect it. Note that this prints every field,
# including the long 'DESCR' text.
data

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [31]:
# Wrap the raw feature matrix in a DataFrame, labelling each column with the
# dataset's own feature names.
df = pd.DataFrame(data=data["data"], columns=data["feature_names"])

In [32]:
# Preview the feature table (feature columns only; the target is not attached yet).
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [33]:
# Attach the regression target as a new "target" column.
# NOTE: this modifies df in place, so any earlier display of df is now out of date.
df["target"] = data["target"]

In [34]:
# Preview again to confirm the "target" column is now part of the frame.
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [35]:
# (rows, columns) of the combined frame: the feature columns plus "target".
df.shape

(20640, 9)

In [36]:
# Column dtypes and non-null counts: a quick check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
 8   target      20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [37]:
# Separate the response from the predictors.
# n: the "target" column; m: every remaining column (the feature matrix).
n = df["target"]
m = df.drop(columns="target")

In [38]:
# Split into train and test sets: 30% of the rows are held out for testing,
# and random_state is fixed so the same split is produced on every run.
m_train, m_test, n_train, n_test = train_test_split(m, n, test_size = 0.3, random_state = 2)

In [39]:
# Training features: (rows, feature columns).
m_train.shape

(14448, 8)

In [40]:
# Test features: (rows, feature columns).
m_test.shape

(6192, 8)

In [41]:
# Training targets: one value per training row.
n_train.shape

(14448,)

In [42]:
# Test targets: one value per test row.
n_test.shape

(6192,)

In [43]:
# Train Model
# Only random_state is passed, so every other hyperparameter keeps its library
# default; the seed is fixed for reproducibility.
RFR = RandomForestRegressor(random_state = 2)
RFR.fit(m_train, n_train)

In [44]:
# Predict on the held-out test features.
predictions = RFR.predict(m_test)

# Predicted Prices
# NOTE: wrapping in pd.Series gives a fresh 0..N-1 index, whereas n_test keeps the
# original row labels. Compare the two displays by position, not by index label.
pd.Series(predictions)

0       2.826560
1       1.724240
2       0.934120
3       3.695781
4       3.679691
          ...   
6187    4.966939
6188    1.171870
6189    1.903490
6190    2.241140
6191    1.934160
Length: 6192, dtype: float64

In [45]:
# Actual Prices
# n_test is already a pandas Series (it comes from df["target"] via the
# train/test split), so it is displayed directly instead of being re-wrapped
# in pd.Series(...). The rendered output is the same.
n_test

10385    2.78700
1943     2.07600
7490     0.96700
16889    3.53800
11416    3.90500
          ...   
16911    5.00001
15066    1.12500
19410    1.74100
10126    2.27000
18568    1.94500
Name: target, Length: 6192, dtype: float64

In [46]:
# Evaluate the model using the Mean Absolute Error (MAE): the average absolute
# difference between predicted and actual prices.
mae = mean_absolute_error(n_test, predictions)
print(f"Mean Absolute Error: {mae}")

Mean Absolute Error: 0.33474160749354026


- Interpretation:
- The mean absolute error is ~0.335 in model units, which means the model's predictions are off by about $33,500 on average.
- On average, each prediction is about 0.335 units away from the true value.
- Since each unit = $100,000, this corresponds to an average error of about $33,500.
- This is the mean magnitude of the errors, ignoring direction (over- and under-predictions count equally).

In [47]:
# Residuals: actual minus predicted (positive where the model under-predicts).
r = n_test - predictions

In [48]:
# Inspect the signed residuals.
r

10385   -0.039560
1943     0.351760
7490     0.032880
16889   -0.157781
11416    0.225309
           ...   
16911    0.033071
15066   -0.046870
19410   -0.162490
10126    0.028860
18568    0.010840
Name: target, Length: 6192, dtype: float64

In [49]:
# Magnitude of each residual (sign dropped).
np.abs(r)

10385    0.039560
1943     0.351760
7490     0.032880
16889    0.157781
11416    0.225309
           ...   
16911    0.033071
15066    0.046870
19410    0.162490
10126    0.028860
18568    0.010840
Name: target, Length: 6192, dtype: float64

In [54]:
# Averaging the absolute residuals reproduces the MAE by hand; the value matches
# the one reported by mean_absolute_error.
np.mean(np.abs(r))

0.33474160749354026

In [51]:
# Same calculation in one step: the absolute error for each test row.
# Taking the absolute value makes every entry non-negative.
c = (n_test - predictions).abs()

In [52]:
# Inspect the per-row absolute errors.
c

10385    0.039560
1943     0.351760
7490     0.032880
16889    0.157781
11416    0.225309
           ...   
16911    0.033071
15066    0.046870
19410    0.162490
10126    0.028860
18568    0.010840
Name: target, Length: 6192, dtype: float64

In [53]:
# The mean of the absolute errors is, again, the MAE.
c.mean()

0.33474160749354026