## **California Housing**
- Regression in machine learning is a type of supervised learning where the goal is to predict a continuous target variable based on one or more input features. Below, I'll teach you about regression using three different datasets in Python, each focusing on different regression problems and evaluation metrics.

In [1]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
from sklearn import datasets

In [3]:
dir(datasets)

['__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__getattr__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_arff_parser',
 '_base',
 '_california_housing',
 '_covtype',
 '_kddcup99',
 '_lfw',
 '_olivetti_faces',
 '_openml',
 '_rcv1',
 '_samples_generator',
 '_species_distributions',
 '_svmlight_format_fast',
 '_svmlight_format_io',
 '_twenty_newsgroups',
 'clear_data_home',
 'dump_svmlight_file',
 'fetch_20newsgroups',
 'fetch_20newsgroups_vectorized',
 'fetch_california_housing',
 'fetch_covtype',
 'fetch_kddcup99',
 'fetch_lfw_pairs',
 'fetch_lfw_people',
 'fetch_olivetti_faces',
 'fetch_openml',
 'fetch_rcv1',
 'fetch_species_distributions',
 'get_data_home',
 'load_breast_cancer',
 'load_diabetes',
 'load_digits',
 'load_files',
 'load_iris',
 'load_linnerud',
 'load_sample_image',
 'load_sample_images',
 'load_svmlight_file',
 'load_svmlight_files',
 'load_wine',
 'make_biclusters',
 'make_blobs',
 'make_checkerboard',
 'make_circl

In [4]:
data = datasets.fetch_california_housing()

In [5]:
data

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [6]:
california_housing = pd.DataFrame(data["data"], columns = data["feature_names"])

In [7]:
california_housing

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [8]:
california_housing["target"] = data["target"]

In [9]:
california_housing.shape

(20640, 9)

In [10]:
california_housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
 8   target      20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [11]:
print(california_housing.to_string())

        MedInc  HouseAge    AveRooms  AveBedrms  Population     AveOccup  Latitude  Longitude   target
0       8.3252      41.0    6.984127   1.023810       322.0     2.555556     37.88    -122.23  4.52600
1       8.3014      21.0    6.238137   0.971880      2401.0     2.109842     37.86    -122.22  3.58500
2       7.2574      52.0    8.288136   1.073446       496.0     2.802260     37.85    -122.24  3.52100
3       5.6431      52.0    5.817352   1.073059       558.0     2.547945     37.85    -122.25  3.41300
4       3.8462      52.0    6.281853   1.081081       565.0     2.181467     37.85    -122.25  3.42200
5       4.0368      52.0    4.761658   1.103627       413.0     2.139896     37.85    -122.25  2.69700
6       3.6591      52.0    4.931907   0.951362      1094.0     2.128405     37.84    -122.25  2.99200
7       3.1200      52.0    4.797527   1.061824      1157.0     1.788253     37.84    -122.25  2.41400
8       2.0804      42.0    4.294118   1.117647      1206.0     2.026891 

In [12]:
m = california_housing.drop(["target"], axis = 1)

In [13]:
m.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [14]:
m.shape

(20640, 8)

In [15]:
n = california_housing["target"]

In [16]:
n.head()

0    4.526
1    3.585
2    3.521
3    3.413
4    3.422
Name: target, dtype: float64

In [17]:
n.shape

(20640,)

In [18]:
# splitting data into training and testing
m_train, m_test, n_train, n_test = train_test_split(m, n, test_size = 0.25, random_state = 42)

In [19]:
m_train.shape

(15480, 8)

In [20]:
m_test.shape

(5160, 8)

In [21]:
n_train.shape

(15480,)

In [22]:
n_test.shape

(5160,)

## **Model Training**

In [23]:
# Linear Regression
# Initialize model
from sklearn.linear_model import LinearRegression
lr_model = LinearRegression()

In [24]:
# Fit the Model
lr_model = lr_model.fit(m_train, n_train)

In [25]:
# Make Predictions
pred = lr_model.predict(m_test)

#### **1. Mean Absolute Error (MAE)**
- Mean Absolute Error is the average of the difference between the ground truth and the predicted values. 
- Strengths: Intuitive and easy to understand.
- Weaknesses: Treats all errors equally, regardless of magnitude

#### **2. Mean Squared Error (MSE)**
- It essentially finds the average of the squared difference between the target value and the value predicted by the regression model.
- Strengths: Highlights large errors.
- Weaknesses: Sensitive to outliers.

#### **3. R-squared (R²)**
- Strengths: Indicates model fit.
- Weaknesses: Cannot detect overfitting. Negative values suggest poor performance.

In [33]:
print(n_test.to_string())

20046    0.47700
3024     0.45800
15663    5.00001
20484    2.18600
9814     2.78000
13311    1.58700
7113     1.98200
7668     1.57500
18246    3.40000
5723     4.46600
20069    1.23200
6835     2.53900
11351    2.15100
20267    2.20500
7097     2.19800
6298     1.36200
696      1.78400
19607    1.87500
14173    1.39800
19638    1.37500
18332    4.25000
4691     4.05600
2323     1.38800
16880    5.00001
14521    1.62100
19833    0.51400
8383     1.51300
2647     0.94800
1456     2.04100
5678     3.61500
341      0.85100
4852     1.38400
14386    1.37500
18261    4.65000
16105    2.71800
9149     2.39400
16879    3.40900
6253     1.50400
11741    1.28100
7330     1.56000
20460    2.58100
2412     0.90500
19391    0.71300
11441    2.19700
13332    1.51000
3721     2.90100
14986    1.42000
14152    2.91500
5654     2.55400
2578     0.98100
19332    2.51900
3376     1.62100
15118    1.29400
9620     0.89400
7191     1.50500
19932    1.33800
13706    0.80400
6220     2.11100
14641    2.556

In [37]:
pred

array([0.72412832, 1.76677807, 2.71151581, ..., 1.72382152, 2.34689276,
       3.52917352])

In [29]:
# Evaluate Model Performance
mae = mean_absolute_error(n_test, pred)

mse = mean_squared_error(n_test, pred)

r2 = r2_score(n_test, pred)

In [32]:
print(f"Mean Absolute Error (mae): {mae}")
print(f"Mean Square Error (mse): {mse}")
print(f"R-Squared (r2): {r2}")

Mean Absolute Error (mae): 0.5296964012919458
Mean Square Error (mse): 0.5411287478470685
R-Squared (r2): 0.59
