# import libraries

In [1]:
from sklearn.datasets import fetch_california_housing
import pandas as pd
import numpy as np


# Load the dataset

In [2]:
# Load the dataset with the loader's default arguments. As the output below
# shows, the returned object exposes 'data', 'target', 'frame',
# 'target_names', 'feature_names' and 'DESCR' entries.
california_housing = fetch_california_housing()
# Bare last expression: the notebook displays the whole loaded object.
california_housing

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

# create df

In [3]:
# Pull the feature matrix and the target vector out of the loaded dataset
data, target = california_housing.data, california_housing.target
# ...together with the labels that describe their columns
feature_names = california_housing.feature_names
target_names = california_housing.target_names

# Sanity check: the number of rows must agree between data and target, and
# the number of names must agree with the number of columns.
print(f"data.shape: {data.shape}")
print(f"target.shape: {target.shape}")
print(f"len(feature_names): {len(feature_names)}")
print(f"len(target_names): {len(target_names)}")

data.shape: (20640, 8)
target.shape: (20640,)
len(feature_names): 8
len(target_names): 1


In [4]:
# Column labels: the feature names followed by the target name
column_names = feature_names + target_names
# Append the 1-D target as one extra column to the right of the feature matrix
whole_data = np.column_stack((data, target))

# Wrap features + target in a single labelled DataFrame
df = pd.DataFrame(data=whole_data, columns=column_names)
# Show the first row transposed so every column name is visible at once
df.head(1).T


Unnamed: 0,0
MedInc,8.3252
HouseAge,41.0
AveRooms,6.984127
AveBedrms,1.02381
Population,322.0
AveOccup,2.555556
Latitude,37.88
Longitude,-122.23
MedHouseVal,4.526


In [5]:
# Summarise the DataFrame: index range, per-column non-null counts and dtypes,
# and memory usage (see the output below).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


# print info of df

In [6]:
# df.shape is (rows, columns); unpack it once and report both
n_rows, n_cols = df.shape
print(f"number of rows: {n_rows}")
print(f"number of columns: {n_cols}")


number of rows: 20640
number of columns: 9


In [7]:
# Range (max - min) of every column; shows how different the feature scales are
column_max = df.max(axis=0)
column_min = df.min(axis=0)
df_ranges = column_max - column_min
df_ranges

MedInc            14.500200
HouseAge          51.000000
AveRooms         141.062937
AveBedrms         33.733333
Population     35679.000000
AveOccup        1242.641026
Latitude           9.410000
Longitude         10.040000
MedHouseVal        4.850020
dtype: float64

# calculate cov and corr matrices
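> Differences between the two matrices:
> - The covariance matrix holds values on different scales and ranges, because each entry depends on the units of both features.
> - Because of this, its entries cannot be compared with one another directly.
> - The correlation matrix values are normalised.
> - All of them are on the same scale, in the range $[-1, 1]$.
> - This makes it easy to compare the values and judge how strong each relationship is.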

In [8]:
# Pairwise covariance and correlation between all columns of df
# (both are square DataFrames indexed by column name on each axis)
cov, corr = df.cov(), df.corr()

In [9]:
# Display the covariance matrix produced by df.cov()
cov

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
MedInc,3.609323,-2.84614,1.536568,-0.055858,10.40098,0.370289,-0.32386,-0.057765,1.508475
HouseAge,-2.84614,158.39626,-4.772882,-0.463718,-4222.271,1.724298,0.300346,-2.728244,1.533988
AveRooms,1.536568,-4.772882,6.121533,0.993868,-202.3337,-0.124689,0.562235,-0.136518,0.433826
AveBedrms,-0.055858,-0.463718,0.993868,0.224592,-35.52723,-0.030424,0.070575,0.01267,-0.025539
Population,10.400979,-4222.270582,-202.333712,-35.527225,1282470.0,821.712002,-263.137814,226.377839,-32.212487
AveOccup,0.370289,1.724298,-0.124689,-0.030424,821.712,107.870026,0.052492,0.051519,-0.284494
Latitude,-0.32386,0.300346,0.562235,0.070575,-263.1378,0.052492,4.562293,-3.957054,-0.355326
Longitude,-0.057765,-2.728244,-0.136518,0.01267,226.3778,0.051519,-3.957054,4.014139,-0.106274
MedHouseVal,1.508475,1.533988,0.433826,-0.025539,-32.21249,-0.284494,-0.355326,-0.106274,1.331615


In [10]:
# Display the correlation matrix produced by df.corr()
corr

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
MedInc,1.0,-0.119034,0.326895,-0.06204,0.004834,0.018766,-0.079809,-0.015176,0.688075
HouseAge,-0.119034,1.0,-0.153277,-0.077747,-0.296244,0.013191,0.011173,-0.108197,0.105623
AveRooms,0.326895,-0.153277,1.0,0.847621,-0.072213,-0.004852,0.106389,-0.02754,0.151948
AveBedrms,-0.06204,-0.077747,0.847621,1.0,-0.066197,-0.006181,0.069721,0.013344,-0.046701
Population,0.004834,-0.296244,-0.072213,-0.066197,1.0,0.069863,-0.108785,0.099773,-0.02465
AveOccup,0.018766,0.013191,-0.004852,-0.006181,0.069863,1.0,0.002366,0.002476,-0.023737
Latitude,-0.079809,0.011173,0.106389,0.069721,-0.108785,0.002366,1.0,-0.924664,-0.14416
Longitude,-0.015176,-0.108197,-0.02754,0.013344,0.099773,0.002476,-0.924664,1.0,-0.045967
MedHouseVal,0.688075,0.105623,0.151948,-0.046701,-0.02465,-0.023737,-0.14416,-0.045967,1.0


# find entries with max abs values

In [15]:
# Keep only the strictly upper triangle (k=1 zeroes the diagonal as well as
# everything below it) so each column pair is counted once, then drop the sign.
upper_corr = np.abs(np.triu(corr, k=1))

flat = upper_corr.ravel()
# Values of the 3 largest entries (np.partition does not order them)
top3 = np.partition(flat, -3)[-3:]
# Flat indices ordered from the largest magnitude to the smallest
indices = np.argsort(flat)[::-1]
top3_indices = indices[:3]
print("Flat indices:", top3_indices)
# Translate the flat indices back into (row, col) coordinates of the matrix
rows, cols = np.unravel_index(top3_indices, upper_corr.shape)
positions = list(zip(rows, cols))
print("Top 3 positions:", positions)


Flat indices: [61 21  8]
Top 3 positions: [(np.int64(6), np.int64(7)), (np.int64(2), np.int64(3)), (np.int64(0), np.int64(8))]


In [20]:
def find_big_values(matrix, k, verbose=True):
    """Return the positions of the ``k`` largest-magnitude off-diagonal entries.

    Only the strictly upper triangle is searched, so for a symmetric matrix
    (covariance, correlation) every pair of variables is considered exactly
    once and the diagonal (a variable paired with itself) is never reported.

    Parameters
    ----------
    matrix : array-like, 2-D
        Matrix to search, e.g. the result of ``DataFrame.corr()``.
    k : int
        Number of positions to return. Capped at the number of usable
        (non-NaN) upper-triangle entries.
    verbose : bool, default True
        Print the flat indices and the positions that were found.

    Returns
    -------
    list[tuple[int, int]]
        ``(row, col)`` positions as plain Python ints, ordered by decreasing
        absolute value; ties keep row-major order.

    Raises
    ------
    ValueError
        If ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    magnitudes = np.abs(np.asarray(matrix, dtype=float))
    n_rows, n_cols = magnitudes.shape

    # k=1 starts one step above the main diagonal. Selecting the candidates
    # explicitly (instead of zeroing the rest) keeps masked entries from ever
    # being returned when k exceeds the number of pairs.
    rows, cols = np.triu_indices(n_rows, k=1, m=n_cols)
    candidates = magnitudes[rows, cols]

    # NaN entries (e.g. the correlation with a constant column) carry no
    # ranking information and would otherwise sort to the top; drop them.
    usable = ~np.isnan(candidates)
    rows, cols, candidates = rows[usable], cols[usable], candidates[usable]

    # A stable sort on the negated magnitudes gives descending order with
    # deterministic tie-breaking.
    order = np.argsort(-candidates, kind="stable")[:k]
    top_rows, top_cols = rows[order], cols[order]

    positions = [(int(r), int(c)) for r, c in zip(top_rows, top_cols)]
    if verbose:
        # Same flat (row-major) indices that flattening the matrix would give
        print("Flat indices:", top_rows * n_cols + top_cols)
        print(f"Top {len(positions)} positions:", positions)
    return positions

# Positions (row, col) of the 3 largest-magnitude correlations in `corr`
find_big_values(corr, 3)

Flat indices: [61 21  8]
Top 3 positions: [(np.int64(6), np.int64(7)), (np.int64(2), np.int64(3)), (np.int64(0), np.int64(8))]


[(np.int64(6), np.int64(7)),
 (np.int64(2), np.int64(3)),
 (np.int64(0), np.int64(8))]

# based on output
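> - Blocks with higher median income also have higher median house values (MedInc vs. MedHouseVal, $r \approx 0.69$).
> - Blocks whose houses have more rooms on average also have more bedrooms on average (AveRooms vs. AveBedrms, $r \approx 0.85$).
> - Latitude and longitude are strongly negatively correlated ($r \approx -0.92$): the blocks lie roughly along a diagonal line on the map.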

In [21]:
# Translate the 3 strongest positions into (name, name, correlation) triples
related_names = [
    (column_names[row], column_names[col], corr.iloc[row, col])
    for row, col in find_big_values(corr, 3)
]

related_names

Flat indices: [61 21  8]
Top 3 positions: [(np.int64(6), np.int64(7)), (np.int64(2), np.int64(3)), (np.int64(0), np.int64(8))]


[('Latitude', 'Longitude', np.float64(-0.9246644339150366)),
 ('AveRooms', 'AveBedrms', np.float64(0.8476213257130424)),
 ('MedInc', 'MedHouseVal', np.float64(0.6880752079585484))]

# look up the cov values for the pairs with the largest corr values

In [34]:
# For each of the strongest correlations, show the covariance of the same pair
for row, col in positions:
    corr_value = corr.iloc[row, col]
    cov_value = cov.iloc[row, col]
    print(f"corr[{row}, {col}]: {corr_value}, \tcov[{row}, {col}]: {cov_value}")

corr[6, 7]: -0.9246644339150366, 	cov[6, 7]: -3.957053721340572
corr[2, 3]: 0.8476213257130424, 	cov[2, 3]: 0.9938678006790185
corr[0, 8]: 0.6880752079585484, 	cov[0, 8]: 1.5084748279266043


# look up the corr values for the pairs with the largest cov values

In [22]:
# For each of the 3 largest covariances, show the correlation of the same pair
for row, col in find_big_values(cov, 3):
    corr_value = corr.iloc[row, col]
    cov_value = cov.iloc[row, col]
    print(f"corr[{row}, {col}]: {corr_value}, \tcov[{row}, {col}]: {cov_value}")

Flat indices: [13 41 42]
Top 3 positions: [(np.int64(1), np.int64(4)), (np.int64(4), np.int64(5)), (np.int64(4), np.int64(6))]
corr[1, 4]: -0.29624423977353637, 	cov[1, 4]: -4222.270582223539
corr[4, 5]: 0.06986273036567692, 	cov[4, 5]: 821.7120016336573
corr[4, 6]: -0.10878474737766855, 	cov[4, 6]: -263.13781367855165


### 📐 Covariance Formula
For two random variables $X$ and $Y$ with $n$ observations:

$$
\text{Cov}(X, Y) = \frac{1}{n} \sum_{i=1}^{n} \big( x_i - \bar{x} \big)\big( y_i - \bar{y} \big)
$$

- $x_i, y_i$: the $i$-th observation of $X$ and $Y$  
- $\bar{x}, \bar{y}$: the mean of $X$ and $Y$  
- $n$: number of observations
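- Multiplying the deviations of features that have large ranges produces large covariance values.
- Changing the unit of one variable, for example from m to cm, multiplies its values, and therefore its covariances, by 100.
- Note: pandas `DataFrame.cov()` divides by $n - 1$ (sample covariance) by default rather than by $n$; with $n = 20640$ rows the difference is negligible.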

---

### 📊 Correlation Formula (Pearson’s $r$)
Correlation is the normalized covariance:

$$
\text{Corr}(X, Y) = \frac{\text{Cov}(X, Y)}{\sigma_X \cdot \sigma_Y}
$$

- $\sigma_X = \sqrt{\frac{1}{n} \sum_{i=1}^{n} (x_i - \bar{x})^2}$ → standard deviation of $X$  
- $\sigma_Y = \sqrt{\frac{1}{n} \sum_{i=1}^{n} (y_i - \bar{y})^2}$ → standard deviation of $Y$
- Dividing by the standard deviation of each feature removes the effect of scale.
- Changing the unit of one variable, for example from m to cm, multiplies its covariance by 100, but its standard deviation is multiplied by 100 as well, so the factor cancels out.
- It is therefore better to rely on the correlation than on the covariance, because changing scales does not affect the correlation values.