# Import Libraries

In [64]:
from pathlib import Path

import numpy as np
import pandas as pd
import requests

## Prepare files

In [65]:
# Remote source of the housing dataset and its local destination: a "data"
# directory that sits next to the current working directory.
url_housing = (
    "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv"
)
data_dir = Path.cwd().parent / "data"
csv_data = data_dir / "housing.csv"

data_dir.mkdir(parents=True, exist_ok=True)

# Download only when no local copy exists, so re-running the notebook neither
# needs the network nor rewrites the file each time.
if not csv_data.exists():
    # The timeout keeps a stalled connection from hanging the kernel forever.
    r = requests.get(url_housing, timeout=30)
    # Fail loudly on 4xx/5xx instead of saving an error page as housing.csv.
    r.raise_for_status()
    csv_data.write_bytes(r.content)

## Question 1

In [66]:
# Question 1: report the installed pandas release.
pandas_version = pd.__version__
print(f"Pandas version: {pandas_version}")

Pandas version: 2.1.0


## Question 2

In [67]:
# Question 2: load the downloaded CSV, then show its column labels and how
# many of them there are.
df = pd.read_csv(csv_data)

column_labels = df.columns
print(column_labels)
print(column_labels.size)

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')
10


In [68]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


## Question 3

In [69]:
# Question 3: for every column, report whether it holds at least one missing
# value.
missing_mask = df.isna()
missing_mask.any(axis=0)

longitude             False
latitude              False
housing_median_age    False
total_rooms           False
total_bedrooms         True
population            False
households            False
median_income         False
median_house_value    False
ocean_proximity       False
dtype: bool

## Question 4

In [70]:
# Question 4: list the distinct ocean_proximity categories and count them.
proximity = df["ocean_proximity"]
print(proximity.unique())
print(proximity.nunique())

['NEAR BAY' '<1H OCEAN' 'INLAND' 'NEAR OCEAN' 'ISLAND']
5


## Question 5

In [71]:
# Question 5: average median_house_value within each ocean_proximity group.
(
    df.groupby("ocean_proximity")["median_house_value"]
    .mean()
)

ocean_proximity
<1H OCEAN     240084.285464
INLAND        124805.392001
ISLAND        380440.000000
NEAR BAY      259212.311790
NEAR OCEAN    249433.977427
Name: median_house_value, dtype: float64

## Question 6

In [72]:
# Question 6: compare the mean of total_bedrooms before and after its missing
# values are filled with that same mean. The frame itself is not modified.
bedrooms = df["total_bedrooms"]

total_bedrooms_mean = bedrooms.mean()
print(f"Previous average: {total_bedrooms_mean:.3f}")

bedrooms_filled = bedrooms.fillna(value=total_bedrooms_mean)
total_bedrooms_mean_after = bedrooms_filled.mean()
print(f"New average: {total_bedrooms_mean_after:.3f}")

Previous average: 537.871
New average: 537.871


## Question 7

In [73]:
# Question 7: solve the normal equation w = (X^T X)^-1 X^T y on the ISLAND rows.

# Name the feature columns explicitly: a label slice would silently pick up
# different columns if the CSV's column order ever changed.
feature_columns = ["housing_median_age", "total_rooms", "total_bedrooms"]
X = df.loc[df["ocean_proximity"] == "ISLAND", feature_columns].to_numpy()

y = np.array([950, 1300, 800, 1000, 1300])

# Guard the inputs: a NaN in X would turn every weight into NaN, and a row
# count that differs from len(y) would only surface as an opaque matmul error.
assert not np.isnan(X).any(), "ISLAND feature rows contain missing values"
assert X.shape[0] == y.shape[0], "X and y must have the same number of rows"

XTX = X.T @ X

XTX_inv = np.linalg.inv(XTX)

# Evaluated left to right: (XTX_inv @ X.T) @ y.
w = XTX_inv @ X.T @ y

print(w[-1])

5.699229455065586
