In [32]:
import pandas as pd
import numpy as np

In [33]:
# Record which pandas release produced the outputs saved in this notebook.
pandas_version = pd.__version__
print("The version of pandas is: ", pandas_version)

The version of pandas is:  2.1.0


### Reading data

In [3]:
# Load the housing dataset from the project's local data folder.
housing_csv_path = "./data/housing.csv"
raw_data = pd.read_csv(filepath_or_buffer=housing_csv_path)

In [4]:
# Preview the first rows to check that the file parsed into the expected columns.
raw_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
# Report the dataset dimensions; `shape` is a (rows, cols) pair, so it can be
# unpacked straight into the two placeholders of the message.
shape_report = "The dataset contains: \n\t * {} rows \n\t * {} cols"
print(shape_report.format(*raw_data.shape))

The dataset contains: 
	 * 20640 rows 
	 * 10 cols


### Missing data

In [6]:
# Fraction of missing entries per column (mean of the boolean "is missing" mask).
raw_data.isna().mean()

longitude             0.000000
latitude              0.000000
housing_median_age    0.000000
total_rooms           0.000000
total_bedrooms        0.010029
population            0.000000
households            0.000000
median_income         0.000000
median_house_value    0.000000
ocean_proximity       0.000000
dtype: float64

### Unique values

In [7]:
# Collect the distinct categories of `ocean_proximity` and report how many there are.
unique_values = raw_data["ocean_proximity"].unique()
n_unique = len(unique_values)
print("The number of unique values in the 'ocean_proximity' is: ", n_unique)

The number of unique values in the 'ocean_proximity' is:  5


### Average

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
raw_data.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [9]:
# Average of the `median_house_value` column across all districts.
raw_data["median_house_value"].mean()

206855.81690891474

### Has it changed?

In [10]:
# Mean of `total_bedrooms` before any imputation; kept in `average` so it can be
# reused as the fill value for the missing entries.
average = raw_data["total_bedrooms"].mean()
print("Average of 'total bedroom' is: ", average)

Average of 'total bedroom' is:  537.8705525375618


In [14]:
# Count the missing entries in `total_bedrooms`.
# NOTE(review): the execution count shows this cell was last run after the
# fillna cell below, so the saved output (0) reflects the already-imputed
# column; on a fresh top-to-bottom run it reports the pre-imputation count.
raw_data["total_bedrooms"].isna().sum()

0

In [12]:
# Impute the missing bedroom counts with the column mean stored in `average`.
# The filled column is assigned back explicitly instead of calling
# fillna(inplace=True) on the attribute-accessed Series: that form is a chained
# in-place update, which is deprecated in newer pandas releases and leaves
# `raw_data` unmodified when Copy-on-Write is enabled.
raw_data["total_bedrooms"] = raw_data["total_bedrooms"].fillna(average)

In [13]:
# Recompute the mean of `total_bedrooms` after the missing values were filled,
# to compare it with the pre-imputation mean.
average_fillna = raw_data["total_bedrooms"].mean()
print("Average of 'total bedroom' is: ", average_fillna)

Average of 'total bedroom' is:  537.8705525375617


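There is no change: filling the missing values with the column mean leaves the mean unchanged (the two results differ only in the last digit, which is floating-point rounding).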

### Implementing linear regression

In [21]:
# Keep only the districts whose `ocean_proximity` category is "ISLAND".
is_island = raw_data["ocean_proximity"] == "ISLAND"
islands = raw_data[is_island]
islands

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
8314,-118.32,33.35,27.0,1675.0,521.0,744.0,331.0,2.1579,450000.0,ISLAND
8315,-118.33,33.34,52.0,2359.0,591.0,1100.0,431.0,2.8333,414700.0,ISLAND
8316,-118.32,33.33,52.0,2127.0,512.0,733.0,288.0,3.3906,300000.0,ISLAND
8317,-118.32,33.34,52.0,996.0,264.0,341.0,160.0,2.7361,450000.0,ISLAND
8318,-118.48,33.43,29.0,716.0,214.0,422.0,173.0,2.6042,287500.0,ISLAND


In [24]:
# Narrow the island rows down to the three columns used as regression features.
feature_columns = ["housing_median_age", "total_rooms", "total_bedrooms"]
islands = islands[feature_columns]
islands

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms
8314,27.0,1675.0,521.0
8315,52.0,2359.0,591.0
8316,52.0,2127.0,512.0
8317,52.0,996.0,264.0
8318,29.0,716.0,214.0


In [34]:
# Convert the selected feature columns to a NumPy array so matrix operations
# can be applied to them.
X = islands.to_numpy()


In [35]:
# Gram matrix X^T X of the feature matrix.
np.matmul(X.T, X)

array([[9.6820000e+03, 3.5105300e+05, 9.1357000e+04],
       [3.5105300e+05, 1.4399307e+07, 3.7720360e+06],
       [9.1357000e+04, 3.7720360e+06, 9.9835800e+05]])

In [41]:
# Invert the Gram matrix X^T X; this is the first factor of the
# normal-equation solution for the regression weights.
gram_matrix = X.T @ X
inverse = np.linalg.inv(gram_matrix)

In [37]:
# Target values, one per island row of the feature matrix.
target_values = [950, 1300, 800, 1000, 1300]
y = np.array(target_values)

In [38]:
# Display the target vector to confirm its contents.
y

array([ 950, 1300,  800, 1000, 1300])

In [44]:
# Normal-equation weights: (X^T X)^-1 X^T y. The `@` operator associates left
# to right, so (inverse @ X.T) is evaluated first and then applied to y.
w = inverse @ X.T @ y

In [47]:
# Show the weight associated with the last feature column.
last_weight = w[-1]
print("The last element of w is: ", last_weight)

The last element of w is:  5.699229455065586
