In [1]:
import numpy as np
import pandas as pd

### Question 1: Version of Pandas

In [2]:
# Show the version string of the pandas package imported above as `pd`.
pd.__version__

'1.5.3'

In [3]:
# Path is relative, so it is resolved against the notebook's working directory.
DATA_PATH = 'housing.csv'
df = pd.read_csv(DATA_PATH)

### Question 2: Number of columns in the dataset

In [4]:
# Question 2 asks for the number of columns. `shape` is (n_rows, n_columns),
# so index 1 is the column count; index 0 would be the row count (20640).
df.shape[1]

10

### Question 3: Select columns with missing data

In [5]:
# Preview the first five rows to see the available columns.
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [6]:
# Cell-wise missing-value mask, reduced to one boolean per column.
missing_mask = df.isna()
missing_mask.any()

longitude             False
latitude              False
housing_median_age    False
total_rooms           False
total_bedrooms         True
population            False
households            False
median_income         False
median_house_value    False
ocean_proximity       False
dtype: bool

In [7]:
# Keep only the column labels whose column contains at least one missing value.
has_missing = df.isna().any()
df.columns[has_missing]

Index(['total_bedrooms'], dtype='object')

### Question 4: Number of unique values in the 'ocean_proximity' column

In [8]:
# Distinct (non-missing) value count for every column.
df.nunique(axis=0, dropna=True)

longitude               844
latitude                862
housing_median_age       52
total_rooms            5926
total_bedrooms         1923
population             3888
households             1815
median_income         12928
median_house_value     3842
ocean_proximity           5
dtype: int64

In [9]:
#counting unique values in ocean_proximity column
df['ocean_proximity'].nunique()

5

In [10]:
# How many rows fall into each ocean_proximity category.
df.loc[:, 'ocean_proximity'].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

### Question 5: Average value of the 'median_house_value' for the houses near the bay

In [11]:
# Column-wise means; numeric_only=True leaves out the text column ocean_proximity.
df.mean(numeric_only=True)

longitude               -119.569704
latitude                  35.631861
housing_median_age        28.639486
total_rooms             2635.763081
total_bedrooms           537.870553
population              1425.476744
households               499.539680
median_income              3.870671
median_house_value    206855.816909
dtype: float64

In [12]:
# Mean median_house_value per ocean_proximity category; the 'NEAR BAY' entry
# is the figure Question 5 asks for.
house_value_by_proximity = df.groupby('ocean_proximity')['median_house_value']
mean_val = house_value_by_proximity.mean()
mean_val

ocean_proximity
<1H OCEAN     240084.285464
INLAND        124805.392001
ISLAND        380440.000000
NEAR BAY      259212.311790
NEAR OCEAN    249433.977427
Name: median_house_value, dtype: float64

### Question 6: Has the mean value changed after filling missing values?

1. Calculate the average of total_bedrooms column in the dataset.
2. Use the fillna method to fill the missing values in total_bedrooms with the mean value from the previous step.
3. Now, calculate the average of total_bedrooms again.

In [13]:
# total_bedrooms is the only column with missing values, so only its mean is needed.
total_bedrooms = df['total_bedrooms']
mean_bedrooms = total_bedrooms.mean()
mean_bedrooms

537.8705525375618

In [14]:
# `fillna` returns a new Series and leaves `df` untouched, so the result must be
# captured; otherwise the mean below is just the original NaN-skipping mean and
# the comparison in Question 6 is never actually performed.
# The filled values are kept in a separate variable so later cells still see
# the unmodified `df`.
total_bedrooms_filled = df['total_bedrooms'].fillna(mean_bedrooms)
total_bedrooms_filled.mean()

537.8705525375618

### Question 7: Value of the last element of w

1. Select all the rows located on islands (`ocean_proximity` equal to `'ISLAND'`).
2. Select only columns housing_median_age, total_rooms, total_bedrooms.
3. Get the underlying NumPy array. Let's call it X.
4. Compute matrix-matrix multiplication between the transpose of X and X. To get the transpose, use X.T. Let's call the result XTX.
5. Compute the inverse of XTX.
6. Create an array y with values [950, 1300, 800, 1000, 1300].
7. Multiply the inverse of XTX with the transpose of X, and then multiply the result by y. Call the result w.

In [15]:
# Step 1: boolean mask for the island rows, then filter with it.
is_island = df['ocean_proximity'] == 'ISLAND'
df_island = df[is_island]
df_island

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
8314,-118.32,33.35,27.0,1675.0,521.0,744.0,331.0,2.1579,450000.0,ISLAND
8315,-118.33,33.34,52.0,2359.0,591.0,1100.0,431.0,2.8333,414700.0,ISLAND
8316,-118.32,33.33,52.0,2127.0,512.0,733.0,288.0,3.3906,300000.0,ISLAND
8317,-118.32,33.34,52.0,996.0,264.0,341.0,160.0,2.7361,450000.0,ISLAND
8318,-118.48,33.43,29.0,716.0,214.0,422.0,173.0,2.6042,287500.0,ISLAND


In [16]:
# Step 2: keep only the three feature columns used to build X.
feature_columns = ['housing_median_age', 'total_rooms', 'total_bedrooms']
df2 = df_island[feature_columns]
df2

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms
8314,27.0,1675.0,521.0
8315,52.0,2359.0,591.0
8316,52.0,2127.0,512.0
8317,52.0,996.0,264.0
8318,29.0,716.0,214.0


In [17]:
# Step 3: pull the underlying NumPy array out of the selected island columns.
X = df2.to_numpy()
# Step 4: matrix product of the transpose of X with X.
XTX = np.matmul(X.T, X)
XTX

array([[9.6820000e+03, 3.5105300e+05, 9.1357000e+04],
       [3.5105300e+05, 1.4399307e+07, 3.7720360e+06],
       [9.1357000e+04, 3.7720360e+06, 9.9835800e+05]])

In [18]:
# Step 5: explicit inverse of XTX, as the exercise asks for.
XTX_inv = np.linalg.inv(XTX)
XTX_inv

array([[ 9.19403586e-04, -3.66412216e-05,  5.43072261e-05],
       [-3.66412216e-05,  8.23303633e-06, -2.77534485e-05],
       [ 5.43072261e-05, -2.77534485e-05,  1.00891325e-04]])

In [19]:
# Step 6: target values given by the exercise, one per island row.
target_values = [950, 1300, 800, 1000, 1300]
y = np.array(target_values)
y

array([ 950, 1300,  800, 1000, 1300])

In [20]:
# Step 7: multiply the inverse of XTX by the transpose of X first, then apply
# the result to y (same multiplication order as written in the exercise).
inv_times_xt = XTX_inv @ X.T
w = inv_times_xt @ y
w

array([23.12330961, -1.48124183,  5.69922946])