3.	Perform the following operations in Python on the given dataset [housing.csv]:
a.	Subset houses with median income > 5 and average rooms < 6.
b.	Merge with a regional lookup table mapping latitude/longitude to regions.
c.	Sort by median_house_value and population.
d.	Transpose the statistical summary to compare features.
e.	Reshape the data to view average house value across income and housing age bins.

In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the housing dataset; the relative path is resolved against the
# directory the notebook is run from.
df = pd.read_csv("housing.csv")

In [5]:
# Preview the first rows (head() defaults to 5) to sanity-check the load.
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


# a. Subset houses with median income > 5 and average rooms < 6

In [8]:
# Build each condition separately so the filter reads like the task statement.
high_income = df['median_income'] > 5
# "Average rooms" is computed here as rooms per household.
rooms_per_household = df['total_rooms'] / df['households']
few_rooms = rooms_per_household < 6

subset_df = df[high_income & few_rooms]
subset_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
61,-122.29,37.82,49.0,135.0,29.0,86.0,23.0,6.1183,75000.0,NEAR BAY
110,-122.25,37.82,52.0,1424.0,289.0,550.0,253.0,5.0917,262500.0,NEAR BAY
125,-122.22,37.85,28.0,5287.0,1048.0,2031.0,956.0,5.457,337300.0,NEAR BAY
127,-122.21,37.84,44.0,3424.0,597.0,1358.0,597.0,6.0194,292300.0,NEAR BAY


# b. Merge with a regional lookup table (latitude/longitude → region)

In [11]:
# Latitude bands used to label each district with a coarse region.
# A band matches when lat_min <= latitude < lat_max (half-open).
# North runs up to 42 so that it covers the dataset's northernmost
# latitude (41.95); with an upper bound of 40 those districts matched no
# band and were labelled with the 'Other' fallback instead.
region_lookup = pd.DataFrame({
    'region': ['North', 'Central', 'South'],
    'lat_min': [37, 34, 32],
    'lat_max': [42, 37, 34],
})

In [13]:
def map_region(lat):
    """Return the region label whose latitude band contains ``lat``.

    Bands come from the module-level ``region_lookup`` table and are
    half-open: a row matches when ``lat_min <= lat < lat_max``. Rows are
    checked in table order and the first match wins.

    Args:
        lat: Latitude in degrees.

    Returns:
        The matching ``region`` value, or ``'Other'`` when no band
        contains ``lat`` (this includes NaN, which fails every comparison).
    """
    # itertuples() yields lightweight namedtuples; iterrows() would build a
    # full Series per lookup row on every call, which adds up when this
    # function is applied to each row of the dataset.
    for band in region_lookup.itertuples(index=False):
        if band.lat_min <= lat < band.lat_max:
            return band.region
    return 'Other'

In [15]:
# Label every district with the region whose latitude band contains it.
df['region'] = df['latitude'].map(map_region)

# Show a few distinct (latitude, region) pairs as a quick check of the mapping.
lat_region_pairs = df.loc[:, ['latitude', 'region']].drop_duplicates()
print(lat_region_pairs.head())

    latitude region
0      37.88  North
1      37.86  North
2      37.85  North
6      37.84  North
33     37.83  North


# c. Sort by median_house_value and population

In [18]:
# Order by house value first, breaking ties by population; both ascending.
sort_keys = ['median_house_value', 'population']
sorted_df = df.sort_values(by=sort_keys, ascending=True)
print(sorted_df.loc[:, sort_keys].head())

       median_house_value  population
19802             14999.0        18.0
2521              14999.0        85.0
2799              14999.0       490.0
9188              14999.0       628.0
5887              17500.0       259.0


# d. Transpose statistical summary to compare features

In [21]:
# describe() puts the statistics in rows and the features in columns;
# flipping it gives one row per feature so features can be compared
# side by side.
feature_stats = df.describe()
summary = feature_stats.T
summary

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
longitude,20640.0,-119.569704,2.003532,-124.35,-121.8,-118.49,-118.01,-114.31
latitude,20640.0,35.631861,2.135952,32.54,33.93,34.26,37.71,41.95
housing_median_age,20640.0,28.639486,12.585558,1.0,18.0,29.0,37.0,52.0
total_rooms,20640.0,2635.763081,2181.615252,2.0,1447.75,2127.0,3148.0,39320.0
total_bedrooms,20433.0,537.870553,421.38507,1.0,296.0,435.0,647.0,6445.0
population,20640.0,1425.476744,1132.462122,3.0,787.0,1166.0,1725.0,35682.0
households,20640.0,499.53968,382.329753,1.0,280.0,409.0,605.0,6082.0
median_income,20640.0,3.870671,1.899822,0.4999,2.5634,3.5348,4.74325,15.0001
median_house_value,20640.0,206855.816909,115395.615874,14999.0,119600.0,179700.0,264725.0,500001.0


# e. Reshape data to view average house value across income and housing age bins

In [24]:
# Bin edges and their display labels; each list of labels has one entry
# per interval between consecutive edges, and the last interval is open-ended.
income_edges = [0, 2, 4, 6, 8, np.inf]
income_labels = ['0–2', '2–4', '4–6', '6–8', '8+']
age_edges = [0, 15, 30, 45, 60, np.inf]
age_labels = ['0–15', '15–30', '30–45', '45–60', '60+']

df['income_bin'] = pd.cut(df['median_income'], bins=income_edges, labels=income_labels)
df['age_bin'] = pd.cut(df['housing_median_age'], bins=age_edges, labels=age_labels)

In [30]:
# Mean house value for every (income bin, age bin) combination:
# income bins become the rows and age bins the columns.
pivot_df = pd.pivot_table(
    df,
    values='median_house_value',
    index='income_bin',
    columns='age_bin',
    aggfunc='mean',
    observed=False,
)

In [32]:
# Display the income-bin by age-bin table of mean house values.
print(pivot_df)

age_bin              0–15          15–30          30–45          45–60
income_bin                                                            
0–2         115393.247748  111573.161074  106909.131282  127883.617486
2–4         143417.951184  162149.903495  168882.955424  216381.252402
4–6         201718.548303  236633.878615  258595.851740  324184.828512
6–8         301106.979434  328649.874172  396455.830097  413614.181034
8+          429610.044776  452121.240602  486480.336842  475778.810000
