In [1]:
# Import all the tools we need

# Regular EDA (Exploratory Data Analysis) and plotting libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# We want our plots to appear inside the notebook
%matplotlib inline

# Models from Scikit-Learn (REGRESSION, not classification)
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

# Model Evaluations (REGRESSION metrics)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Apply matplotlib's bundled "seaborn-v0_8-darkgrid" style sheet to all figures
plt.style.use("seaborn-v0_8-darkgrid")

# Read the housing CSV, then report its dimensions and preview the first rows
df = pd.read_csv("../data/housing.csv")
print("Shape:", df.shape)  # (rows, columns)
print("\nFirst few rows:")
df.head()

Shape: (20640, 10)

First few rows:


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [2]:
# Check what we're working with: dtypes, missing values and category counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("\n" + "="*50)
print("Missing values:")
print(df.isnull().sum())
print("\n" + "="*50)
print("Ocean proximity categories:")
print(df["ocean_proximity"].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
None

Missing values:
longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value   

## Data Exploration (Exploratory Data Analysis or EDA)
The goal here is to find out more about the data and become a subject matter expert on the dataset you're working with.

1. What question(s) are you trying to solve?
2. What kind of data do we have and how do we treat different types?
3. What's missing from the data and how do you deal with it?
4. Where are the outliers and why should you care about them?
5. How can you add, change or remove features to get more out of your data?

In [3]:
# # One-hot encoding is better for non-ordinal categories
# df = pd.get_dummies(df, columns=["ocean_proximity"], prefix="ocean")

In [4]:
# Label-encode ocean_proximity (the alternative to one-hot encoding).
# Every category that occurs in the column needs an entry in the map; the
# integer codes are plain identifiers assigned by hand.
proximity_map = {
    "NEAR BAY": 1,
    "INLAND": 2,
    "<1H OCEAN": 3,
    "NEAR OCEAN": 4,
    "ISLAND": 5,
}

# Series.map turns every value that is not a key of the map into NaN, so
# re-running this cell once the column already holds the integer codes would
# wipe the column. Only encode while the raw string labels are still present.
if not pd.api.types.is_numeric_dtype(df["ocean_proximity"]):
    encoded = df["ocean_proximity"].map(proximity_map)

    # Fail loudly on labels that are absent from the map instead of silently
    # introducing NaNs (values that were already missing are left alone).
    unmapped = encoded.isna() & df["ocean_proximity"].notna()
    if unmapped.any():
        missing_labels = sorted(df.loc[unmapped, "ocean_proximity"].unique())
        raise ValueError(
            f"ocean_proximity labels missing from proximity_map: {missing_labels}"
        )

    df["ocean_proximity"] = encoded

In [5]:
# Impute missing values in the total_bedrooms column with the column median
bedrooms_median = df["total_bedrooms"].median()
df["total_bedrooms"] = df["total_bedrooms"].fillna(bedrooms_median)

# Feature engineering: ratio features built from the raw count columns
households = df["households"]
df["bedrooms_per_room"] = df["total_bedrooms"].div(df["total_rooms"])
df["rooms_per_household"] = df["total_rooms"].div(households)
df["population_per_household"] = df["population"].div(households)

# Remove the now-redundant total_bedrooms column
df = df.drop("total_bedrooms", axis=1)

In [6]:
# Verify no missing values remain, then show the final shape and a preview
print("Missing values after cleaning:")
print(df.isnull().sum())
# "\n" (previously a bare "n") puts a blank line before the shape summary
print(f"\nFinal shape: {df.shape}")
df.head()

Missing values after cleaning:
longitude                   0
latitude                    0
housing_median_age          0
total_rooms                 0
population                  0
households                  0
median_income               0
median_house_value          0
ocean_proximity             0
bedrooms_per_room           0
rooms_per_household         0
population_per_household    0
dtype: int64
nFinal shape: (20640, 12)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,households,median_income,median_house_value,ocean_proximity,bedrooms_per_room,rooms_per_household,population_per_household
0,-122.23,37.88,41.0,880.0,322.0,126.0,8.3252,452600.0,1,0.146591,6.984127,2.555556
1,-122.22,37.86,21.0,7099.0,2401.0,1138.0,8.3014,358500.0,1,0.155797,6.238137,2.109842
2,-122.24,37.85,52.0,1467.0,496.0,177.0,7.2574,352100.0,1,0.129516,8.288136,2.80226
3,-122.25,37.85,52.0,1274.0,558.0,219.0,5.6431,341300.0,1,0.184458,5.817352,2.547945
4,-122.25,37.85,52.0,1627.0,565.0,259.0,3.8462,342200.0,1,0.172096,6.281853,2.181467


In [7]:
# df.to_csv("data/cleaned_housing_data.csv", index=False)