<a href="https://colab.research.google.com/github/Ravinderram/ASDA_2025_Group_2_Portfolio/blob/main/week_02_group_2_notebook_housing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
# Basic EDA of two datasets
# Dataset 1: California Housing Prices

import kagglehub
import pandas as pd

# 1. Dataset overview
# Download latest version
path = kagglehub.dataset_download("camnugent/california-housing-prices")

print("Path to dataset files:", path)


Using Colab cache for faster access to the 'california-housing-prices' dataset.
Path to dataset files: /kaggle/input/california-housing-prices


In [23]:
# Name dataset
housing_df = pd.read_csv(f"{path}/housing.csv")

In [24]:
# Get rows and columns of dataset
housing_df.shape

(20640, 10)

In [25]:
# Get column names
housing_df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [26]:
# Get column types
housing_df.dtypes

Unnamed: 0,0
longitude,float64
latitude,float64
housing_median_age,float64
total_rooms,float64
total_bedrooms,float64
population,float64
households,float64
median_income,float64
median_house_value,float64
ocean_proximity,object


In [27]:
# Get non-null counts
housing_df.notnull().sum()

Unnamed: 0,0
longitude,20640
latitude,20640
housing_median_age,20640
total_rooms,20640
total_bedrooms,20433
population,20640
households,20640
median_income,20640
median_house_value,20640
ocean_proximity,20640


In [28]:
# Get number of unique values in each column
housing_df.nunique()

Unnamed: 0,0
longitude,844
latitude,862
housing_median_age,52
total_rooms,5926
total_bedrooms,1923
population,3888
households,1815
median_income,12928
median_house_value,3842
ocean_proximity,5


In [29]:
# Get random sample of 5 rows to see example values
examples = housing_df.sample(5)
examples

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
14114,-117.11,32.74,33.0,1126.0,267.0,621.0,241.0,3.2422,123100.0,NEAR OCEAN
16161,-122.49,37.78,49.0,2176.0,441.0,1040.0,448.0,4.2414,500001.0,NEAR BAY
9842,-121.9,36.6,33.0,2461.0,649.0,1234.0,601.0,2.8727,225000.0,NEAR OCEAN
10498,-117.67,33.51,18.0,1645.0,393.0,1490.0,355.0,3.4792,126400.0,<1H OCEAN
1883,-119.99,38.94,22.0,3119.0,640.0,786.0,351.0,3.0806,118500.0,INLAND


In [30]:
# Create summary table
summary_2 = pd.DataFrame({"Column name":housing_df.columns, "Data type": housing_df.dtypes, "Non-null count": housing_df.notnull().sum(), "Unique values":housing_df.nunique()})

summary_2["Examples"] = [
    ", ".join(map(str, examples[col].tolist()))
    for col in housing_df.columns
]

summary_2 = summary_2.reset_index(drop=True)

markdown_table = summary_2.to_markdown(index=False)
print(markdown_table)

| Column name        | Data type   |   Non-null count |   Unique values | Examples                                            |
|:-------------------|:------------|-----------------:|----------------:|:----------------------------------------------------|
| longitude          | float64     |            20640 |             844 | -117.11, -122.49, -121.9, -117.67, -119.99          |
| latitude           | float64     |            20640 |             862 | 32.74, 37.78, 36.6, 33.51, 38.94                    |
| housing_median_age | float64     |            20640 |              52 | 33.0, 49.0, 33.0, 18.0, 22.0                        |
| total_rooms        | float64     |            20640 |            5926 | 1126.0, 2176.0, 2461.0, 1645.0, 3119.0              |
| total_bedrooms     | float64     |            20433 |            1923 | 267.0, 441.0, 649.0, 393.0, 640.0                   |
| population         | float64     |            20640 |            3888 | 621.0, 1040.0, 1234.0, 1490.0,

In [31]:
# Descriptive Statistics for Numeric Columns

housing_df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [32]:
# Descriptive Statistics for Categorical / Object Columns
housing_df.describe(include="object")


Unnamed: 0,ocean_proximity
count,20640
unique,5
top,<1H OCEAN
freq,9136


In [33]:
#checking duplicates in the dataframe
housing_df.duplicated().sum()
print(f"this dataset has {int(housing_df.duplicated().sum())} duplicates")

this dataset has 0 duplicates


In [34]:
# Missing Values
missing_percent = (housing_df.isna().sum() / len(housing_df)) * 100
print(housing_df.isna().sum(),missing_percent.round(2))

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64 longitude             0.0
latitude              0.0
housing_median_age    0.0
total_rooms           0.0
total_bedrooms        1.0
population            0.0
households            0.0
median_income         0.0
median_house_value    0.0
ocean_proximity       0.0
dtype: float64
