<a href="https://colab.research.google.com/github/Theeyecode/Housing-Stress-Canada/blob/eda/descriptive_stat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exploratory Data Analysis for Canada Housing Survery Data 2022

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


url = "https://drive.google.com/uc?id=11Y8p_9-CYw0tpGPFu-jlgzxzWOOVS43F"

In [2]:


df = pd.read_csv(url)
df.head()

Unnamed: 0,PUMFID,EHA_10,EHA_25,FP_05,DWI_05A,DWI_05B,DWI_05C,DWI_05D,NEI_05A,NEI_05B,...,PSTIR_GR,PVISMIN,PWSA_D15,P2DCT_20,P2DCT_25,PATT_05,PATT_10,PATT_15A,PATT_15B,VERDATE
0,63501,4,2,1,2,2,2,2,4,4,...,1,9,999.6,996,6,1,1,6,6,11/08/2025
1,63502,3,2,2,2,2,2,2,4,4,...,1,2,999.6,996,6,2,1,6,6,11/08/2025
2,63503,3,2,1,2,2,2,2,4,4,...,1,2,999.6,996,6,2,1,6,6,11/08/2025
3,63504,3,2,1,2,1,2,2,3,4,...,1,1,999.6,996,6,2,1,6,6,11/08/2025
4,63505,4,2,1,2,2,2,2,4,4,...,1,2,999.6,2,1,1,2,6,3,11/08/2025


Shape of the Raw data


In [3]:
df.shape

(38657, 103)

In [4]:
# df.dtypes
df.dtypes.value_counts()


Unnamed: 0,count
int64,98
float64,4
object,1


In [5]:
# Identify non-numeric columns (typically dates or text fields)
df.select_dtypes(include="object").columns.tolist()

['VERDATE']

In [6]:
# Convert verification date to datetime for proper handling
df["VERDATE"] = pd.to_datetime(df["VERDATE"], errors="coerce")

In [7]:
# # Check missing values per column (after initial load)
df.isna().sum().sort_values(ascending=False)

Unnamed: 0,0
PUMFID,0
EHA_10,0
EHA_25,0
FP_05,0
DWI_05A,0
...,...
PATT_05,0
PATT_10,0
PATT_15A,0
PATT_15B,0


In [8]:
# Get basic descriptive stats for numeric columns (unweighted, structure check)
df.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
PUMFID,38657.0,82829.0,63501.0,73165.0,82829.0,92493.0,102157.0,11159.459015
EHA_10,38657.0,2.888119,1.0,2.0,3.0,4.0,9.0,1.076674
EHA_25,38657.0,1.953126,1.0,2.0,2.0,2.0,9.0,0.327064
FP_05,38657.0,1.028119,1.0,1.0,1.0,1.0,9.0,0.294491
DWI_05A,38657.0,1.96337,1.0,2.0,2.0,2.0,9.0,0.446082
...,...,...,...,...,...,...,...,...
PATT_05,38657.0,6.103733,0.0,1.0,1.0,2.0,99.0,21.497314
PATT_10,38657.0,1.87803,1.0,1.0,1.0,2.0,9.0,1.892823
PATT_15A,38657.0,5.800321,1.0,6.0,6.0,6.0,9.0,1.272822
PATT_15B,38657.0,5.399669,1.0,6.0,6.0,6.0,9.0,1.733045


In [9]:
# # Identify columns that contain obvious reserved codes (e.g., 9, 96, 99, 999, etc.)
reserved_codes = [9, 96, 99, 996, 999, 999.6, 999.9, 99999996, 99999999, 99999999999]

reserved_check = {
    col: df[col].isin(reserved_codes).any()
    for col in df.columns
    if df[col].dtype != "object"
}

[k for k, v in reserved_check.items() if v]


['EHA_10',
 'EHA_25',
 'FP_05',
 'DWI_05A',
 'DWI_05B',
 'DWI_05C',
 'DWI_05D',
 'NEI_05A',
 'NEI_05B',
 'NEI_05C',
 'NEI_05D',
 'NEI_05E',
 'NEI_05F',
 'NEI_05G',
 'NEI_05H',
 'NEI_05I',
 'WSA_05',
 'SDH_05',
 'CER_05',
 'CER_20',
 'LIS_10',
 'COS_10',
 'COS_15',
 'GH_05',
 'GH_10',
 'PMINOR',
 'PCER_10',
 'PCER_15',
 'PCHN',
 'PCOS_05',
 'PDCT_05',
 'P1DCT_20',
 'P1DCT_25',
 'PDV_SAH',
 'PDV_SUIT',
 'PDWLTYPE',
 'PDWS_10A',
 'PDWS_10B',
 'PDWS_10C',
 'PDWS_10D',
 'PDWS_10E',
 'PDWS_10F',
 'PDWS_10G',
 'PDWS_10H',
 'PDWS_10I',
 'PDWS_10J',
 'PEHA_05A',
 'PEHA_05B',
 'PEHA_05C',
 'PEMPL',
 'PFTHB5YR',
 'PHGEDUC',
 'PHHSIZE',
 'PHHTTINC',
 'PHTYPE',
 'PLIS_05',
 'PNSC_15',
 'POWN_20',
 'POWN_80',
 'PPAC_05',
 'PPAC_10',
 'PPAC_23',
 'PPAC_30',
 'PPAC_35',
 'PPAC_45A',
 'PPAC_45C',
 'PPAC_45D',
 'PPAC_45E',
 'PPAC_45F',
 'PPAC_45G',
 'PPAC_45H',
 'PPAC_45I',
 'PPAC_45J',
 'PPAC_45K',
 'PPAC_45L',
 'PPAC_45M',
 'PPAC_45N',
 'PPAC_45O',
 'PRSPIMST',
 'P1SCR_05',
 'PSCR_10',
 'PSCR_25',
 'P

---
## [Task 1 : Handle Reserved Codes as NA](https://emmanuelolajubu90.atlassian.net/browse/SCRUM-12)

* Identify outcome vars (PCHN, PSTIR_GR) → confirm universe + eligibility rules

* Apply logic to convert reserved codes to NA for all relevant variables, without performing any recoding at this stage.

---

In [10]:
# 1. PCHN (Core Housing Need)

print("PCHN (Original Counts):")
df['PCHN'].value_counts(dropna=False).sort_index()

PCHN (Original Counts):


Unnamed: 0_level_0,count
PCHN,Unnamed: 1_level_1
1,6164
2,30938
9,1555


In [11]:
# Create a clean version: Map 9 to NaN
df['PCHN_Clean'] = df['PCHN'].replace({9: np.nan})

In [12]:
# Print the cleaned data count

print(df['PCHN_Clean'].value_counts(dropna=False).sort_index())
print('_'*50)
print(f"Records Excluded (Not Stated): {df['PCHN_Clean'].isna().sum()}")

PCHN_Clean
1.0     6164
2.0    30938
NaN     1555
Name: count, dtype: int64
__________________________________________________
Records Excluded (Not Stated): 1555


In [13]:
# 2. PSTIR_GR (Shelter-cost-to-income ratio group)

print("Definition: 1 (<30%), 2 (30-50%), 3 (50-100%), 4 (>=100%) \n")
print("Reserved Codes: 5 = Not Applicable, 9 = Not Stated \n")

print("PSTIR_GR (Original Counts):")
df['PSTIR_GR'].value_counts(dropna=False).sort_index()

Definition: 1 (<30%), 2 (30-50%), 3 (50-100%), 4 (>=100%) 

Reserved Codes: 5 = Not Applicable, 9 = Not Stated 

PSTIR_GR (Original Counts):


Unnamed: 0_level_0,count
PSTIR_GR,Unnamed: 1_level_1
1,28650
2,6440
3,2012
4,549
5,429
9,577


In [14]:
# Map 5 and 9 to NaN

df['PSTIR_GR_Clean'] = df['PSTIR_GR'].replace({5: np.nan, 9: np.nan})

In [15]:
print("\nPSTIR_GR cleaned data counts:")
display(df['PSTIR_GR_Clean'].value_counts(dropna=False).sort_index())
print(f"\nRecords Excluded (N/A or Not Stated): {df['PSTIR_GR_Clean'].isna().sum()}")


PSTIR_GR cleaned data counts:


Unnamed: 0_level_0,count
PSTIR_GR_Clean,Unnamed: 1_level_1
1.0,28650
2.0,6440
3.0,2012
4.0,549
,1006



Records Excluded (N/A or Not Stated): 1006


In [16]:
# 3. Valid Universe Check : How many households are valid for BOTH measures?

valid_both = df.dropna(subset=['PCHN_Clean', 'PSTIR_GR_Clean'])
print(f"Total rows in dataset: {len(df)}")
print(f"Rows valid for BOTH PCHN and PSTIR_GR: {len(valid_both)}")

Total rows in dataset: 38657
Rows valid for BOTH PCHN and PSTIR_GR: 37102


### Output for Task 1

* **PCHN**: Preserved 37,102 valid households (6,164 In Need / 30,938 Not In Need) data. We excluded 1,555 "Not Stated" records.

* **PSTIR_GR**: Preserved 37,651 valid households. We excluded 1,006 records (429 "Not Applicable" + 577 "Not Stated") data.

* **Intersection**: 37,102 households have valid data for both variables, makes it valid sample data size for our analysis.

---
## [Task 2: Predictor Variable Audit](https://emmanuelolajubu90.atlassian.net/browse/SCRUM-13)

* **Goal**: "Sanitize" the independent variables (Demographics, Geography, Socio-economic).

* **Action**: Systematically identify reserved codes (e.g., 99, 99999996) for key columns like Income, Age, and Tenure to prevent them from skewing analysis.

---

In [17]:
categorical_vars = [
    'PDCT_05',   # Tenure (Owner/Renter)
    'PMINOR',    # Presence of Minor (Children < 18)
    'PVISMIN',   # Visible Minority Status (Crucial for Equity Analysis)
    'PHTYPE',    # Household Type (Lone parent, etc.)
    'PEMPL',     # Employment Status
    'PHGEDUC',   # Education Level
    'REGION',    # Region (Atlantic, QC, ON, Prairies, BC)
    'PDWLTYPE',  # Dwelling Type (Single detached, High-rise, etc.)
    'PAGEP1'     # Age of Reference Person
]

In [24]:
# 1. Audit Categorical Variables

print("\nCategorical Variables")
print(f"{'Variable':<15} | {'Valid Records':<15} | {'Reserved/Missing Codes Found'}")
print("-" * 80)

for var in categorical_vars:
    # Get counts of all values, including NaNs if any exist
    counts = df[var].value_counts(dropna=False).sort_index()

    # Check for specific reserved codes known in StatsCan PUMF
    found_reserved = []
    if 6 in counts.index: found_reserved.append(f"{counts[6]} records of 6 (Valid Skip)")
    if 9 in counts.index: found_reserved.append(f"{counts[9]} records of 9 (Not Stated)")
    if 96 in counts.index: found_reserved.append(f"{counts[96]} records of 96 (Valid Skip)")
    if 99 in counts.index: found_reserved.append(f"{counts[99]} records of 99 (Not Stated)")

    # Calculate how many "clean" records we have (excluding these codes)
    garbage_mask = df[var].isin([6, 9, 96, 99])
    valid_count = (~garbage_mask).sum()

    # Format output
    garbage_desc = ", ".join(found_reserved) if found_reserved else "None"
    print(f"{var:<15} | {valid_count:<15} | {garbage_desc}")


Categorical Variables
Variable        | Valid Records   | Reserved/Missing Codes Found
--------------------------------------------------------------------------------
PDCT_05         | 38118           | 539 records of 9 (Not Stated)
PMINOR          | 36505           | 2152 records of 9 (Not Stated)
PVISMIN         | 27407           | 11250 records of 9 (Not Stated)
PHTYPE          | 35357           | 1165 records of 6 (Valid Skip), 2135 records of 99 (Not Stated)
PEMPL           | 36992           | 1665 records of 9 (Not Stated)
PHGEDUC         | 30978           | 6400 records of 6 (Valid Skip), 1279 records of 99 (Not Stated)
REGION          | 38657           | None
PDWLTYPE        | 24749           | 12303 records of 6 (Valid Skip), 1605 records of 99 (Not Stated)
PAGEP1          | 38657           | None


In [None]:
# 2. Audit Continuous Variables (Income)

# Specific check for PHHTTINC (Total Household Income)
income_col = 'PHHTTINC'
max_val = df[income_col].max()
reserved_income_code = 99999999999

print(f"Variable: {income_col}")
print(f"Max Value found: {max_val}")

if max_val == reserved_income_code:
    count_reserved = (df[income_col] == reserved_income_code).sum()
    print(f"[FLAG] Found {count_reserved} records with Reserved Code {reserved_income_code}")
else:
    print("No standard reserved code (999...9) found as max value.")

Variable: PHHTTINC
Max Value found: 99999999999
[FLAG] Found 2026 records with Reserved Code 99999999999


In [20]:
print("\nContinuous Variable: PHHTTINC")
income_col = 'PHHTTINC'
reserved_income_code = 99999999999
max_val = df[income_col].max()

if max_val == reserved_income_code:
    count_reserved = (df[income_col] == reserved_income_code).sum()
    print(f"-> Found reserved code {reserved_income_code} in {count_reserved} records.")

    # Calculate stats on the clean data only
    clean_income = df[df[income_col] != reserved_income_code][income_col]
    print(f"-> Valid Income Range: ${clean_income.min():,.0f} to ${clean_income.max():,.0f}")
    print(f"-> Median Income: ${clean_income.median():,.0f}")
else:
    print("-> No standard reserved code (99...9) found.")


Continuous Variable: PHHTTINC
-> Found reserved code 99999999999 in 2026 records.
-> Valid Income Range: $-72,500 to $975,000
-> Median Income: $60,000
