# **Data Transformation**

*This is where we do data wrangling and EDA*

In [17]:
# Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## **1. Alcohol Consumption**

For SQL, we need a tidy format where each row is one observation:
- `alco_sex`: gender ("Persons", "Males", "Females")
- `alco_category_group`: high-level grouping (e.g. "Exceeded guideline(e)", "Did not exceed guideline", etc.)
- `alco_category`: specific measure under that group (e.g. "Consumed more than 10 drinks in the last week", "Consumed 5 or more drinks on any day...", "Total exceeded guideline")
- `alco_age_group`: demographic column from the headers (e.g. "15–17(c)", "18–24", "25–34", "65 years and over", "Total 18 years and over")
- `alco_estimate_000`: the numeric value in the table (population count in '000s)

The final dataframe is named `alco_consumption`.


In [31]:
# List the sheet names in the workbook.
# Opening the ExcelFile in a `with` block closes the underlying file handle
# as soon as the names have been read, instead of leaving it open.
with pd.ExcelFile("datasets/Alcohol_Consumption.xlsx") as alco_workbook:
    alco_sheets = alco_workbook.sheet_names
print(alco_sheets)

['Contents', 'Table 7.1_Estimates', 'Table 7.2_RSEs', 'Table 7.3_Proportions', 'Table 7.4_MoEs']


In [None]:
# Read the 'Table 7.1_Estimates' sheet, using spreadsheet row index 3 as the header,
# then discard every column that contains no values at all
alco_raw = (
    pd.read_excel(
        "datasets/Alcohol_Consumption.xlsx",
        sheet_name="Table 7.1_Estimates",
        header=3,
    )
    .dropna(axis=1, how="all")
)
alco_raw.head(10)

Unnamed: 0,"National Health Survey, 2022 — Australia",Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17
0,,Age group (years),,,,,,,,,,,,,,,,
1,,15–17(c),18–24,25–34,35–44,45–54,55–64,65 years and over,15–24,15–44,25–44,45–64,65–74,75 years and over,18–44,45 years and over,Total 18 years and over,Total 15 years and over
2,,ESTIMATE ('000),,,,,,,,,,,,,,,,
3,,PERSONS AGED 15 YEARS AND OVER(d),,,,,,,,,,,,,,,,
4,Persons,,,,,,,,,,,,,,,,,
5,Exceeded guideline(e),,,,,,,,,,,,,,,,,
6,Consumed more than 10 drinks in the last week,22,318.6,508.9,636,665.6,731.8,802.1,346.3,1487,1138.9,1401.9,519.9,285.5,1460.2,2207.4,3668,3687.5
7,Consumed 5 or more drinks on any day in the la...,21.1,732.9,860.3,734.1,682.1,593.6,448.9,761.1,2343.1,1584,1275.6,337.1,113,2322.7,1736.3,4056.3,4080.3
8,Total exceeded guideline,45.9,783.6,948.8,890.6,894,876.1,918.5,829.3,2657.7,1843.6,1774.2,606.2,313.4,2615.2,2684.9,5298.4,5350.2
9,Did not exceed guideline,,,,,,,,,,,,,,,,,


In [None]:
# The age-group labels sit in the second row (position 1); the first column is
# skipped because it holds the row labels rather than an age group
alco_age_row = alco_raw.iloc[1, 1:]

# New header: "row_label" for the first column, followed by the age-group labels
# (any missing label becomes an empty string)
new_cols = ["row_label", *alco_age_row.fillna("").tolist()]

# Overwrite the placeholder column names with the clean ones
alco_raw.columns = new_cols

# Check the result
print(alco_raw.columns)
alco_raw.head(10)

Index(['row_label', '15–17(c)', '18–24', '25–34', '35–44', '45–54', '55–64',
       '65 years and over', '15–24', '15–44', '25–44', '45–64', '65–74',
       '75 years and over', '18–44', '45 years and over',
       'Total 18 years and over', 'Total 15 years and over'],
      dtype='object')


Unnamed: 0,row_label,15–17(c),18–24,25–34,35–44,45–54,55–64,65 years and over,15–24,15–44,25–44,45–64,65–74,75 years and over,18–44,45 years and over,Total 18 years and over,Total 15 years and over
0,,Age group (years),,,,,,,,,,,,,,,,
1,,15–17(c),18–24,25–34,35–44,45–54,55–64,65 years and over,15–24,15–44,25–44,45–64,65–74,75 years and over,18–44,45 years and over,Total 18 years and over,Total 15 years and over
2,,ESTIMATE ('000),,,,,,,,,,,,,,,,
3,,PERSONS AGED 15 YEARS AND OVER(d),,,,,,,,,,,,,,,,
4,Persons,,,,,,,,,,,,,,,,,
5,Exceeded guideline(e),,,,,,,,,,,,,,,,,
6,Consumed more than 10 drinks in the last week,22,318.6,508.9,636,665.6,731.8,802.1,346.3,1487,1138.9,1401.9,519.9,285.5,1460.2,2207.4,3668,3687.5
7,Consumed 5 or more drinks on any day in the la...,21.1,732.9,860.3,734.1,682.1,593.6,448.9,761.1,2343.1,1584,1275.6,337.1,113,2322.7,1736.3,4056.3,4080.3
8,Total exceeded guideline,45.9,783.6,948.8,890.6,894,876.1,918.5,829.3,2657.7,1843.6,1774.2,606.2,313.4,2615.2,2684.9,5298.4,5350.2
9,Did not exceed guideline,,,,,,,,,,,,,,,,,


In [None]:
# Remove the four banner rows (index labels 0-3) that precede the first data section,
# then renumber the index from 0.
# NOTE: because the index is reset, running this cell a second time would
# remove four more rows.
alco_raw = alco_raw.drop(index=list(range(4))).reset_index(drop=True)
alco_raw.head(8)

Unnamed: 0,row_label,15–17(c),18–24,25–34,35–44,45–54,55–64,65 years and over,15–24,15–44,25–44,45–64,65–74,75 years and over,18–44,45 years and over,Total 18 years and over,Total 15 years and over
0,Persons,,,,,,,,,,,,,,,,,
1,Exceeded guideline(e),,,,,,,,,,,,,,,,,
2,Consumed more than 10 drinks in the last week,22.0,318.6,508.9,636.0,665.6,731.8,802.1,346.3,1487.0,1138.9,1401.9,519.9,285.5,1460.2,2207.4,3668.0,3687.5
3,Consumed 5 or more drinks on any day in the la...,21.1,732.9,860.3,734.1,682.1,593.6,448.9,761.1,2343.1,1584.0,1275.6,337.1,113.0,2322.7,1736.3,4056.3,4080.3
4,Total exceeded guideline,45.9,783.6,948.8,890.6,894.0,876.1,918.5,829.3,2657.7,1843.6,1774.2,606.2,313.4,2615.2,2684.9,5298.4,5350.2
5,Did not exceed guideline,,,,,,,,,,,,,,,,,
6,Consumed alcohol in the last week but did not ...,67.7,432.0,939.5,1091.8,933.7,877.3,1238.7,500.4,2528.3,2029.7,1809.3,698.9,537.6,2463.4,3043.0,5514.2,5568.7
7,Did not consume alcohol in the last week but d...,192.9,545.8,1004.3,807.8,705.6,551.4,792.2,743.1,2565.4,1820.5,1255.4,488.7,307.7,2374.5,2054.3,4426.3,4613.0


In [None]:
# Trim the trailing rows (treated as footnotes here); skipped when the frame is shorter
n_footnote_rows = 12
if len(alco_raw) >= n_footnote_rows:
    alco_raw = alco_raw.iloc[:-n_footnote_rows].copy()

# Every column other than 'row_label' is an age-group column
alco_age_cols = [col for col in alco_raw.columns if col != "row_label"]

# A header row is one with no value in any age-group column
is_header_row = ~alco_raw[alco_age_cols].notna().any(axis=1)

# Header rows whose label is one of the sex names start a new sex section
alco_sex_labels = {"Persons", "Males", "Females"}
is_sex_header = alco_raw["row_label"].isin(alco_sex_labels) & is_header_row

# All remaining header rows name a category group
is_group_header = ~is_sex_header & is_header_row

In [None]:
# Build the two context columns the same way: start empty, copy the label from
# each matching header row, then forward-fill it down to the rows beneath.
# Order matters only for column position: sex first, then category group.
for target_col, header_mask in (
    ("alco_sex", is_sex_header),
    ("alco_category_group", is_group_header),
):
    alco_raw[target_col] = pd.NA
    alco_raw.loc[header_mask, target_col] = alco_raw.loc[header_mask, "row_label"]
    alco_raw[target_col] = alco_raw[target_col].ffill()

In [None]:
# Keep only the non-header rows (those carrying values) and relabel
# 'row_label' as 'alco_category', since it now names the specific measure
alco_data = (
    alco_raw.loc[~is_header_row]
    .copy()
    .rename(columns={"row_label": "alco_category"})
)

In [None]:
# Reshape to long (tidy) format: each age-group column becomes an
# (alco_age_group, alco_estimate_000) pair on its own row
alco_consumption = alco_data.melt(
    id_vars=["alco_sex", "alco_category_group", "alco_category"],
    value_vars=alco_age_cols,
    var_name="alco_age_group",
    value_name="alco_estimate_000",
)

# Convert to numeric BEFORE dropping missing values: errors="coerce" turns any
# non-numeric cell into NaN, so dropping NaNs only afterwards guarantees that
# no missing estimate is left in the final table.
alco_consumption["alco_estimate_000"] = pd.to_numeric(
    alco_consumption["alco_estimate_000"], errors="coerce"
)
alco_consumption = alco_consumption.dropna(subset=["alco_estimate_000"]).copy()

# Fix the column order explicitly
alco_consumption = alco_consumption[
    ["alco_sex", "alco_category_group", "alco_category", "alco_age_group", "alco_estimate_000"]]

In [None]:
# Display the tidy result (the notebook shows only the first and last rows of a long frame)
alco_consumption

Unnamed: 0,alco_sex,alco_category_group,alco_category,alco_age_group,alco_estimate_000
0,Persons,Exceeded guideline(e),Consumed more than 10 drinks in the last week,15–17(c),22.0
1,Persons,Exceeded guideline(e),Consumed 5 or more drinks on any day in the la...,15–17(c),21.1
2,Persons,Exceeded guideline(e),Total exceeded guideline,15–17(c),45.9
3,Persons,Did not exceed guideline,Consumed alcohol in the last week but did not ...,15–17(c),67.7
4,Persons,Did not exceed guideline,Did not consume alcohol in the last week but d...,15–17(c),192.9
...,...,...,...,...,...
947,Females,Number of standard drinks consumed on a single...,5 to less than 7,Total 15 years and over,642.7
948,Females,Number of standard drinks consumed on a single...,7 to less than 11,Total 15 years and over,435.4
949,Females,Number of standard drinks consumed on a single...,11 or more,Total 15 years and over,198.5
950,Females,Number of standard drinks consumed on a single...,Total females who exceeded guideline and consu...,Total 15 years and over,1277.3
