# **SQL Insert Statements**
*This notebook generates full SQL `INSERT` statements from CSV files so they can be loaded into the SQL schema.*

In [2]:
# libs
import pandas as pd
import re

## **1. Age-Sex filter, Alcohol Consumption**

In [None]:
# === Load and prepare the data ===
# Read the tidy alcohol CSV and trim stray whitespace from the header names.
df = pd.read_csv("alco_tidy.csv")
df.columns = df.columns.str.strip()

# Translate the CSV column names into the names used by the rest of this cell.
alco_column_map = {
    "alco_sex": "filter_sex",
    "alco_age_group": "age_group",
    "alco_category_group": "alco_fact_status",
    "alco_category": "alco_fact_label",
    "alco_estimate_000": "alco_fact_est_000",
}
df = df.rename(columns=alco_column_map)

# === Parse age group ===
# Capture the two numbers of a "start–end" label (en dash or hyphen) as floats.
# NOTE(review): labels that do not contain such a range get NaN for both
# bounds — confirm that is the intended handling for open-ended groups.
age_bounds = df['age_group'].str.extract(r'(\d+)[–-](\d+)').astype(float)
df['filter_age_start'] = age_bounds[0]
df['filter_age_end'] = age_bounds[1]

# === Parse bounds from label ===
def extract_bounds(label):
    """Derive a (low, high) numeric pair from a free-text category label.

    The label is lower-cased and every number in it (integer or decimal) is
    collected in order of appearance. The result is then chosen as follows,
    first match wins:

    * not a string, or no number present -> (None, None)
    * contains "more than"               -> (first number, None)
    * contains "less than"               -> (None, first number)
    * two or more numbers                -> (first number, second number)
    * exactly one number                 -> (that number, that number)

    Note that "more than" takes precedence, so a label containing both
    "more than" and "less than" only yields a lower bound.

    Returns:
        A 2-tuple whose items are floats or None.
    """
    if not isinstance(label, str):
        return None, None

    text = label.lower()
    found = [float(token) for token in re.findall(r'(\d+\.?\d*)', text)]
    if not found:
        return None, None

    first = found[0]
    if "more than" in text:
        return first, None
    if "less than" in text:
        return None, first
    if len(found) >= 2:
        # Covers "between X and Y" as well as any other two-number label.
        return first, found[1]
    return first, first

# Compute the bounds for every label; wrapping each (low, high) tuple in a
# Series makes apply() return a two-column frame.
label_bounds = df['alco_fact_label'].apply(lambda label: pd.Series(extract_bounds(label)))
df[['alco_fact_low_bound', 'alco_fact_up_bound']] = label_bounds

# === Build AGE_SEX_FILTER and assign filter_id ===
# One row per distinct (age start, age end, sex) combination, numbered from 1.
# NOTE(review): rows whose age bounds are both NaN fall into a single
# combination per sex — confirm that is acceptable for this table.
filter_keys = ['filter_age_start', 'filter_age_end', 'filter_sex']
age_sex = df[filter_keys].drop_duplicates().reset_index(drop=True)
age_sex['filter_id'] = range(1, len(age_sex) + 1)

# Attach each row's filter_id to the fact data through the shared key columns
df = df.merge(age_sex, how='left', on=filter_keys)

# === Generate INSERTS ===

def _sql_literal(val, as_int=False):
    """Render a single cell value as a SQL literal.

    Args:
        val: The value to render (string, number, None or NaN).
        as_int: When true, a non-missing numeric value is truncated with
            int() before rendering (used for id columns).

    Returns:
        "NULL" for missing values, a single-quoted string with embedded
        single quotes doubled for str values, otherwise str(val).
    """
    if pd.isna(val):
        return "NULL"
    if isinstance(val, str):
        # Double embedded quotes so labels containing an apostrophe stay valid SQL.
        return "'" + val.replace("'", "''") + "'"
    return str(int(val)) if as_int else str(val)


def _insert_lines(table, columns, frame, int_columns=("filter_id",)):
    """Build the lines of one multi-row INSERT statement.

    Args:
        table: Target table name.
        columns: Column names, in the order they are written.
        frame: DataFrame holding at least `columns`.
        int_columns: Columns whose values are rendered as integers.

    Returns:
        A list of lines: the INSERT header, then one value tuple per row,
        separated by commas and terminated by a semicolon. An empty list is
        returned when `frame` has no rows, so no dangling "VALUES;" is emitted.
    """
    rows = [
        "(" + ", ".join(
            _sql_literal(val, col in int_columns) for col, val in zip(columns, record)
        ) + ")"
        for record in frame[columns].itertuples(index=False, name=None)
    ]
    if not rows:
        return []
    header = f"INSERT INTO {table} ({', '.join(columns)}) VALUES"
    # Every tuple except the last needs a trailing comma; the last one closes
    # the statement.
    return [header] + [row + "," for row in rows[:-1]] + [rows[-1] + ";"]


# 1. AGE_SEX_FILTER
age_sex_columns = ['filter_id', 'filter_age_start', 'filter_age_end', 'filter_sex']
age_sex_lines = _insert_lines("AGE_SEX_FILTER", age_sex_columns, age_sex)

# 2. ALCOHOL_FACT
alco_columns = ['filter_id', 'alco_fact_status', 'alco_fact_label',
                'alco_fact_low_bound', 'alco_fact_up_bound', 'alco_fact_est_000']
alco_lines = _insert_lines("ALCOHOL_FACT", alco_columns, df)

# === Write output SQL files ===
# Write UTF-8 explicitly so non-ASCII characters in the data are encoded the
# same way on every platform instead of using the locale's default encoding.
for sql_path, sql_lines in (
    ("insert_age_sex_filter.sql", age_sex_lines),
    ("insert_alcohol_fact.sql", alco_lines),
):
    with open(sql_path, "w", encoding="utf-8") as f:
        f.write("\n".join(sql_lines))
    print(f"- {sql_path}")

- insert_age_sex_filter.sql
- insert_alcohol_fact.sql


## **2. Smoking**

In [29]:
# Age-group labels that are removed from the smoking data before the
# start/end ages are parsed.
unwanted_ages = [
    '15–24',
    '15–44',
    '25–44',
    '45–64',
    '65–74',
    '75 years and over',
    '18–44',
    '45 years and over',
    'Total 18 years and over',
    'Total 15 years and over',
]

In [62]:
# Read the tidy smoking CSV and trim stray whitespace from the header names.
smoke_df = pd.read_csv("smoke_tidy.csv")
smoke_df.columns = smoke_df.columns.str.strip()

# Align the key column names with the age/sex filter table, then drop the
# age groups listed in `unwanted_ages`.
smoke_df = (
    smoke_df
    .rename(columns={"smoke_sex": "filter_sex", "smoke_age_group": "age_group"})
    .loc[lambda frame: ~frame['age_group'].isin(unwanted_ages)]
)

# Split "start–end" age labels (en dash or hyphen) into two float columns
smoke_df[['filter_age_start', 'filter_age_end']] = (
    smoke_df['age_group'].str.extract(r'(\d+)[–-](\d+)').astype(float)
)

# Look up filter_id in `age_sex`, which must already exist from the alcohol
# section; the left join keeps rows that have no matching filter.
smoke_df = smoke_df.merge(
    age_sex,
    how='left',
    on=['filter_age_start', 'filter_age_end', 'filter_sex'],
)
smoke_df.head(5)

Unnamed: 0,filter_sex,smoke_category_group,smoke_category,age_group,smoke_estimate_000,filter_age_start,filter_age_end,filter_id
0,Persons,Usual number of days smoked per week,1–2 days,15–17,4.4,15.0,17.0,1
1,Persons,Usual number of days smoked per week,3–6 days,15–17,1.3,15.0,17.0,1
2,Persons,Usual number of days smoked per week,Daily,15–17,14.7,15.0,17.0,1
3,Persons,Usual number of days smoked per week,Total current daily or weekly smoker,15–17,19.0,15.0,17.0,1
4,Persons,Usual number of cigarettes smoked per day,1 to 4 cigarettes,15–17,8.0,15.0,17.0,1


In [63]:
# Drop entire category groups that are not carried forward
excluded_groups = [
    'Usual number of cigarettes smoked per day',
    'Average number of cigarettes smoked per day (7 day average)',
    'Use of electronic cigarette (e-cigarette) / vaping device',
]
smoke_df = smoke_df.loc[~smoke_df['smoke_category_group'].isin(excluded_groups)]

# Category labels to discard, keyed by the category group they belong to
unwanted_values = {
    "Smoker status": [
        "Total persons aged 15 years and over",
    ],
    "Usual number of days smoked per week": [
        "Total current daily or weekly smoker",
    ],
}

# Keep a row when it lies outside the group or its label is not listed
for group, values in unwanted_values.items():
    outside_group = smoke_df['smoke_category_group'] != group
    label_allowed = ~smoke_df['smoke_category'].isin(values)
    smoke_df = smoke_df.loc[outside_group | label_allowed]

In [70]:
# Split the remaining rows by category group
category_group = smoke_df['smoke_category_group']
status_df = smoke_df.loc[category_group == 'Smoker status'].copy()
freq_df = smoke_df.loc[category_group == 'Usual number of days smoked per week'].copy()

# === Ex/Never smokers ===
# The status is taken from the category label; frequency is left empty.
is_non_current = status_df['smoke_category'].isin(['Ex-smoker', 'Never smoked'])
non_current_df = status_df.loc[is_non_current].assign(
    smo_fact_status=lambda frame: frame['smoke_category'],
    smo_fact_frequency=None,
)

# === Current smoker frequencies ===
current_status_df = status_df.loc[status_df['smoke_category'] == 'Current smoker'].copy()
current_freq_df = freq_df.copy()

# Pair every 'Current smoker' row with the frequency rows sharing its
# filter_id (merge's default inner join), then rename to the schema names.
# The estimate kept is the one from the frequency rows.
schema_names = {
    'smoke_category_status': 'smo_fact_status',
    'smoke_category_freq': 'smo_fact_frequency',
    'smoke_estimate_000_freq': 'smoke_estimate_000',
}
current_combined = current_status_df.merge(
    current_freq_df,
    on='filter_id',
    suffixes=('_status', '_freq'),
).rename(columns=schema_names)

# === Combine all ===
# Stack both parts and give the estimate column its schema name.
fact_columns = ['filter_id', 'smo_fact_status', 'smo_fact_frequency', 'smoke_estimate_000']
smoke_fact_df = pd.concat(
    [non_current_df[fact_columns], current_combined[fact_columns]],
    ignore_index=True,
).rename(columns={"smoke_estimate_000": "smo_fact_est_000"})

smoke_fact_df.head(5)

Unnamed: 0,filter_id,smo_fact_status,smo_fact_frequency,smo_fact_est_000
0,1,Ex-smoker,,27.9
1,1,Never smoked,,870.0
2,2,Ex-smoker,,8.4
3,2,Never smoked,,457.5
4,3,Ex-smoker,,10.2


Now that the `smoke_fact_df` DataFrame is finalized and structured to match our schema, we can generate the SQL INSERT statements for the `SMOKE_FACT` table:

In [71]:
# Ensure column order
smoke_columns = ['filter_id', 'smo_fact_status', 'smo_fact_frequency', 'smo_fact_est_000']

# Render every row as a parenthesised tuple of SQL literals
smoke_rows = []
for record in smoke_fact_df[smoke_columns].itertuples(index=False, name=None):
    vals = []
    for col, val in zip(smoke_columns, record):
        if pd.isna(val):
            vals.append("NULL")
        elif isinstance(val, str):
            # Double embedded single quotes so the literal stays valid SQL
            vals.append("'" + val.replace("'", "''") + "'")
        elif col == 'filter_id':
            # Ids are written as integers even if the column was upcast to float
            vals.append(str(int(val)))
        else:
            vals.append(str(val))
    smoke_rows.append(f"({', '.join(vals)})")

# Assemble the statement: header, comma-separated tuples, final semicolon.
# Nothing is emitted for an empty frame, avoiding a dangling "VALUES;".
smoke_lines = []
if smoke_rows:
    smoke_lines.append(
        "INSERT INTO SMOKE_FACT (filter_id, smo_fact_status, smo_fact_frequency, smo_fact_est_000) VALUES"
    )
    smoke_lines.extend(row + "," for row in smoke_rows[:-1])
    smoke_lines.append(smoke_rows[-1] + ";")

# Write to file as UTF-8 so non-ASCII characters in the labels are encoded
# the same way on every platform
with open("insert_smoke_fact.sql", "w", encoding="utf-8") as f:
    f.write("\n".join(smoke_lines))

print("- insert_smoke_fact.sql written.")

- insert_smoke_fact.sql written.
