# **SQL Insert Statements**
*This notebook generates complete SQL INSERT statements from the tidy CSV files so that the data can be loaded into the SQL schema.*

In [2]:
# libs
import pandas as pd
import re

## **1. Age-Sex filter, Alcohol Consumption, Smoking**

In [4]:
# === Load the tidy alcohol CSV and normalise its headers ===
df = pd.read_csv("alco_tidy.csv")
df.columns = df.columns.str.strip()  # drop stray whitespace around header names

# Translate the CSV's alco_* headers into the column names used further down
# when the INSERT statements are generated.
alco_renames = {
    "alco_sex": "filter_sex",
    "alco_age_group": "age_group",
    "alco_category_group": "alco_fact_status",
    "alco_category": "alco_fact_label",
    "alco_estimate_000": "alco_fact_est_000",
}
df = df.rename(columns=alco_renames)

# === Parse age group ===
# Capture the two integers of an "NN-NN" range (plain hyphen or en dash).
# Values of age_group without such a range produce NaN in both columns,
# which is why the result is stored as float rather than int.
age_bounds = df['age_group'].str.extract(r'(\d+)[–-](\d+)').astype(float)
df['filter_age_start'] = age_bounds[0]
df['filter_age_end'] = age_bounds[1]

# === Parse bounds from label ===
def extract_bounds(label):
    """Derive a (lower, upper) pair of numeric bounds from a category label.

    The label is lower-cased and scanned for numbers (integers or decimals).

    Returns:
        A 2-tuple whose items are floats or None:
        * non-string input, or no number in the label -> (None, None)
        * label contains "more than"                  -> (first number, None)
        * label contains "less than"                  -> (None, first number)
        * exactly one number                          -> (n, n)
        * two or more numbers                         -> (first, second)

    "more than" is checked before "less than", so a label containing both
    phrases is treated as an open-ended lower bound.
    """
    if not isinstance(label, str):
        return None, None

    text = label.lower()
    numbers = [float(token) for token in re.findall(r'(\d+\.?\d*)', text)]

    # Without at least one number no bound can be derived, whatever the wording.
    if not numbers:
        return None, None

    if "more than" in text:
        return numbers[0], None
    if "less than" in text:
        return None, numbers[0]

    # Remaining labels are plain values or ranges ("between a and b", "a-b").
    if len(numbers) == 1:
        return numbers[0], numbers[0]
    return numbers[0], numbers[1]

# Run extract_bounds over every label; wrapping the returned tuple in a
# Series makes apply() expand it into two columns (lower, upper).
parsed_bounds = df['alco_fact_label'].apply(
    lambda label: pd.Series(extract_bounds(label))
)
df[['alco_fact_low_bound', 'alco_fact_up_bound']] = parsed_bounds

# === Build AGE_SEX_FILTER and assign filter_id ===
# One row per distinct (age start, age end, sex) combination, numbered from 1
# in order of first appearance.
filter_keys = ['filter_age_start', 'filter_age_end', 'filter_sex']
age_sex = df[filter_keys].drop_duplicates().reset_index(drop=True)
age_sex['filter_id'] = range(1, len(age_sex) + 1)

# Attach each fact row's filter_id by joining on the same three key columns.
df = df.merge(age_sex, how='left', on=filter_keys)

# === Generate INSERTS ===

def _sql_literal(val, as_int=False):
    """Render a single cell value as a SQL literal.

    Args:
        val: The cell value (string, number, None or NaN).
        as_int: When True, numeric values are written as integers
            (``1`` instead of ``1.0``); used for id columns.

    Returns:
        ``NULL`` for missing values, a single-quoted string for text
        (embedded single quotes doubled), otherwise ``str(val)``.
    """
    if pd.isna(val):
        return "NULL"
    if isinstance(val, str):
        # Double embedded quotes so a label containing an apostrophe cannot
        # terminate the SQL string literal early.
        return "'" + val.replace("'", "''") + "'"
    return str(int(val)) if as_int else str(val)


def _build_insert(table, columns, frame, int_columns=('filter_id',)):
    """Build the lines of one multi-row INSERT statement.

    Args:
        table: Target table name.
        columns: Column names, used both for the column list of the
            statement and to pick the values out of each row of `frame`.
        frame: DataFrame holding the rows to insert.
        int_columns: Columns whose numeric values are written as integers.

    Returns:
        A list of lines: the ``INSERT ... VALUES`` header followed by one
        parenthesised tuple per row. Every tuple except the last ends with
        a comma and the last ends with a semicolon, so the lines can simply
        be joined with newlines. An empty `frame` yields a single SQL
        comment line instead of a statement with no rows.
    """
    tuples = [
        "(" + ", ".join(
            _sql_literal(row[col], as_int=col in int_columns) for col in columns
        ) + ")"
        for _, row in frame.iterrows()
    ]
    if not tuples:
        return [f"-- no rows to insert into {table}"]

    header = f"INSERT INTO {table} ({', '.join(columns)}) VALUES"
    # Rows of a multi-row VALUES list must be comma separated.
    return [header] + [t + "," for t in tuples[:-1]] + [tuples[-1] + ";"]


# 1. AGE_SEX_FILTER
age_sex_columns = ['filter_id', 'filter_age_start', 'filter_age_end', 'filter_sex']
age_sex_lines = _build_insert("AGE_SEX_FILTER", age_sex_columns, age_sex)

# 2. ALCOHOL_FACT
alco_columns = ['filter_id', 'alco_fact_status', 'alco_fact_label',
                'alco_fact_low_bound', 'alco_fact_up_bound', 'alco_fact_est_000']
alco_lines = _build_insert("ALCOHOL_FACT", alco_columns, df[alco_columns])

# === Write output SQL files ===
# The encoding is pinned to UTF-8 so the files do not depend on the machine's
# locale: labels may contain non-ASCII characters (the age-group pattern
# already expects an en dash), which a locale codec could reject or garble.
with open("insert_age_sex_filter.sql", "w", encoding="utf-8") as f:
    f.write("\n".join(age_sex_lines))
print("- insert_age_sex_filter.sql")

with open("insert_alcohol_fact.sql", "w", encoding="utf-8") as f:
    f.write("\n".join(alco_lines))
print("- insert_alcohol_fact.sql")

- insert_age_sex_filter.sql
- insert_alcohol_fact.sql


In [5]:
# === Load the tidy smoking CSV and normalise its headers ===
smoke_df = pd.read_csv("smoke_tidy.csv")
smoke_df.columns = smoke_df.columns.str.strip()  # drop stray whitespace around header names

# Translate the CSV's smoke_* headers into the names used in the rest of
# this cell.
smoke_renames = {
    "smoke_sex": "filter_sex",
    "smoke_age_group": "age_group",
    "smoke_category_group": "category_group",
    "smoke_category": "category_value",
    "smoke_estimate_000": "smo_fact_est_000",
}
smoke_df = smoke_df.rename(columns=smoke_renames)

# Split an "NN-NN" age range (plain hyphen or en dash) into start and end;
# values without such a range become NaN in both columns.
smoke_age_bounds = smoke_df['age_group'].str.extract(r'(\d+)[–-](\d+)').astype(float)
smoke_df['filter_age_start'] = smoke_age_bounds[0]
smoke_df['filter_age_end'] = smoke_age_bounds[1]

# Look up filter_id in the age_sex table built from the alcohol data.
# This is a left join, so rows without a matching age/sex combination are
# kept and end up with a missing filter_id.
smoke_filter_keys = ['filter_age_start', 'filter_age_end', 'filter_sex']
smoke_df = smoke_df.merge(age_sex, how='left', on=smoke_filter_keys)

# Extract status, device, frequency from category_group logic
def map_smoke_fields(row):
    """Route a row's category value into the status/device/frequency slot.

    Args:
        row: Mapping-like row providing 'category_group' and, when the group
            is recognised, 'category_value'.

    Returns:
        A three-element pandas Series ordered (status, device, frequency).
        The slot selected by 'category_group' holds 'category_value' and the
        other slots are empty; an unrecognised group leaves all three empty.
    """
    slot_by_group = {
        'Smoker_status': 0,
        'Smoker_device': 1,
        'Smoker_frequency': 2,
    }
    fields = [None, None, None]

    slot = slot_by_group.get(row['category_group'])
    if slot is not None:
        fields[slot] = row['category_value']

    return pd.Series(fields)

# Expand each row's category into three columns, one per kind of smoking fact.
smoke_fields = smoke_df.apply(map_smoke_fields, axis=1)
smoke_df[['smo_fact_status', 'smo_fact_device', 'smo_fact_frequency']] = smoke_fields

# Optional: remove unpaired rows
# smoke_df = smoke_df[smoke_df['smo_fact_status'].notna()]

# Sum the estimates of rows sharing the same filter and category values.
# dropna=False keeps groups whose keys are missing; without it every row
# would be dropped, since map_smoke_fields fills only one of the three
# smo_fact_* columns per row.
smoke_group_keys = ['filter_id', 'smo_fact_status', 'smo_fact_device', 'smo_fact_frequency']
smoke_grouped = (
    smoke_df
    .groupby(smoke_group_keys, dropna=False)
    .agg({'smo_fact_est_000': 'sum'})
    .reset_index()
)

# === Generate INSERTS for SMOKE_FACT ===
smoke_columns = ['filter_id', 'smo_fact_status', 'smo_fact_device', 'smo_fact_frequency', 'smo_fact_est_000']


def _smoke_sql_literal(val, as_int=False):
    """Render a single cell value as a SQL literal.

    Args:
        val: The cell value (string, number, None or NaN).
        as_int: When True, numeric values are written as integers
            (``1`` instead of ``1.0``).

    Returns:
        ``NULL`` for missing values, a single-quoted string for text
        (embedded single quotes doubled), otherwise ``str(val)``.
    """
    if pd.isna(val):
        return "NULL"
    if isinstance(val, str):
        # Double embedded quotes so a value containing an apostrophe cannot
        # terminate the SQL string literal early.
        return "'" + val.replace("'", "''") + "'"
    return str(int(val)) if as_int else str(val)


# One parenthesised tuple per grouped row. filter_id is written as an integer
# because the left merge turns the column into floats whenever some row had
# no matching filter.
smoke_tuples = [
    "(" + ", ".join(
        _smoke_sql_literal(row[col], as_int=(col == 'filter_id'))
        for col in smoke_columns
    ) + ")"
    for _, row in smoke_grouped.iterrows()
]

if smoke_tuples:
    # Rows of a multi-row VALUES list must be comma separated; only the last
    # one carries the terminating semicolon.
    smoke_lines = (
        ["INSERT INTO SMOKE_FACT (filter_id, smo_fact_status, smo_fact_device, smo_fact_frequency, smo_fact_est_000) VALUES"]
        + [t + "," for t in smoke_tuples[:-1]]
        + [smoke_tuples[-1] + ";"]
    )
else:
    # An INSERT with an empty VALUES list is not valid SQL; emit a comment.
    smoke_lines = ["-- no rows to insert into SMOKE_FACT"]

# Write to file
# The encoding is pinned to UTF-8 so the output does not depend on the
# machine's locale; category values may contain non-ASCII characters.
with open("insert_smoke_fact.sql", "w", encoding="utf-8") as f:
    f.write("\n".join(smoke_lines))
print("- insert_smoke_fact.sql")

- insert_smoke_fact.sql
