In [1]:
# ---- Setup & Imports
import pandas as pd
import numpy as np
import re

pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 50)

# Helper to show section headers nicely in output
def h(title):
    print("\n" + "="*len(title))
    print(title)
    print("="*len(title))

In [3]:
# ---- Load data ----
SIGNUPS_PATH = "customer_signups.csv"
TICKETS_PATH = "support_tickets.csv"

df = pd.read_csv("/Users/pedropalate/Desktop/customer_signups.csv")

h("Loaded: customer_signups.csv")
print(df.head())
print("\nRows, Columns:", df.shape)


Loaded: customer_signups.csv
  customer_id             name                 email signup_date     source  \
0   CUST00000    Joshua Bryant                   NaN         NaN  Instagram   
1   CUST00001   Nicole Stewart   nicole1@example.com    02-01-24   LinkedIn   
2   CUST00002     Rachel Allen   rachel2@example.com    03-01-24     Google   
3   CUST00003  Zachary Sanchez  zachary3@mailhub.org    04-01-24    YouTube   
4   CUST00004              NaN  matthew4@mailhub.org    05-01-24   LinkedIn   

  region plan_selected marketing_opt_in age      gender  
0    NaN         basic               No  34      Female  
1   West         basic              Yes  29        Male  
2  North       PREMIUM              Yes  34  Non-Binary  
3    NaN           Pro               No  40        Male  
4   West       Premium               No  25       Other  

Rows, Columns: (300, 10)


In [5]:
# ---- Quick Audit ----
h("Basic Info")
print(df.info())

h("Sample rows")
display(df.head(10))

h("Null counts per column")
null_counts = df.isna().sum().sort_values(ascending=False)
display(null_counts)

h("Unique values (preview)")
for col in ["source", "plan_selected", "gender", "region", "marketing_opt_in"]:
    if col in df.columns:
        print(f"\n{col}:")
        print(df[col].astype(str).str[:30].value_counts().head(10))




Basic Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   customer_id       298 non-null    object
 1   name              291 non-null    object
 2   email             266 non-null    object
 3   signup_date       298 non-null    object
 4   source            291 non-null    object
 5   region            270 non-null    object
 6   plan_selected     292 non-null    object
 7   marketing_opt_in  290 non-null    object
 8   age               288 non-null    object
 9   gender            292 non-null    object
dtypes: object(10)
memory usage: 23.6+ KB
None

Sample rows


Unnamed: 0,customer_id,name,email,signup_date,source,region,plan_selected,marketing_opt_in,age,gender
0,CUST00000,Joshua Bryant,,,Instagram,,basic,No,34,Female
1,CUST00001,Nicole Stewart,nicole1@example.com,02-01-24,LinkedIn,West,basic,Yes,29,Male
2,CUST00002,Rachel Allen,rachel2@example.com,03-01-24,Google,North,PREMIUM,Yes,34,Non-Binary
3,CUST00003,Zachary Sanchez,zachary3@mailhub.org,04-01-24,YouTube,,Pro,No,40,Male
4,CUST00004,,matthew4@mailhub.org,05-01-24,LinkedIn,West,Premium,No,25,Other
5,CUST00005,John Gonzales,john5@mailhub.org,06-01-24,Facebook,South,Premium,No,34,Other
6,CUST00006,Crystal Mason,crystal6@mailhub.org,07-01-24,YouTube,North,UnknownPlan,Yes,40,male
7,CUST00007,Michael Bailey,michael7@mailhub.org,08-01-24,YouTube,Central,Pro,Yes,60,Other
8,CUST00008,Bianca Morris,bianca8@example.com,09-01-24,Referral,West,Pro,Yes,25,male
9,CUST00009,Cindy Anderson,,10-01-24,Google,East,PREMIUM,No,29,FEMALE



Null counts per column


email               34
region              30
age                 12
marketing_opt_in    10
name                 9
source               9
plan_selected        8
gender               8
customer_id          2
signup_date          2
dtype: int64


Unique values (preview)

source:
source
YouTube      58
Google       50
Instagram    49
Referral     49
Facebook     40
LinkedIn     39
nan           9
??            6
Name: count, dtype: int64

plan_selected:
plan_selected
Premium        57
Pro            53
basic          46
Basic          46
PREMIUM        42
PRO            41
nan             8
UnknownPlan     6
prem            1
Name: count, dtype: int64

gender:
gender
Other         59
FEMALE        52
male          48
Male          44
Non-Binary    42
Female        41
nan            8
123            6
Name: count, dtype: int64

region:
region
North      65
East       61
South      59
West       46
Central    39
nan        30
Name: count, dtype: int64

marketing_opt_in:
marketing_opt_in
No     156
Yes    133
nan     10
Nil      1
Name: count, dtype: int64


In [6]:
# ---- Standardisation Helpers ----

def clean_text(x):
    """Lowercase, strip spaces, collapse internal whitespace."""
    if pd.isna(x):
        return x
    x = str(x).strip().lower()
    x = re.sub(r"\s+", " ", x)
    return x

# Plan normalisation map
PLAN_MAP = {
    "basic": "Basic", "básic": "Basic", "basic ": "Basic", "basic plan": "Basic",
    "pro": "Pro", " pro": "Pro", "pro ": "Pro", "PRO": "Pro",
    "premium": "Premium", "prem": "Premium", "premium ": "Premium"
}

# Gender normalisation map
GENDER_MAP = {
    "m": "Male", "male": "Male",
    "f": "Female", "female": "Female",
    "nonbinary": "Non-binary", "non-binary": "Non-binary", "nb": "Non-binary",
    "other": "Other", "prefer not to say": "Prefer not to say",
    "unknown": "Unknown", "na": "Unknown", "n/a": "Unknown", "": "Unknown"
}

# Source normalisation map (common marketing sources)
SOURCE_MAP = {
    "google": "Google", "google ads": "Google", "goog": "Google",
    "instagram": "Instagram", "ig": "Instagram",
    "referral": "Referral", "friend": "Referral", "word of mouth": "Referral",
    "facebook": "Facebook", "fb": "Facebook",
    "linkedin": "LinkedIn",
    "twitter": "Twitter", "x": "Twitter"
}

# Marketing opt-in - "Yes"/"No"
def normalize_opt_in(x):
    if pd.isna(x):
        return np.nan
    v = clean_text(x)
    if v in {"y","yes","true","1"}:
        return "Yes"
    if v in {"n","no","false","0"}:
        return "No"
    return np.nan

EMAIL_REGEX = re.compile(r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$")

def is_valid_email(x):
    if pd.isna(x):
        return False
    return bool(EMAIL_REGEX.match(str(x).strip()))








In [9]:
# ---- Clean & Standardise ----

df_raw = df.copy()

# Dates
h("Convert signup_date to datetime")
df["signup_date"] = pd.to_datetime(df["signup_date"], errors="coerce")
print("signup_date nulls after conversion:", df["signup_date"].isna().sum())

# Trim/normalise key text fields before mapping
for c in ["plan_selected","gender","source","region","name","email"]:
    if c in df.columns:
        df[c] = df[c].apply(lambda x: str(x).strip() if not pd.isna(x) else x)

# Standardise plan_selected
if "plan_selected" in df.columns:
    plan_before = df["plan_selected"].copy()
    df["plan_selected_clean"] = (df["plan_selected"]
                                 .apply(clean_text)
                                 .map(PLAN_MAP)
                                 .fillna(df["plan_selected"].str.strip().str.title()))
    plan_changed = (plan_before != df["plan_selected_clean"]).sum()
    print("plan_selected values changed:", plan_changed)

# Standardise gender
if "gender" in df.columns:
    gender_before = df["gender"].copy()
    df["gender_clean"] = (df["gender"].apply(clean_text).map(GENDER_MAP)
                          .fillna(df["gender"].str.strip().str.title()))
    gender_changed = (gender_before != df["gender_clean"]).sum()
    print("gender values changed:", gender_changed)

# Standardise source
if "source" in df.columns:
    source_before = df["source"].copy()
    df["source_clean"] = (df["source"].apply(clean_text).map(SOURCE_MAP)
                          .fillna(df["source"].str.strip().str.title()))
    source_changed = (source_before != df["source_clean"]).sum()
    print("source values changed:", source_changed)

# Marketing opt-in
if "marketing_opt_in" in df.columns:
    opt_before = df["marketing_opt_in"].copy()
    df["marketing_opt_in_clean"] = df["marketing_opt_in"].apply(normalize_opt_in)
    opt_changed = (opt_before != df["marketing_opt_in_clean"]).sum()
    print("marketing_opt_in values changed:", opt_changed)

# Email validation
if "email" in df.columns:
    df["email_valid"] = df["email"].apply(is_valid_email)

# Age: coerce to numeric, set unrealistic (e.g., < 13 or > 100) to NaN
if "age" in df.columns:
    df["age"] = pd.to_numeric(df["age"], errors="coerce")
    df.loc[(df["age"] < 13) | (df["age"] > 100), "age"] = np.nan

# Region: fill missing/empty with 'Unknown'
if "region" in df.columns:
    df["region_clean"] = df["region"].replace({"": np.nan})
    df["region_clean"] = df["region_clean"].fillna("Unknown").str.title()

# Deduplicate by customer_id 
dupes_removed = 0
if "customer_id" in df.columns:
    before_rows = len(df)
    df = df.sort_values(by=["customer_id","signup_date"], ascending=[True, True])

    df = df.drop_duplicates(subset=["customer_id"], keep="last")
    dupes_removed = before_rows - len(df)
    print("Duplicates removed (by customer_id):", dupes_removed)


# Keeping only the cleaned columns for downstream analysis
cols_to_use = ["customer_id","name","email","email_valid","signup_date",
               "source_clean","region_clean","plan_selected_clean",
               "marketing_opt_in_clean","age","gender_clean"]
df_clean = df[cols_to_use].rename(columns={
    "source_clean": "source",
    "region_clean": "region",
    "plan_selected_clean": "plan_selected",
    "marketing_opt_in_clean": "marketing_opt_in",
    "gender_clean": "gender"
})

h("Post-clean preview")
display(df_clean.head())
print("\nRows, Columns:", df_clean.shape)





Convert signup_date to datetime
signup_date nulls after conversion: 6
plan_selected values changed: 144
gender values changed: 150
source values changed: 67
marketing_opt_in values changed: 11
Duplicates removed (by customer_id): 1

Post-clean preview


Unnamed: 0,customer_id,name,email,email_valid,signup_date,source,region,plan_selected,marketing_opt_in,age,gender
0,CUST00000,Joshua Bryant,,False,NaT,Instagram,Unknown,Basic,No,34.0,Female
1,CUST00001,Nicole Stewart,nicole1@example.com,True,2024-02-01,LinkedIn,West,Basic,Yes,29.0,Male
2,CUST00002,Rachel Allen,rachel2@example.com,True,2024-03-01,Google,North,Premium,Yes,34.0,Non-binary
3,CUST00003,Zachary Sanchez,zachary3@mailhub.org,True,2024-04-01,Youtube,Unknown,Pro,No,40.0,Male
4,CUST00004,,matthew4@mailhub.org,True,2024-05-01,LinkedIn,West,Premium,No,25.0,Other



Rows, Columns: (299, 11)


In [14]:
# --- Data Quality Summary ---

h("Missing counts & percentages")
missing_counts = df_clean.isna().sum()
missing_pct = (missing_counts / len(df_clean) * 100).round(2)
dq_summary = pd.DataFrame({
    "missing_count": missing_counts,
    "missing_pct": missing_pct
}).sort_values("missing_pct", ascending=False)
display(dq_summary)

h("Duplicates removed")
print(dupes_removed)

# --- Inconsistent category values corrected (simpler + safer) ---

def unique_diff(raw_col, clean_col):
    raw_unique = set(raw_col.dropna().astype(str).str.strip().unique())
    clean_unique = set(clean_col.dropna().astype(str).str.strip().unique())
    return len(raw_unique - clean_unique)  # how many old messy categories disappeared

plan_fixed = unique_diff(df_raw["plan_selected"], df_clean["plan_selected"])
gender_fixed = unique_diff(df_raw["gender"], df_clean["gender"])
source_fixed = unique_diff(df_raw["source"], df_clean["source"])
optin_fixed = unique_diff(df_raw["marketing_opt_in"], df_clean["marketing_opt_in"])

h("Inconsistent category values corrected (counts)")
print({
    "plan_selected": plan_fixed,
    "gender": gender_fixed,
    "source": source_fixed,
    "marketing_opt_in": optin_fixed
})


    


Missing counts & percentages


Unnamed: 0,missing_count,missing_pct
email,34,11.37
age,20,6.69
marketing_opt_in,11,3.68
name,9,3.01
source,9,3.01
plan_selected,8,2.68
gender,8,2.68
signup_date,6,2.01
customer_id,1,0.33
email_valid,0,0.0



Duplicates removed
1

Inconsistent category values corrected (counts)
{'plan_selected': 5, 'gender': 3, 'source': 1, 'marketing_opt_in': 1}


In [15]:
# ---- Summary Outputs ----

# Sign-ups per week
h("Sign-ups per week (Mon-based weeks)")
weekly = (df_clean.dropna(subset=["signup_date"])
          .groupby(pd.Grouper(key="signup_date", freq="W-MON"))["customer_id"]
          .nunique()
          .reset_index(name="signups"))
display(weekly.head(12))

# Sign-ups by source/region/plan_selected
h("Sign-ups by source")
by_source = df_clean["source"].value_counts(dropna=False).reset_index()
by_source.columns = ["source","signups"]
display(by_source)

h("Sign-ups by region")
by_region = df_clean["region"].value_counts(dropna=False).reset_index()
by_region.columns = ["region","signups"]
display(by_region)

h("Sign-ups by plan_selected")
by_plan = df_clean["plan_selected"].value_counts(dropna=False).reset_index()
by_plan.columns = ["plan_selected","signups"]
display(by_plan)

# Marketing opt-in counts by gender
h("Marketing opt-in counts by gender")
opt_by_gender = (df_clean
                 .groupby(["gender","marketing_opt_in"])["customer_id"]
                 .count()
                 .reset_index(name="count"))
display(opt_by_gender)

# Age summary
h("Age summary")
age_summary = df_clean["age"].agg(["min","max","mean","median"])
age_nulls = df_clean["age"].isna().sum()
print(age_summary)
print("Null age count:", age_nulls)


Sign-ups per week (Mon-based weeks)


Unnamed: 0,signup_date,signups
0,2024-01-08,7
1,2024-01-15,5
2,2024-01-22,7
3,2024-01-29,7
4,2024-02-05,8
5,2024-02-12,5
6,2024-02-19,7
7,2024-02-26,7
8,2024-03-04,7
9,2024-03-11,6



Sign-ups by source


Unnamed: 0,source,signups
0,Youtube,58
1,Google,50
2,Instagram,49
3,Referral,49
4,Facebook,40
5,LinkedIn,38
6,,9
7,??,6



Sign-ups by region


Unnamed: 0,region,signups
0,North,65
1,East,61
2,South,58
3,West,46
4,Central,39
5,Unknown,30



Sign-ups by plan_selected


Unnamed: 0,plan_selected,signups
0,Premium,100
1,Pro,93
2,Basic,92
3,,8
4,Unknownplan,6



Marketing opt-in counts by gender


Unnamed: 0,gender,marketing_opt_in,count
0,123,No,3
1,123,Yes,3
2,Female,No,47
3,Female,Yes,44
4,Male,No,50
5,Male,Yes,37
6,Non-binary,No,20
7,Non-binary,Yes,19
8,Other,No,32
9,Other,Yes,24



Age summary
min       21.00000
max       60.00000
mean      35.53405
median    34.00000
Name: age, dtype: float64
Null age count: 20


In [17]:
# ---- Business Questions ---
!pip install scipy


# Helper: last full calendar month present in the data
max_date = df_clean["signup_date"].max()
last_full_month_start = (max_date.replace(day=1) - pd.offsets.MonthBegin(1))
last_full_month_end = last_full_month_start + pd.offsets.MonthEnd(1)

h("Last full month window")
print(last_full_month_start.date(), "to", last_full_month_end.date())

last_month_mask = (df_clean["signup_date"] >= last_full_month_start) & \
                  (df_clean["signup_date"] <= last_full_month_end)
df_last_month = df_clean[last_month_mask]

# Q1. Which acquisition source brought in the most users last month?
h("Q1: Top acquisition source last month")
source_last_month = (df_last_month["source"]
                     .value_counts()
                     .reset_index())
source_last_month.columns = ["source","signups"]
display(source_last_month)
top_source_last_month = source_last_month.iloc[0] if not source_last_month.empty else None
print("Answer (data-driven):", dict(top_source_last_month) if top_source_last_month is not None else "No data for last month")

# Q2. Which region shows signs of missing/incomplete data?
# Treating region 'Unknown' or NaN as incomplete.
h("Q2: Regions with missing/incomplete values")
region_incomplete = df_clean["region"].isna().sum() + (df_clean["region"] == "Unknown").sum()
by_region_incomplete = (df_clean
                        .assign(region_status=np.where((df_clean["region"].isna()) | (df_clean["region"]=="Unknown"),
                                                       "Incomplete","OK"))
                        ["region_status"].value_counts())
print("Total incomplete region rows:", int(region_incomplete))
display((df_clean["region"].value_counts(dropna=False).reset_index()
         .rename(columns={"index":"region","region":"rows"})))

# Q3. Are older users more or less likely to opt in to marketing?
#Approach: convert opt_in to 1 (Yes) / 0 (No), then check correlation and opt-in rate by age bins
h("Q3: Age vs Marketing Opt-in")
df_age_opt = df_clean[["age","marketing_opt_in"]].dropna()
df_age_opt = df_age_opt.assign(opt_in=(df_age_opt["marketing_opt_in"]=="Yes").astype(int))

# Spearman correlation 
corr = df_age_opt["age"].corr(df_age_opt["opt_in"], method="spearman")
print("Spearman corr(age, opt_in):", round(corr, 3))

# Also show opt-in rates by age bands
bins = [13, 18, 25, 35, 45, 55, 65, 100]
labels = ["13-17","18-24","25-34","35-44","45-54","55-64","65+"]
df_age_opt["age_band"] = pd.cut(df_age_opt["age"], bins=bins, labels=labels, right=True, include_lowest=True)
opt_rate_by_band = (df_age_opt
                    .groupby("age_band")["opt_in"]
                    .mean()
                    .reset_index(name="opt_in_rate"))
display(opt_rate_by_band)

# Q4. Which plan is most commonly selected, and by which age group?
h("Q4: Most common plan and by age group")
plan_counts = df_clean["plan_selected"].value_counts().reset_index()
plan_counts.columns = ["plan_selected","signups"]
display(plan_counts)

df_plan_age = df_clean[["plan_selected","age"]].dropna()
df_plan_age["age_band"] = pd.cut(df_plan_age["age"], bins=bins, labels=labels, right=True, include_lowest=True)
plan_by_age = (df_plan_age
               .groupby(["plan_selected","age_band"])["age"]
               .count()
               .reset_index(name="count")
               .sort_values(["plan_selected","count"], ascending=[True,False]))
display(plan_by_age)

Collecting scipy
  Downloading scipy-1.16.2-cp312-cp312-macosx_14_0_arm64.whl.metadata (62 kB)
Downloading scipy-1.16.2-cp312-cp312-macosx_14_0_arm64.whl (20.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m85.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: scipy
Successfully installed scipy-1.16.2

Last full month window
2024-11-01 to 2024-11-30

Q1: Top acquisition source last month


Unnamed: 0,source,signups
0,Google,3
1,Instagram,2
2,Referral,1
3,Facebook,1
4,LinkedIn,1


Answer (data-driven): {'source': 'Google', 'signups': np.int64(3)}

Q2: Regions with missing/incomplete values
Total incomplete region rows: 30


Unnamed: 0,rows,count
0,North,65
1,East,61
2,South,58
3,West,46
4,Central,39
5,Unknown,30



Q3: Age vs Marketing Opt-in
Spearman corr(age, opt_in): 0.039


  .groupby("age_band")["opt_in"]


Unnamed: 0,age_band,opt_in_rate
0,13-17,
1,18-24,0.430556
2,25-34,0.44186
3,35-44,0.489796
4,45-54,0.489362
5,55-64,0.428571
6,65+,



Q4: Most common plan and by age group


Unnamed: 0,plan_selected,signups
0,Premium,100
1,Pro,93
2,Basic,92
3,Unknownplan,6


  .groupby(["plan_selected","age_band"])["age"]


Unnamed: 0,plan_selected,age_band,count
1,Basic,18-24,28
2,Basic,25-34,23
4,Basic,45-54,14
3,Basic,35-44,11
5,Basic,55-64,5
0,Basic,13-17,0
6,Basic,65+,0
9,Premium,25-34,31
8,Premium,18-24,23
10,Premium,35-44,23


In [18]:
# ---- Optional Stretch: Support Tickets ----
try:
    tickets = pd.read_csv("/Users/pedropalate/Desktop/support_tickets.csv")
    h("Loaded: support_tickets.csv")
    display(tickets.head())

    tickets["ticket_date"] = pd.to_datetime(tickets["ticket_date"], errors="coerce")
    # Join on customer_id
    joined = pd.merge(df_clean, tickets, on="customer_id", how="left", suffixes=("", "_ticket"))

    # Within 2 weeks of sign-up (0-14 days)
    within_2w = joined.dropna(subset=["ticket_date","signup_date"]).copy()
    within_2w["days_since_signup"] = (within_2w["ticket_date"] - within_2w["signup_date"]).dt.days
    within_2w = within_2w[(within_2w["days_since_signup"] >= 0) & (within_2w["days_since_signup"] <= 14)]

    h("Customers contacting support within 2 weeks of sign-up")
    customers_2w = within_2w["customer_id"].nunique()
    total_customers = df_clean["customer_id"].nunique()
    print("Count:", customers_2w, " | Share:", round(customers_2w/total_customers*100, 2), "%")

    h("Support activity by plan and region (counts)")
    support_by_plan_region = (within_2w
                              .groupby(["plan_selected","region"])["ticket_id"]
                              .nunique()
                              .reset_index(name="support_tickets_2w"))
    display(support_by_plan_region)

    # Probability of contacting support by plan (any time, not just 2w) 
    any_support = (joined.assign(has_ticket=~joined["ticket_id"].isna())
                   .groupby("plan_selected")["has_ticket"]
                   .mean()
                   .reset_index(name="share_with_support_ticket"))
    h("Share contacting support (any time) by plan")
    display(any_support)

except FileNotFoundError:
    h("Support tickets file not found – skipping stretch section")

    


Loaded: support_tickets.csv


Unnamed: 0,ticket_id,customer_id,ticket_date,issue_type,resolved
0,TKT0000-1,CUST00203,2024-08-17,Billing,Yes
1,TKT0000-2,CUST00203,2024-07-22,Technical Error,Yes
2,TKT0000-3,CUST00203,2024-07-22,Other,Yes
3,TKT0001-1,CUST00266,2024-09-26,Account Setup,Yes
4,TKT0001-2,CUST00266,2024-10-09,Technical Error,No



Customers contacting support within 2 weeks of sign-up
Count: 47  | Share: 15.77 %

Support activity by plan and region (counts)


Unnamed: 0,plan_selected,region,support_tickets_2w
0,Basic,Central,2
1,Basic,East,6
2,Basic,North,2
3,Basic,South,9
4,Basic,Unknown,1
5,Basic,West,4
6,Premium,Central,4
7,Premium,East,1
8,Premium,North,3
9,Premium,South,1



Share contacting support (any time) by plan


Unnamed: 0,plan_selected,share_with_support_ticket
0,Basic,0.368421
1,Premium,0.22807
2,Pro,0.405172
3,Unknownplan,0.5


In [19]:
# ---- Export Clean Data & Tables ----

import os
os.makedirs("outputs", exist_ok=True)

df_clean.to_csv("outputs/customer_signups_clean.csv", index=False)
weekly.to_csv("outputs/signups_weekly.csv", index=False)
by_source.to_csv("outputs/signups_by_source.csv", index=False)
by_region.to_csv("outputs/signups_by_region.csv", index=False)
by_plan.to_csv("outputs/signups_by_plan.csv", index=False)
opt_by_gender.to_csv("outputs/opt_in_by_gender.csv", index=False)

# These are useful for screenshots for the PDF
age_summary.to_frame("value").to_csv("outputs/age_summary.csv")
dq_summary.to_csv("outputs/data_quality_summary.csv")

print("Exported to /outputs")

Exported to /outputs
