## Data Wrangling
- V1 on 9/22/2025 8am ET

In [None]:
# Install the notebook's dependencies (gdown and pandas); --quiet trims pip's output.
!pip install --quiet gdown pandas

In [None]:
import pandas as pd

#### Random Seed and Sample Size

In [None]:
# Seed handed to the sampling step so the same rows are drawn on every run.
RANDOM_STATE = 5678

# Number of rows to draw from each age bucket.
SAMPLE_SIZE = 100

#### Download 'train.tsv'

In [None]:
# Google Drive file ID of the file to fetch.
file_id = "1W3gdWpXUnjJn_nwTpvlWwLLAfF1YPwos"  # train.tsv
# Download URL built from the file ID.
download_url = f"https://drive.google.com/uc?id={file_id}"
# Shell out to gdown; the notebook substitutes {download_url} before running the
# command, and -O names the local output file.
!gdown {download_url} -O train.tsv

In [None]:
# Parse the tab-separated train.tsv into a DataFrame, report its dimensions,
# and preview the first two rows as the cell output.
train_df = pd.read_table("train.tsv")
print(f"train_df.shape: {train_df.shape}")
train_df.head(n=2)

#### Filter to only rows having age

In [None]:
# Drop every row whose "age" value is missing, then report how many rows survive.
train_with_non_null_age = train_df.dropna(subset=["age"])
print(f"Removing nulls reduces to {len(train_with_non_null_age)}/{len(train_df)} rows")

#### Separate out dfs for each age bucket
- Exclude teens age bucket for COPPA considerations

In [None]:
# Draw a reproducible random sample of up to SAMPLE_SIZE rows from each age
# bucket and collect the per-bucket samples in dfs_list (in age_order order).
# NOTE(review): the "fourties" spelling presumably matches the label used in the
# dataset's "age" column -- confirm before "correcting" it.
age_order = ["twenties", "thirties", "fourties", "fifties", "sixties", "seventies", "eighties", "nineties"]
dfs_list = []
for age_bucket in age_order:
    temp_df = train_with_non_null_age[train_with_non_null_age["age"] == age_bucket]
    print(f"{age_bucket} filtered rows: {temp_df.shape[0]}")

    # DataFrame.sample raises ValueError when n exceeds the number of available
    # rows (sampling without replacement), so cap n at the bucket size.
    n_rows = min(SAMPLE_SIZE, temp_df.shape[0])
    if n_rows == 0:
        print(f"WARNING: {age_bucket} has no rows; skipping this bucket")
        continue
    if n_rows < SAMPLE_SIZE:
        print(f"WARNING: {age_bucket} has only {n_rows} rows; sampling all of them instead of {SAMPLE_SIZE}")

    sample_df = temp_df.sample(n=n_rows, random_state=RANDOM_STATE)
    print(f"{age_bucket} sample size rows: {sample_df.shape[0]}")
    dfs_list.append(sample_df)

In [None]:
# Stack the per-bucket samples into one DataFrame with a fresh 0..N-1 row index,
# report its dimensions, and preview the first two rows as the cell output.
sample_df = pd.concat(dfs_list).reset_index(drop=True)
print(f"Combined sample df has shape: {sample_df.shape}")
sample_df.head(n=2)

In [None]:
# Persist the combined sample. index=True is the pandas default, spelled out here:
# the row index is written as the first (unnamed) column of the CSV.
sample_df.to_csv("common_voices_sample1.csv", index=True)