In [None]:
import pandas as pd
import altair as alt
import zipfile
from pathlib import Path
from urllib.request import urlretrieve

def get_babynames_data() -> pd.DataFrame:
    """Download (once) and parse the SSA baby names archive.

    The archive is cached at ``data/names.zip`` relative to the current
    working directory and is only downloaded when that file is missing.
    Every archive member named ``yobYYYY.txt`` is read as a header-less CSV
    with columns ``name``, ``sex`` and ``n``; the year is taken from
    characters 3-6 of the member name.

    Returns:
        A frame with columns ``name``, ``sex``, ``n``, ``year`` and ``prop``,
        rows ordered by ascending year. ``prop`` is ``n`` divided by the sum
        of ``n`` over all rows that share the same year and sex.

    Raises:
        ValueError: If the archive contains no ``yobYYYY.txt`` members.
    """
    data_dir = Path('data')
    data_dir.mkdir(exist_ok=True)
    zip_path = data_dir / 'names.zip'

    if not zip_path.exists():
        url = "https://www.ssa.gov/oact/babynames/names.zip"
        # Download next to the target and rename only on success, so an
        # interrupted transfer never leaves a truncated names.zip that the
        # exists() check above would trust on the next run.
        partial_path = zip_path.with_name(zip_path.name + '.part')
        try:
            urlretrieve(url, partial_path)
            partial_path.replace(zip_path)
        finally:
            # No-op after a successful rename; removes the leftover otherwise.
            partial_path.unlink(missing_ok=True)

    dfs: list[pd.DataFrame] = []
    with zipfile.ZipFile(zip_path) as zf:
        # Sorted so the result is in year order regardless of archive order.
        for filename in sorted(zf.namelist()):
            if filename.startswith('yob') and filename.endswith('.txt'):
                with zf.open(filename) as f:
                    df = pd.read_csv(f, names=['name', 'sex', 'n'])
                df['year'] = int(filename[3:7])
                dfs.append(df)

    if not dfs:
        raise ValueError(f"no yobYYYY.txt files found in {zip_path}")

    df = pd.concat(dfs, ignore_index=True)
    # Built-in 'sum' transform broadcasts each group's total back to its rows
    # without calling a Python lambda once per (year, sex) group.
    df['prop'] = df['n'] / df.groupby(['year', 'sex'])['n'].transform('sum')
    return df

# Load the dataset once; later cells read `babynames` and `latest_year`.
babynames = get_babynames_data()
latest_year = babynames['year'].max()
print(f"Data loaded through {latest_year}")


Data loaded through 2023


In [None]:
# Peek at the first five rows of the combined frame.
babynames.head(n=5)


Unnamed: 0,name,sex,n,year,prop
0,Mary,F,7065,1880,0.077642
1,Anna,F,2604,1880,0.028617
2,Emma,F,2003,1880,0.022012
3,Elizabeth,F,1939,1880,0.021309
4,Minnie,F,1746,1880,0.019188


In [None]:
# Peek at the last five rows of the combined frame.
babynames.tail(n=5)

Unnamed: 0,name,sex,n,year,prop
2117214,Zyell,M,5,2023,3e-06
2117215,Zyen,M,5,2023,3e-06
2117216,Zymirr,M,5,2023,3e-06
2117217,Zyquan,M,5,2023,3e-06
2117218,Zyrin,M,5,2023,3e-06


In [None]:
# Structural overview: column dtypes, row count and memory footprint.
babynames.info()

# Only the last expression of a cell is rendered automatically, so the
# summary statistics have to be displayed explicitly to appear in the output.
display(babynames.describe())

# Group by year and sex, then count the number of names
babynames.groupby(['year', 'sex']).size().unstack()

<class 'pandas.DataFrame'>
RangeIndex: 2117219 entries, 0 to 2117218
Data columns (total 5 columns):
 #   Column  Dtype  
---  ------  -----  
 0   name    str    
 1   sex     str    
 2   n       int64  
 3   year    int64  
 4   prop    float64
dtypes: float64(1), int64(2), str(2)
memory usage: 80.8 MB


sex,F,M
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1880,942,1058
1881,938,996
1882,1028,1099
1883,1054,1030
1884,1172,1125
...,...,...
2019,18008,14118
2020,17504,14046
2021,17649,14080
2022,17731,14311


In [None]:
# Number of distinct values in the `name` column across the whole dataset.
unique_name_count = babynames['name'].nunique()
print(f"unique names: {unique_name_count:,}")

unique names: 103,564


In [None]:
# Sum of the `n` column over every row in the dataset.
total_births = babynames['n'].sum()
print(f"total babies: {total_births:,}")


total babies: 368,636,238


In [None]:
# All rows for the name "James" (both sexes), one per year and sex.
james_df = babynames[babynames['name'] == "James"]

alt.Chart(james_df).mark_line().encode(
    # 'd' prints year ticks as plain integers; the default number format on a
    # quantitative axis adds a thousands separator ("1,900").
    x=alt.X('year:Q', axis=alt.Axis(format='d')),
    y=alt.Y('n:Q', title='Number of Babies'),
    color='sex:N'
).properties(
    title='Popularity of the name "James" over time',
    width=600
)

Top 10 Names Analysis

In [None]:
# Sum births per (sex, name) over all years, order by that total (largest
# first), then keep the first ten rows of each sex.
top10 = (
    babynames
    .groupby(['sex', 'name'], as_index=False)['n']
    .sum()
    .sort_values(by='n', ascending=False)
    .groupby('sex')
    .head(10)
)

print("top 10 female names:")
display(top10.loc[top10['sex'] == 'F'])


top 10 female names:


Unnamed: 0,sex,name,n
44070,F,Mary,4136872
19714,F,Elizabeth,1674865
51223,F,Patricia,1573241
29342,F,Jennifer,1470608
39889,F,Linda,1454476
8336,F,Barbara,1436052
43119,F,Margaret,1259974
61490,F,Susan,1122984
18576,F,Dorothy,1110746
56351,F,Sarah,1092927


In [None]:
# Male rows of the `top10` table built above.
print("top 10 male names:")
display(top10.loc[top10['sex'] == 'M'])

top 10 male names:


Unnamed: 0,sex,name,n
89258,M,James,5226569
91451,M,John,5166241
105574,M,Robert,4841968
99923,M,Michael,4410095
113086,M,William,4178180
80275,M,David,3662190
91882,M,Joseph,2654627
105288,M,Richard,2574402
77814,M,Charles,2423009
110394,M,Thomas,2344940


In [None]:
# Names of the female rows in `top10` (totals summed over all years).
top10_female_names = top10.loc[top10['sex'] == 'F', 'name'].tolist()
# Yearly female records for just those names.
female_trends = babynames[(babynames['sex'] == 'F') & (babynames['name'].isin(top10_female_names))]

alt.Chart(female_trends).mark_line().encode(
    # 'd' prints year ticks as plain integers; the default number format on a
    # quantitative axis adds a thousands separator ("1,900").
    x=alt.X('year:Q', axis=alt.Axis(format='d')),
    y=alt.Y('n:Q', title='Number of Babies'),
    color='name:N'
).properties(
    title='Top 10 Female Names Over Time',
    width=600
)

In [None]:
# Names of the male rows in `top10` (totals summed over all years).
top10_male_names = top10.loc[top10['sex'] == 'M', 'name'].tolist()
# Yearly male records for just those names.
male_trends = babynames[(babynames['sex'] == 'M') & (babynames['name'].isin(top10_male_names))]

alt.Chart(male_trends).mark_line().encode(
    # 'd' prints year ticks as plain integers; the default number format on a
    # quantitative axis adds a thousands separator ("1,900").
    x=alt.X('year:Q', axis=alt.Axis(format='d')),
    y=alt.Y('n:Q', title='Number of Babies'),
    color='name:N'
).properties(
    title='Top 10 Male Names Over Time',
    width=600
)



Recent Trends (Latest Year)

In [None]:
# Rows of the most recent year, ranked by `prop` (share within the same year
# and sex), keeping the ten largest across both sexes.
latest_df = (
    babynames.loc[babynames['year'] == latest_year]
    .sort_values(by='prop', ascending=False)
    .head(10)
)

# Horizontal bars, longest first, coloured by sex.
alt.Chart(latest_df).mark_bar().encode(
    alt.X('prop:Q', title=f'Proportion of Babies in {latest_year}'),
    alt.Y('name:N', sort='-x', title=''),
    alt.Color('sex:N'),
).properties(
    width=600,
    title=f'Top 10 Names in {latest_year}',
)



EXERCISES

In [None]:
#1.

# Ten highest-`prop` rows for each sex in the latest year.
top_names_latest = (
    babynames[babynames["year"] == latest_year]
    .sort_values(["sex", "prop"], ascending=[True, False])
    .groupby("sex", as_index=False)
    .head(10)
)

top_names_list = top_names_latest["name"].unique().tolist()

# Keep the full history only for the selected (sex, name) pairs. Filtering on
# the name alone would also pull in the other sex's records for every name
# (e.g. girls named after a top boys' name), which would then be drawn in the
# per-sex charts built from this frame.
latest_name_trends = babynames.merge(
    top_names_latest[["sex", "name"]], on=["sex", "name"], how="inner"
)

# `sex_panel` reads the notebook-level `df`; it holds the same rows.
df = latest_name_trends.copy()

def sex_panel(sex_label: str, title: str, data: "pd.DataFrame | None" = None):
    """Build a line chart of name share over time for one sex.

    Args:
        sex_label: Value of the ``sex`` column to keep (the notebook passes
            "F" and "M").
        title: Chart title.
        data: Frame with ``year``, ``sex``, ``name``, ``prop`` and ``n``
            columns. When omitted, the notebook-level ``df`` is used, which
            keeps existing two-argument calls working.

    Returns:
        An Altair line chart (720x260) with one coloured line per name.
        Lines for names outside the legend selection are drawn at 0.12
        opacity.
    """
    source = df if data is None else data
    panel_rows = source[source["sex"] == sex_label]
    # Point selection on `name`, driven by clicks on the legend.
    sel = alt.selection_point(fields=["name"], bind="legend")

    return (
        alt.Chart(panel_rows)
        .mark_line()
        .encode(
            # 'd' prints year ticks as plain integers; the default number
            # format on a quantitative axis adds a thousands separator.
            x=alt.X("year:Q", title="Year", axis=alt.Axis(format="d")),
            y=alt.Y("prop:Q", title="Proportion of births"),
            color=alt.Color("name:N", title="Name"),
            opacity=alt.condition(sel, alt.value(1.0), alt.value(0.12)),
            tooltip=[
                alt.Tooltip("year:Q", title="Year"),
                alt.Tooltip("name:N", title="Name"),
                alt.Tooltip("prop:Q", title="Proportion of births", format=".4f"),
                alt.Tooltip("n:Q", title="Count", format=",")
            ],
        )
        .add_params(sel)
        .properties(title=title, width=720, height=260)
    )

# One panel per sex, stacked vertically under a shared overall title.
chart_f = sex_panel("F", f"Female — top names in {latest_year} (by proportion)")
chart_m = sex_panel("M", f"Male — top names in {latest_year} (by proportion)")

alt.vconcat(chart_f, chart_m).properties(
    title=f"Top names in {latest_year} (by proportion) — trends over time"
)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Faceted alternative to the stacked panels: one column per sex, built from
# the `latest_name_trends` frame.
chart = (
    alt.Chart(latest_name_trends)
    .mark_line()
    .encode(
        # 'd' prints year ticks as plain integers; the default number format
        # on a quantitative axis adds a thousands separator ("1,900").
        x=alt.X("year:Q", title="Year", axis=alt.Axis(format="d")),
        y=alt.Y("prop:Q", title="Proportion of births"),
        color=alt.Color("name:N", title="Name"),
        facet=alt.Facet(
            "sex:N",
            title=None,
            columns=2,
            sort=["F", "M"],  # optional: force panel order
        ),
    )
    .properties(
        title=f"Top names in {latest_year} (by proportion) — trends over time",
        width=340,
        height=260,
    )
)


chart