# SSA Popular Baby Names

### Import Python tools and Jupyter configuration

In [3]:
%load_ext lab_black

In [4]:
import pandas as pd
import altair as alt
import altair_latimes as lat
import datetime as dt
import glob
import os

In [5]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 1000
alt.data_transformers.disable_max_rows()
pd.options.display.max_colwidth = None

In [6]:
today = dt.datetime.today().strftime("%Y-%m-%d")

---

### Download latest batch of names data from SSA

In [7]:
# Silently download the file from the url into specific output directory
!curl -s 'https://www.ssa.gov/oact/babynames/names.zip' --output data/raw/names.zip

In [8]:
!unzip -o -q 'data/raw/names.zip' -d 'data/raw/years'

In [9]:
!cat data/raw/years/*.txt > 'data/raw/all.txt'

---

In [10]:
path = "data/raw/years"
all_files = glob.glob(os.path.join(path, "*.txt"))

df_from_each_file = (
    pd.read_csv(
        f,
        encoding="ISO-8859-1",
        header=None,
        sep=",",
        low_memory=False,
        names=["name", "sex", "count"],
    ).assign(year=f)
    for f in all_files
)
names = pd.concat(df_from_each_file, ignore_index=True)

In [11]:
names["year"] = (
    names["year"]
    .str.replace("data/raw/years/yob", "", regex=True)
    .str.replace(".txt", "", regex=True)
)

In [12]:
names.head()

Unnamed: 0,name,sex,count,year
0,Emily,F,25957,2000
1,Hannah,F,23084,2000
2,Madison,F,19968,2000
3,Ashley,F,17997,2000
4,Sarah,F,17706,2000


### Limit to names since 1900

In [16]:
names[names["year"].astype(int) >= 190]

Unnamed: 0,name,sex,count,year
0,Emily,F,25957,2000
1,Hannah,F,23084,2000
2,Madison,F,19968,2000
3,Ashley,F,17997,2000
4,Sarah,F,17706,2000
...,...,...,...,...
2020858,Zyheem,M,5,2019
2020859,Zykel,M,5,2019
2020860,Zyking,M,5,2019
2020861,Zyn,M,5,2019


---

### Get birth totals for normalization

In [None]:
url = "https://www.ssa.gov/oact/babynames/numberUSbirths.html"

In [None]:
births = pd.read_html(url)[0]

In [None]:
births.rename(
    columns={
        "Year of birth": "year",
        "Male": "male",
        "Female": "female",
        "Total": "total",
    },
    inplace=True,
)

In [None]:
births.year = births.year.astype(str)
births.head()

--- 

### Merge the dataframes together

In [None]:
df = pd.merge(names, births, on="year")

In [None]:
df.head()

### Calculate rate

In [None]:
df["prop"] = df["count"] * 1.0 / df["total"]
df["fprop"] = df[df["sex"] == "F"]["count"] / df[df["sex"] == "F"]["female"]
df["mprop"] = df[df["sex"] == "M"]["count"] / df[df["sex"] == "M"]["male"]

In [None]:
df.head()

---

### Export

In [None]:
df.to_csv("data/processed/names_births.csv", index=False)

In [None]:
names.to_csv("data/processed/names.csv", index=False)

In [None]:
births.to_csv("data/processed/births.csv", index=False)