# SSA Popular Baby Names

### Import Python tools and Jupyter configuration

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import altair as alt
import altair_latimes as lat
import datetime as dt
import glob
import os

In [3]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 1000
alt.data_transformers.disable_max_rows()
pd.options.display.max_colwidth = None

In [4]:
today = dt.datetime.today().strftime("%Y-%m-%d")

---

### Download latest batch of names data from SSA

In [5]:
!curl -s 'https://www.ssa.gov/oact/babynames/names.zip' --output data/raw/names.zip

In [6]:
!unzip -o -q 'data/raw/names.zip' -d 'data/raw/years'

In [7]:
!cat data/raw/years/*.txt > 'data/raw/all.txt'

---

In [8]:
path = "data/raw/years"
all_files = glob.glob(os.path.join(path, "*.txt"))

df_from_each_file = (
    pd.read_csv(
        f,
        encoding="ISO-8859-1",
        header=None,
        sep=",",
        low_memory=False,
        names=["name", "sex", "count"],
    ).assign(year=f)
    for f in all_files
)
names = pd.concat(df_from_each_file, ignore_index=True)

In [9]:
names["year"] = (
    names["year"]
    .str.replace("data/raw/years/yob", "", regex=True)
    .str.replace(".txt", "", regex=True)
)

---

### Get birth totals for normalization

In [10]:
url = "https://www.ssa.gov/oact/babynames/numberUSbirths.html"

In [11]:
births = pd.read_html(url)[0]

In [12]:
births.rename(
    columns={
        "Year of birth": "year",
        "Male": "male",
        "Female": "female",
        "Total": "total",
    },
    inplace=True,
)

In [13]:
births.year = births.year.astype(str)

--- 

### Merge the dataframes together

In [14]:
df = pd.merge(names, births, on="year")

In [15]:
df.head()

Unnamed: 0,name,sex,count,year,male,female,total
0,Emily,F,25957,2000,2087895,1995340,4083235
1,Hannah,F,23084,2000,2087895,1995340,4083235
2,Madison,F,19968,2000,2087895,1995340,4083235
3,Ashley,F,17997,2000,2087895,1995340,4083235
4,Sarah,F,17706,2000,2087895,1995340,4083235


### Calculate rate

In [16]:
df["prop"] = df["count"] * 1.0 / df["total"]
df["fprop"] = df[df["sex"] == "F"]["count"] / df[df["sex"] == "F"]["female"]
df["mprop"] = df[df["sex"] == "M"]["count"] / df[df["sex"] == "M"]["male"]

In [17]:
df.head()

Unnamed: 0,name,sex,count,year,male,female,total,prop,fprop,mprop
0,Emily,F,25957,2000,2087895,1995340,4083235,0.006357,0.013009,
1,Hannah,F,23084,2000,2087895,1995340,4083235,0.005653,0.011569,
2,Madison,F,19968,2000,2087895,1995340,4083235,0.00489,0.010007,
3,Ashley,F,17997,2000,2087895,1995340,4083235,0.004408,0.00902,
4,Sarah,F,17706,2000,2087895,1995340,4083235,0.004336,0.008874,


---

### Export

In [18]:
df.to_csv("data/processed/names_births.csv", index=False)