In [1]:
import pandas as pd
import PyPDF2 as pdf

import re
import os
import requests
import yaml
import shutil

In [2]:
with open('./info.yaml') as file:
    info = yaml.load(file, Loader=yaml.FullLoader)
    file_links = info['pdf-file-links']

In [3]:
directory = 'source_files/'

if not os.path.exists(directory):
    os.makedirs(directory)

for num, link in enumerate(file_links):
    with requests.get(link, stream=True) as r:
        with open('./' + directory + 'data_file' + str(num) + '.pdf', 'wb') as f:
            f.write(r.content)

In [4]:
def get_rank_order():
    temp_rank_order = []
    nums_1 = [*range(1,51)]
    nums_2 = [*range(51,101)]

    for n in nums_1:
        temp_rank_order.append(n)
        temp_rank_order.append(nums_2[n - 1])
    return temp_rank_order


In [5]:
def get_from_list(names_list):
    final_list = {}
    
    rank_order = get_rank_order()
    
    for i, m in enumerate(names_list):
        rank = rank_order[i]
        split_name_pattern = '([A-Za-z]+)(\d{1,4})'
        split = re.match(split_name_pattern, m)
    
        details = {'name':split[1], 'count': int(split[2])}
        final_list[rank] = details
    
    return final_list

In [6]:
def get_data_from_pdf(pdf_page):
    pdf_text = pdf_page.extractText() \
        .replace("\n", "") \
        .replace("-", "") \
        .replace(" ", "")
    
    year_pattern = 'Names(\d{1,4})'
    year = re.findall(year_pattern, pdf_text)[0]
    year = int(year)
    
    middle_nums = [*range(51,101)]
    temp_males = []
    temp_females = []

    for ind, number in enumerate(middle_nums):

        first = str(ind + 1)
        middle = str(number)
        end = str(ind + 2)
        
        name_pat = '[A-Za-z]+\d{1,4}'
        
        pattern = first + '(' + name_pat + name_pat + ')' + middle + '(' + name_pat + name_pat + ')'
        
        pattern if number == 100 else pattern + end + '\w'
        
        prog = re.compile(pattern)
        result = prog.findall(pdf_text)

        for name in result[0]:
            new_pat = '(' + name_pat + ')(' + name_pat + ')'
            prog = re.compile(new_pat)
            result2 = prog.findall(name)
            temp_males.append(result2[0][0])
            temp_females.append(result2[0][1])
    
    males = get_from_list(temp_males)
    females = get_from_list(temp_females)
    
    return {year : {'male': males, 'female': females} }


In [7]:
final_data = {}
files = []

In [8]:
for filename in os.listdir(directory):
    if filename.endswith(".pdf"): 
         files.append(os.path.join(directory, filename))

for pdf_filename in files:
    temp_pdf = pdf.PdfFileReader(pdf_filename)
    num_pages = temp_pdf.getNumPages()
    
    for page_num in range(num_pages):
        page = temp_pdf.getPage(page_num)
        pdf_data = get_data_from_pdf(page)

        final_data.update(pdf_data)


In [9]:
data_array = []
for year, year_data in final_data.items():
    for gender, gender_data in year_data.items():
        for rank, rank_data in gender_data.items():
            row = [year, rank, gender, rank_data['name'], rank_data['count']]
            data_array.append(row)

In [10]:
baby_names_data = pd.DataFrame(data_array, columns=['year', 'rank', 'gender', 'name', 'count'])
baby_names_data.sort_values(['year', 'rank'], ignore_index=True, inplace=True)
baby_names_data

Unnamed: 0,year,rank,gender,name,count
0,1952,1,male,John,2205
1,1952,1,female,Christine,1346
2,1952,2,male,Peter,1903
3,1952,2,female,Susan,1170
4,1952,3,male,Robert,1787
...,...,...,...,...,...
13595,2019,98,female,MADELINE,7249
13596,2019,99,male,ISAIAH,94
13597,2019,99,female,HEIDI,7250
13598,2019,100,male,THEO,94
