# Initialization

In [126]:
import pandas as pd
import re
import numpy as np
import urllib
from bs4 import BeautifulSoup as Soup
import os
from pathlib import Path

In [140]:
# Empty frame that fixes the column layout of the speakers table.
speakers = pd.DataFrame(
    columns=[
        "speaker_id",
        "first_name", "last_name",
        "type", "party",
        "state", "district",
        "bio_guide_id", "congress_id",
    ]
)

In [141]:
# Display the speakers frame (no rows yet) to confirm its column layout.
speakers

Unnamed: 0,speaker_id,first_name,last_name,type,party,state,district,bio_guide_id,congress_id


In [142]:
# Empty frame that fixes the column layout of the speeches table.
speeches = pd.DataFrame(
    columns=[
        "speech_id",
        "last_name", "speaker_id",
        "proceeding_id", "topic_id",
        "word_count", "speech_text",
        "file_name", "mods_file",
    ]
)

In [143]:
# Display the speeches frame (no rows yet) to confirm its column layout.
speeches

Unnamed: 0,speech_id,last_name,speaker_id,proceeding_id,topic_id,word_count,speech_text,file_name,mods_file


# Master Mods Parsing

In [144]:
def get_all_extensions(file):
    """Parse a file with BeautifulSoup and return all of its ``extension`` elements.

    Args:
        file: Path of the file to read.

    Returns:
        Whatever ``soup.find_all("extension")`` yields for the parsed
        document (an empty result when there are no such elements).
    """
    # The context manager closes the handle as soon as the text is read;
    # the previous bare open(...).read() left that to the garbage collector.
    with open(file) as handle:
        soup = Soup(handle.read(), "lxml")
    return soup.find_all("extension")

In [145]:
def get_cong_member_tag(cong_member_extension):
    """Look up the ``congmember`` element inside an extension element.

    Returns whatever ``cong_member_extension.find("congmember")`` returns.
    """
    return cong_member_extension.find("congmember")

In [146]:
def get_party(cong_member_tag):
    """Return the tag's ``party`` attribute, or ``'N/A'`` when it is unavailable.

    Args:
        cong_member_tag: Object exposing an ``attrs`` mapping (may be None).
    """
    try:
        return cong_member_tag.attrs['party']
    except (AttributeError, KeyError, TypeError):
        # Missing tag, missing attribute, or non-subscriptable attrs.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return 'N/A'
def get_type(cong_member_tag):
    """Return the tag's ``type`` attribute, or ``'N/A'`` when it is unavailable.

    Args:
        cong_member_tag: Object exposing an ``attrs`` mapping (may be None).
    """
    try:
        return cong_member_tag.attrs['type']
    except (AttributeError, KeyError, TypeError):
        # Missing tag, missing attribute, or non-subscriptable attrs.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return 'N/A'
def get_authority_id(cong_member_tag):
    """Return the tag's ``authorityid`` attribute, or ``'N/A'`` when it is unavailable.

    Args:
        cong_member_tag: Object exposing an ``attrs`` mapping (may be None).
    """
    try:
        return cong_member_tag.attrs['authorityid']
    except (AttributeError, KeyError, TypeError):
        # Missing tag, missing attribute, or non-subscriptable attrs.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return 'N/A'
def get_bioguide_id(cong_member_tag):
    """Return the tag's ``bioguideid`` attribute, or ``'N/A'`` when it is unavailable.

    Args:
        cong_member_tag: Object exposing an ``attrs`` mapping (may be None).
    """
    try:
        return cong_member_tag.attrs['bioguideid']
    except (AttributeError, KeyError, TypeError):
        # Missing tag, missing attribute, or non-subscriptable attrs.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return 'N/A'
def get_state(cong_member_tag):
    """Return the tag's ``state`` attribute, or ``'N/A'`` when it is unavailable.

    Args:
        cong_member_tag: Object exposing an ``attrs`` mapping (may be None).
    """
    try:
        return cong_member_tag.attrs['state']
    except (AttributeError, KeyError, TypeError):
        # Missing tag, missing attribute, or non-subscriptable attrs.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return 'N/A'
def get_congress_id(cong_member_tag):
    """Return the tag's ``congress`` attribute, or ``'N/A'`` when it is unavailable.

    Args:
        cong_member_tag: Object exposing an ``attrs`` mapping (may be None).
    """
    try:
        return cong_member_tag.attrs['congress']
    except (AttributeError, KeyError, TypeError):
        # Missing tag, missing attribute, or non-subscriptable attrs.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return 'N/A'

In [147]:
def get_district(cong_member_extension):
    """Return the ``.string`` of the extension's ``district`` element.

    Args:
        cong_member_extension: Object whose ``find("district")`` returns the
            district element or None.

    Returns:
        ``district_tag.string`` when a district element exists, else None.
    """
    district_tag = cong_member_extension.find("district")
    # Identity check: `== None` would be delegated to the tag's own __eq__.
    if district_tag is None:
        return None
    return district_tag.string

In [148]:
def get_first_name(cong_member_tag):
    """Return the first whitespace-separated word of the tag's ``authority-fnf`` name.

    Args:
        cong_member_tag: Object whose ``select`` method accepts a CSS selector
            and returns a list of matching elements.

    Returns:
        The first word of the first matching element's ``.text``, or ``'N/A'``
        (after printing a diagnostic) when there is no matching element or its
        text is empty.
    """
    matches = cong_member_tag.select("name[type='authority-fnf']")
    # Test the match list *before* indexing: indexing an empty list raised
    # IndexError, so the original fallback branch could never run.
    name_parts = matches[0].text.split() if matches else []
    if not name_parts:
        print("no first_name")
        print(cong_member_tag)
        return 'N/A'
    return name_parts[0]

In [149]:
def get_last_name(cong_member_tag):
    """Return the upper-cased surname from the tag's ``authority-lnf`` name.

    The surname is everything before the first comma of the matching
    element's ``.string``.

    Args:
        cong_member_tag: Object whose ``select`` method accepts a CSS selector
            and returns a list of matching elements.

    Returns:
        The upper-cased surname, or ``'N/A'`` (after printing a diagnostic)
        when there is no matching element or its ``.string`` is None.
    """
    matches = cong_member_tag.select("name[type='authority-lnf']")
    # Test the match list *before* indexing: indexing an empty list raised
    # IndexError, so the original fallback branch could never run.
    full_name = matches[0].string if matches else None
    if full_name is None:
        print("no last_name")
        print(cong_member_tag)
        return 'N/A'
    # "[^,]*" always matches (possibly the empty string), so group(0) is safe.
    return re.match("[^,]*", full_name).group(0).upper()

In [150]:
# Build the speakers table from every file in the master-mods directory.
mastermods_dir = "/Users/halliday/projects/searchlight/parsing/mastermods"
# Filter the macOS metadata file out instead of list.remove(), which raises
# ValueError when '.DS_Store' is not present.
filenames = [name for name in os.listdir(mastermods_dir) if name != '.DS_Store']

# IDs already in the table, kept in a set so the duplicate check does not
# rescan the whole frame for every member.
known_speaker_ids = set(speakers['speaker_id'])
# Rows are collected here and added in one step: DataFrame.append was removed
# in pandas 2.0 and re-copied the frame on every call.
new_speaker_rows = []
for filename in filenames:
    extensions = get_all_extensions(mastermods_dir + "/" + filename)
    print("            " + filename)
    for extension in extensions:
        cong_member_tag = get_cong_member_tag(extension)
        if cong_member_tag is None:
            continue
        authority_id = get_authority_id(cong_member_tag)
        # Skip members already recorded and members without a usable ID.
        if authority_id in known_speaker_ids or authority_id == "" or authority_id == "N/A":
            continue
        known_speaker_ids.add(authority_id)
        new_speaker_rows.append({'speaker_id': authority_id,
                                 'first_name': get_first_name(cong_member_tag),
                                 'last_name': get_last_name(cong_member_tag),
                                 'type': get_type(cong_member_tag),
                                 'party': get_party(cong_member_tag),
                                 'state': get_state(cong_member_tag),
                                 'district': get_district(extension),
                                 'bio_guide_id': get_bioguide_id(cong_member_tag),
                                 'congress_id': get_congress_id(cong_member_tag)})

if new_speaker_rows:
    speakers = pd.concat(
        [speakers, pd.DataFrame(new_speaker_rows, columns=speakers.columns)],
        ignore_index=True,
    )

            mods113.xml
            mods107.xml
            mods106.xml
            mods112.xml
authId not found
            mods110.xml
            mods111.xml
            mods105.xml
            mods114.xml
            mods108.xml
            mods109.xml


In [151]:
# Order the table by last_name; sort_values returns a new frame, which is
# rebound to `speakers`.
speakers = speakers.sort_values('last_name')

In [152]:
# Write the table to 'masterspeakers.csv' (relative path, so it lands in the
# current working directory) without the index column.
speakers.to_csv('masterspeakers.csv', index=False)

# Speeches Parsing

In [153]:
# Directory the speaker CSV is read from.
speakersDir = Path('/Users/halliday/projects/searchlight/parsing')
# Rebinds `speakers` to the contents of 'updatedspeakers.csv'.
# NOTE(review): the earlier cell writes 'masterspeakers.csv', and to a relative
# path; this reads a different file name from an absolute directory - confirm
# that 'updatedspeakers.csv' already exists there on a fresh run.
speakers = pd.read_csv(speakersDir/'updatedspeakers.csv')
# Preview the first rows.
speakers.head()

Unnamed: 0,speaker_id,first_name,last_name,type,party,state,district,bio_guide_id,congress_id
0,2,Neil,ABERCROMBIE,REPRESENTATIVE,D,HI,1.0,A000014,107
1,1269,Spencer,ABRAHAM,SENATOR,R,MI,,A000355,106
2,2244,Ralph,ABRAHAM,REPRESENTATIVE,R,LA,5.0,A000374,114
3,4,Gary,ACKERMAN,REPRESENTATIVE,D,NY,5.0,A000022,107
4,2006,Sandy,ADAMS,REPRESENTATIVE,R,FL,24.0,A000366,112


In [154]:
def remove_space(regex):
    """Return the text matched by a regex match object with every space character removed.

    Intended for use as the replacement callable of ``re.sub``.
    """
    matched_text = regex.group()
    return "".join(matched_text.split(' '))

In [155]:
def sep_speech(filepath):
    """Split a text file into alternating surname / speech-text entries.

    The text is split on the honorifics ``Mr. ``, ``Ms. `` and ``Mrs. ``.
    A chunk is kept only when it starts with a run of capital letters
    followed by ``". "`` (the speaker's surname); other chunks are dropped.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A flat list ``[surname, speech, surname, speech, ...]``.
    """
    with open(filepath) as file:
        parse_file = file.read()
    # NOTE(review): newlines are removed without inserting a space, so words
    # separated only by a line break are fused - confirm this is intended.
    parse_file = parse_file.replace('\n', '')
    # Glue "Mr. Xy..." (e.g. "Mr. Speaker") to "Mr.Xy..." so forms of address
    # inside a speech are not treated as speaker boundaries by the split below.
    parse_file = re.sub(r'Mr\. ([A-Z][a-z])', r'Mr.\1', parse_file)

    # Dots are escaped so only a literal "." after the honorific matches.
    # The text before the first honorific is not a speech, so drop it.
    chunks = re.split(r'Mr\. |Ms\. |Mrs\. ', parse_file)[1:]
    name_and_speech = []
    for chunk in chunks:
        name_match = re.match(r'[A-Z]*\. ', chunk)
        if name_match is None:
            # Chunk does not begin with an upper-case surname; skip it.
            continue
        name_and_speech.append(name_match.group(0)[:-2])
        # Strip only the leading "SURNAME. " prefix. The previous global
        # substitution also deleted every capitalised word followed by ". "
        # inside the speech itself.
        name_and_speech.append(chunk[name_match.end():])
    return name_and_speech

In [156]:
def sep_date_from_file(file):
    """Extract the first ``YYYY-MM-DD`` date in a string as ``[year, month, day]`` strings.

    Raises IndexError when the string contains no such date.
    """
    iso_dates = re.findall('[0-9]{4}-[0-9]{2}-[0-9]{2}', file)
    year_month_day = iso_dates[0].split('-')
    return year_month_day

In [157]:
def find_title(file_name):
    """Return the first title-like line of a text file, stripped of whitespace.

    A title is the first regex match that consists of capital letters, spaces,
    apostrophes and hyphens, optionally followed by capitals, digits and
    ``- , .`` characters, and ends at a newline.

    Args:
        file_name: Path of the text file to read.

    Returns:
        The first match with surrounding whitespace stripped.

    Raises:
        IndexError: If the file contains no matching line.
    """
    with open(file_name) as file:
        parse_file = file.read()
    parse_file = parse_file.replace('Mr. President', 'MrPresident')
    # Raw string: the non-raw literal contained the invalid escape "\.", which
    # triggers a SyntaxWarning on recent Python versions. The pattern itself
    # is unchanged.
    # NOTE(review): "[Continued]*" is a character class (any of those letters),
    # not the literal word "Continued" - left as-is to keep matches identical.
    title = re.findall(r"[A-Z '-]+[A-Z0-9-,\. ]*[Continued]*\n", parse_file)
    return title[0].strip()

In [158]:
def fix_surname_typos(name):
    """Map a known misspelled surname to its corrected form.

    Names without a listed correction are returned unchanged.
    """
    corrections = {
        'SOUZZI': 'SUOZZI',
        'VANHOLLEN': 'VAN HOLLEN',
        'FISHCER': 'FISCHER',
    }
    return corrections.get(name, name)

In [159]:
#Collect Speaker-Speech Pairs 
speech_count = 0
list_of_files = os.listdir("/Users/halliday/projects/searchlight/parsing/testparse")
# list_of_files.remove('.DS_Store')
for file in list_of_files:
    if file.endswith(".txt"):
        print(file)
        if file == 'CREC-2018-03-22-pt1-PgH1769-2.txt':
            continue
        if file == 'CREC-2017-09-06-pt1-PgH6695.txt':
            continue
        mods_file = file.replace('.txt', '.xml')
        separated = sep_speech("/Users/halliday/projects/searchlight/parsing/testparse" + "/" + file)
        i = 0
        while i < len(separated):
            separated_surname = fix_surname_typos(separated[i])
            text = separated[i+1]
            text = text.replace('MrPresident', 'Mr. President')
            if len(text) > 30:
                row = {"speech_id": speech_count,
                       "last_name": separated_surname,
                       "speaker_id": 99999999999999,
                       "proceeding_id": "proceeding_id", 
                       "topic_id": "topic_id",
                       "word_count": len(text.split()), 
                       "speech_text": text,
                       "file_name": file,
                       "mods_file": mods_file}
                speech_count += 1
                speeches = speeches.append(row, ignore_index=True)     
            i += 2
        print('finished with file ', speech_count)

CREC-2017-04-06-pt1-PgH2768-5.txt
finished with file  1
CREC-2017-04-06-pt1-PgH2792-20.txt
finished with file  1
CREC-2017-04-06-pt1-PgH2790-19.txt
finished with file  1
CREC-2017-04-06-pt1-PgH2790-25.txt
finished with file  1
CREC-2017-04-06-pt1-PgH2793-3.txt
finished with file  1
CREC-2017-04-06-pt1-PgH2790-31.txt
finished with file  1
CREC-2017-04-06-pt1-PgH2757-7.txt
finished with file  2
CREC-2017-04-06-pt1-PgH2789-10.txt
finished with file  2
CREC-2017-04-06-pt1-PgH2789-11.txt
finished with file  2
CREC-2017-04-06-pt1-PgH2757-6.txt
finished with file  3
CREC-2017-04-06-pt1-PgH2790-30.txt
finished with file  3
CREC-2017-04-06-pt1-PgH2790-24.txt
finished with file  3
CREC-2017-04-06-pt1-PgH2793-2.txt
finished with file  3
CREC-2017-04-06-pt1-PgH2790-18.txt
finished with file  3
CREC-2017-04-06-pt1-PgH2792-21.txt
finished with file  3
CREC-2017-04-06-pt1-PgH2768-4.txt
finished with file  4
CREC-2017-04-06-pt1-PgH2768-6.txt
finished with file  5
CREC-2017-04-06-pt1-PgH2789.txt
finish

In [160]:
# Preview the first collected speeches.
speeches.head()

Unnamed: 0,speech_id,last_name,speaker_id,proceeding_id,topic_id,word_count,speech_text,file_name,mods_file
0,0,BEYER,99999999999999,proceeding_id,topic_id,213,"Mr.Speaker, I rise today to express my constit...",CREC-2017-04-06-pt1-PgH2768-5.txt,CREC-2017-04-06-pt1-PgH2768-5.xml
1,1,RUIZ,99999999999999,proceeding_id,topic_id,200,"Mr.Speaker, Air Force Chief Master Sergeant Sh...",CREC-2017-04-06-pt1-PgH2757-7.txt,CREC-2017-04-06-pt1-PgH2757-7.xml
2,2,STEFANIK,99999999999999,proceeding_id,topic_id,174,"Mr.Speaker, I rise today to praise a school in...",CREC-2017-04-06-pt1-PgH2757-6.txt,CREC-2017-04-06-pt1-PgH2757-6.xml
3,3,MESSER,99999999999999,proceeding_id,topic_id,165,"Mr.Speaker, Congress is leaving for Easter bre...",CREC-2017-04-06-pt1-PgH2768-4.txt,CREC-2017-04-06-pt1-PgH2768-4.xml
4,4,HILL,99999999999999,proceeding_id,topic_id,130,"Mr.Speaker, today I rise to recognize the Quap...",CREC-2017-04-06-pt1-PgH2768-6.txt,CREC-2017-04-06-pt1-PgH2768-6.xml


# Local Mods Parsing

In [161]:
def get_cong_member_tag_from_mods(last_name, mods_file_path):
    """Find the first ``congmember`` element in a file whose surname matches.

    Args:
        last_name: Surname to look for, compared against ``get_last_name``
            of each ``congmember`` element.
        mods_file_path: Path of the file to parse.

    Returns:
        The first matching element, or None when no element matches.
    """
    # The context manager closes the handle as soon as the text is read;
    # the previous bare open(...).read() left that to the garbage collector.
    with open(mods_file_path) as handle:
        soup = Soup(handle.read(), "lxml")
    for cong_member_tag in soup.find_all("congmember"):
        if get_last_name(cong_member_tag) == last_name:
            return cong_member_tag
    return None

In [162]:
def get_cong_member_info(last_name, mods_file_path):
    """Build a speakers-table row for a surname from a mods file.

    Args:
        last_name: Surname to look up; always copied into the result.
        mods_file_path: Path of the file searched for a matching
            ``congmember`` element.

    Returns:
        A dict with the speakers-table keys. Every field except ``last_name``
        is ``'N/A'`` when no matching element is found; ``district`` is
        always ``'N/A'``.
    """
    matched_cong_member_tag = get_cong_member_tag_from_mods(last_name, mods_file_path)
    # Identity check: `== None` would be delegated to the tag's own __eq__.
    if matched_cong_member_tag is None:
        return {'speaker_id': 'N/A',
                'first_name': 'N/A',
                'last_name': last_name,
                'type': 'N/A',
                'party': 'N/A',
                'state': 'N/A',
                'district': 'N/A',
                'bio_guide_id': 'N/A',
                'congress_id': 'N/A'}
    return {'speaker_id': get_authority_id(matched_cong_member_tag),
            'first_name': get_first_name(matched_cong_member_tag),
            'last_name': last_name,
            'type': get_type(matched_cong_member_tag),
            'party': get_party(matched_cong_member_tag),
            'state': get_state(matched_cong_member_tag),
            # The district is not read from the congmember element here.
            'district': 'N/A',
            'bio_guide_id': get_bioguide_id(matched_cong_member_tag),
            'congress_id': get_congress_id(matched_cong_member_tag)}

In [163]:
def get_authority_id_from_mods(last_name, mods_file_path):
    """Return the integer authority ID recorded for a surname in a mods file.

    Args:
        last_name: Surname to look up.
        mods_file_path: Path of the file searched for a matching
            ``congmember`` element.

    Returns:
        The authority ID as an int, or the sentinel ``99999999999999`` when
        no element matches or its ID is missing or not numeric.
    """
    unknown_speaker_id = 99999999999999
    matched_cong_member_tag = get_cong_member_tag_from_mods(last_name, mods_file_path)
    if matched_cong_member_tag is None:
        return unknown_speaker_id
    try:
        # The attribute value is text; return an int so every branch yields
        # the same type and callers can compare it numerically.
        return int(get_authority_id(matched_cong_member_tag))
    except (TypeError, ValueError):
        # Covers the 'N/A' placeholder, empty strings and other non-numeric IDs.
        return unknown_speaker_id

In [167]:
def _register_new_speaker(last_name, mods_file_path, speakers):
    """Add a speaker looked up from the mods file to `speakers` in place, save the table, and return the new ID."""
    new_speaker = get_cong_member_info(last_name, mods_file_path)
    # Insert in place so the caller's frame sees the new row. Rebinding a
    # local copy lost the addition as soon as the function returned.
    # Assumes an integer index (as produced by read_csv) - TODO confirm.
    next_label = 0 if speakers.empty else speakers.index.max() + 1
    speakers.loc[next_label] = pd.Series(new_speaker)
    # Persist a sorted copy; the in-memory row order is left untouched.
    speakers.sort_values('last_name').to_csv('updatedspeakers.csv', index=False)
    print("wrote in new speaker")
    print(new_speaker)
    return new_speaker['speaker_id']


def get_speaker_id(last_name, mods_file_path, speakers):
    """Resolve the speaker ID for a surname, registering unknown speakers.

    Args:
        last_name: Surname to resolve.
        mods_file_path: Path of the mods file consulted when the surname is
            unknown or ambiguous.
        speakers: DataFrame with at least ``last_name`` and ``speaker_id``
            columns. It is updated in place when a new speaker is added.

    Returns:
        * The stored ``speaker_id`` when exactly one row has this surname.
        * The ``speaker_id`` from the mods file when the surname is unknown
          (the speaker is added and 'updatedspeakers.csv' is rewritten).
        * When several rows share the surname: the row whose ID equals the
          mods file's authority ID, a newly added speaker if none does, or
          the sentinel ``99999999999999`` if the mods file yields no usable ID.
    """
    unknown_speaker_id = 99999999999999
    # IDs above this value are treated as "not found".
    max_valid_speaker_id = 100000

    possible_speakers = speakers[speakers['last_name'] == last_name]
    match_count = possible_speakers.shape[0]
    if match_count == 0:
        return _register_new_speaker(last_name, mods_file_path, speakers)
    if match_count == 1:
        print('used existing row')
        return possible_speakers.iloc[0]['speaker_id']

    # Several speakers share the surname: disambiguate via the mods file.
    try:
        # The ID may arrive as text; comparing a str with an int raises
        # TypeError, so coerce it first.
        mods_speaker_id = int(get_authority_id_from_mods(last_name, mods_file_path))
    except (TypeError, ValueError):
        return unknown_speaker_id
    if mods_speaker_id > max_valid_speaker_id:
        return unknown_speaker_id
    # The stored IDs may be int, float or text depending on the CSV contents.
    candidate_ids = pd.to_numeric(possible_speakers['speaker_id'], errors='coerce')
    matched_speaker = possible_speakers[candidate_ids == mods_speaker_id]
    if matched_speaker.shape[0] >= 1:
        # Duplicate rows for the same ID still identify a single speaker;
        # previously that case fell through and returned None.
        return matched_speaker.iloc[0]['speaker_id']
    return _register_new_speaker(last_name, mods_file_path, speakers)

In [168]:
#Populate Speaker_Id Column
speakersDir = Path('/Users/halliday/projects/searchlight/parsing')
speakers = pd.read_csv(speakersDir/'updatedspeakers.csv')
for i in range(speeches.shape[0]):
    curr_row = speeches.iloc[i]
    last_name = curr_row['last_name']
    mods_file_path = "/Users/halliday/projects/searchlight/parsing/testparse" + "/" + curr_row['mods_file']
    speaker_id = get_speaker_id(last_name, mods_file_path, speakers)
    print(speaker_id)
    break

used existing row
2272


In [166]:
# Check the ID printed above against the master speakers table.
# NOTE(review): this rebinds `speakers` to 'masterspeakers.csv', and the cell's
# execution count (166) is lower than the preceding cells' - it was run out of
# order; confirm the intended position.
speakersDir = Path('/Users/halliday/projects/searchlight/parsing')
speakers = pd.read_csv(speakersDir/'masterspeakers.csv')
# Show the row(s) whose speaker_id is 2272.
speakers[speakers['speaker_id'] == 2272]

Unnamed: 0,speaker_id,first_name,last_name,type,party,state,district,bio_guide_id,congress_id
72,2272,Donald,BEYER,REPRESENTATIVE,D,VA,8.0,B001292,114
