# Initialization

In [126]:
import pandas as pd
import re
import numpy as np
import urllib
from bs4 import BeautifulSoup as Soup
import os
from pathlib import Path

In [363]:
speakers = pd.DataFrame(columns=["speaker_id", 
                                "first_name", 
                                "last_name",
                                "chamber",
                                "type",
                                "party", 
                                "state", 
                                "district",
                                "bio_guide_id",
                                "congress_id"])

In [364]:
speakers

Unnamed: 0,speaker_id,first_name,last_name,chamber,type,party,state,district,bio_guide_id,congress_id


In [365]:
speeches = pd.DataFrame(columns=["speech_id",
                                 "last_name",
                                 "speaker_id",
                                 "proceeding_id", 
                                 "topic_id", 
                                 "word_count", 
                                 "speech_text",
                                 "file_name",
                                 "mods_file"])

In [366]:
speeches

Unnamed: 0,speech_id,last_name,speaker_id,proceeding_id,topic_id,word_count,speech_text,file_name,mods_file


# Master Mods Parsing

In [295]:
def get_all_extensions(file):
    handler = open(file).read()
    soup = Soup(handler, "lxml")
    return soup.find_all("extension")

In [296]:
def get_cong_member_tag(cong_member_extension):
    cong_member_tag = cong_member_extension.find("congmember")
    return cong_member_tag

In [297]:
def get_party(cong_member_tag):
    try:
        return cong_member_tag.attrs['party']
    except:
        return 'N/A'
def get_type(cong_member_tag):
    try:
        return cong_member_tag.attrs['type']
    except:
        return 'N/A'
def get_authority_id(cong_member_tag):
    try:
        return cong_member_tag.attrs['authorityid']
    except:
        print("auth id not found")
        return None
def get_bioguide_id(cong_member_tag):
    try:
        return cong_member_tag.attrs['bioguideid']
    except:
        return 'N/A'
def get_state(cong_member_tag):
    try:
        return cong_member_tag.attrs['state']
    except:
        return 'N/A'
def get_congress_id(cong_member_tag):
    try:
        return cong_member_tag.attrs['congress']
    except:
        return 'N/A'
def get_chamber(cong_member_tag):
    chambers = {'S': 'SENATE', 'H': 'HOUSE'}
    try:
        letter = cong_member_tag.attrs['chamber']
        return chambers[letter]
    except:
        return 'N/A'

In [298]:
def get_district(cong_member_extension):
    district_tag = cong_member_extension.find("district")
    if district_tag == None:
        return None
    return district_tag.string

In [299]:
def get_first_name(cong_member_tag):
    name_tag = cong_member_tag.select("name[type='authority-fnf']")[0]
    if name_tag == []:
        print("no first_name")
        print(cong_member_tag)
        return 'N/A'
    first_name = name_tag.text.split()[0]
    return first_name

In [300]:
def get_last_name(cong_member_tag):
    name_tag = cong_member_tag.select("name[type='authority-lnf']")[0]
    if name_tag == []:
        print("no first_name")
        print(cong_member_tag)
        return 'N/A'
    full_name = name_tag.string
    return re.match("[^,]*", full_name).group(0).upper()

In [301]:
filenames = os.listdir("/Users/halliday/projects/searchlight/parsing/mastermods")
filenames.remove('.DS_Store')
for filename in filenames:
    extensions = get_all_extensions("/Users/halliday/projects/searchlight/parsing/mastermods/" + filename)
    print("            " + filename)
    for extension in extensions:
        cong_member_tag = get_cong_member_tag(extension)
        if cong_member_tag == None:
            continue
#         if filename == 'mods112.xml':
#             print(cong_member_tag)
        authority_id = get_authority_id(cong_member_tag)
        if speakers[speakers['speaker_id'] == authority_id]['speaker_id'].count() > 0 or authority_id == "" or authority_id == None:
            continue
        cong_member_dict = {'speaker_id': authority_id, 
                            'first_name': get_first_name(cong_member_tag),
                            'last_name': get_last_name(cong_member_tag),
                            'chamber': get_chamber(cong_member_tag),
                            'type': get_type(cong_member_tag),
                            'party': get_party(cong_member_tag),
                            'state': get_state(cong_member_tag),
                            'district': get_district(extension),
                            'bio_guide_id': get_bioguide_id(cong_member_tag),
                            'congress_id': get_congress_id(cong_member_tag)}
        speakers = speakers.append(cong_member_dict, ignore_index=True)

            mods113.xml
            mods107.xml
            mods106.xml
            mods112.xml
auth id not found
            mods110.xml
            mods111.xml
            mods105.xml
            mods114.xml
            mods108.xml
            mods109.xml


In [302]:
speakers.head()

Unnamed: 0,speaker_id,first_name,last_name,chamber,type,party,state,district,bio_guide_id,congress_id
0,1049,Richard,SHELBY,SENATE,SENATOR,R,AL,,S000320,113
1,1548,Jeff,SESSIONS,SENATE,SENATOR,R,AL,,S001141,113
2,2197,Bradley,BYRNE,HOUSE,REPRESENTATIVE,R,AL,1.0,B001289,113
3,1986,Martha,ROBY,HOUSE,REPRESENTATIVE,R,AL,2.0,R000591,113
4,1704,Mike,ROGERS,HOUSE,REPRESENTATIVE,R,AL,3.0,R000575,113


In [319]:
speakers[speakers['congress_id'] == 'N/A']

Unnamed: 0,speaker_id,first_name,last_name,chamber,type,party,state,district,bio_guide_id,congress_id


In [320]:
speakers = speakers.sort_values('last_name')

In [321]:
speakers.to_csv('masterspeakers.csv', index=False)

# Speeches Parsing

In [367]:
speakersDir = Path('/Users/halliday/projects/searchlight/parsing')
speakers = pd.read_csv(speakersDir/'updatedspeakers.csv')
speakers.head()

Unnamed: 0,speaker_id,first_name,last_name,chamber,type,party,state,district,bio_guide_id,congress_id
0,2,Neil,ABERCROMBIE,HOUSE,REPRESENTATIVE,D,HI,1.0,A000014,107
1,1269,Spencer,ABRAHAM,SENATE,SENATOR,R,MI,,A000355,106
2,2244,Ralph,ABRAHAM,HOUSE,REPRESENTATIVE,R,LA,5.0,A000374,114
3,4,Gary,ACKERMAN,HOUSE,REPRESENTATIVE,D,NY,5.0,A000022,107
4,2006,Sandy,ADAMS,HOUSE,REPRESENTATIVE,R,FL,24.0,A000366,112


In [368]:
def remove_space(regex):
    return regex.group().replace(' ', '')

In [369]:
def sep_speech(filepath):
    parse_file = ''
    with open(filepath) as file:
        for line in file:
            parse_file += line
    parse_file = parse_file.replace('\n', '')
    parse_file = re.sub('Mr. [A-Z][a-z]', remove_space, parse_file)
    
    split = re.split(r'Mr. |Ms. |Mrs. ', parse_file)
    split.pop(0)
    name_and_speech = []
    for i in np.arange(len(split)):
        try:
            lastname = re.match('[A-Z]*\. ', split[i]).group(0)[:-2]
            name_and_speech += [lastname]
            value = re.sub('[A-Z]\w*\. ', '', split[i])
            name_and_speech += [value]
        except:
            continue
    return name_and_speech

In [370]:
def sep_date_from_file(file):
    abcdef = re.findall('[0-9]{4}-[0-9]{2}-[0-9]{2}', file)
    return re.split('-', abcdef[0])

In [492]:
def find_title(file_path):
    
    #missing regex filter to ignore {time}
    
    parse_file = ''
    with open(file_path) as file:
        for line in file:
            parse_file += line
    parse_file = parse_file.replace('Mr. President', 'MrPresident')
    title = re.findall('[A-Z \'-]+[A-Z0-9-,\. ]*[Continued]*\\n', parse_file)
    return title[0].strip()

In [372]:
def fix_surname_typos(name):
    if name == 'SOUZZI':
        return 'SUOZZI'
    if name == 'VANHOLLEN':
        return 'VAN HOLLEN'
    if name == 'FISHCER':
        return 'FISCHER'
    return name

In [373]:
#Collect Speaker-Speech Pairs 
def collect_pairs():
    speech_count = 0
    list_of_files = os.listdir("/Users/halliday/projects/searchlight/parsing/testparse")
    # list_of_files.remove('.DS_Store')
    for file in list_of_files:
        if file.endswith(".txt"):
            print(file)
            if file == 'CREC-2018-03-22-pt1-PgH1769-2.txt':
                continue
            if file == 'CREC-2017-09-06-pt1-PgH6695.txt':
                continue
            mods_file = file.replace('.txt', '.xml')
            separated = sep_speech("/Users/halliday/projects/searchlight/parsing/testparse" + "/" + file)
            i = 0
            while i < len(separated):
                separated_surname = fix_surname_typos(separated[i])
                text = separated[i+1]
                text = text.replace('MrPresident', 'Mr. President')
                if len(text) > 30:
                    row = {"speech_id": speech_count,
                           "last_name": separated_surname,
                           "speaker_id": 99999999999999,
                           "proceeding_id": "proceeding_id", 
                           "topic_id": "topic_id",
                           "word_count": len(text.split()), 
                           "speech_text": text,
                           "file_name": file,
                           "mods_file": mods_file}
                    speech_count += 1
                    speeches = speeches.append(row, ignore_index=True)     
                i += 2
            print('finished with file ', speech_count)

CREC-2017-04-06-pt1-PgH2768-5.txt
finished with file  1
CREC-2017-04-06-pt1-PgH2792-20.txt
finished with file  1
CREC-2017-04-06-pt1-PgH2790-19.txt
finished with file  1
CREC-2017-04-06-pt1-PgH2790-25.txt
finished with file  1
CREC-2017-04-06-pt1-PgH2793-3.txt
finished with file  1
CREC-2017-04-06-pt1-PgH2790-31.txt
finished with file  1
CREC-2017-04-06-pt1-PgH2757-7.txt
finished with file  2
CREC-2017-04-06-pt1-PgH2789-10.txt
finished with file  2
CREC-2017-04-06-pt1-PgH2789-11.txt
finished with file  2
CREC-2017-04-06-pt1-PgH2757-6.txt
finished with file  3
CREC-2017-04-06-pt1-PgH2790-30.txt
finished with file  3
CREC-2017-04-06-pt1-PgH2790-24.txt
finished with file  3
CREC-2017-04-06-pt1-PgH2793-2.txt
finished with file  3
CREC-2017-04-06-pt1-PgH2790-18.txt
finished with file  3
CREC-2017-04-06-pt1-PgH2792-21.txt
finished with file  3
CREC-2017-04-06-pt1-PgH2768-4.txt
finished with file  4
CREC-2017-04-06-pt1-PgH2768-6.txt
finished with file  5
CREC-2017-04-06-pt1-PgH2789.txt
finish

In [374]:
speeches.head()

Unnamed: 0,speech_id,last_name,speaker_id,proceeding_id,topic_id,word_count,speech_text,file_name,mods_file
0,0,BEYER,99999999999999,proceeding_id,topic_id,213,"Mr.Speaker, I rise today to express my constit...",CREC-2017-04-06-pt1-PgH2768-5.txt,CREC-2017-04-06-pt1-PgH2768-5.xml
1,1,RUIZ,99999999999999,proceeding_id,topic_id,200,"Mr.Speaker, Air Force Chief Master Sergeant Sh...",CREC-2017-04-06-pt1-PgH2757-7.txt,CREC-2017-04-06-pt1-PgH2757-7.xml
2,2,STEFANIK,99999999999999,proceeding_id,topic_id,174,"Mr.Speaker, I rise today to praise a school in...",CREC-2017-04-06-pt1-PgH2757-6.txt,CREC-2017-04-06-pt1-PgH2757-6.xml
3,3,MESSER,99999999999999,proceeding_id,topic_id,165,"Mr.Speaker, Congress is leaving for Easter bre...",CREC-2017-04-06-pt1-PgH2768-4.txt,CREC-2017-04-06-pt1-PgH2768-4.xml
4,4,HILL,99999999999999,proceeding_id,topic_id,130,"Mr.Speaker, today I rise to recognize the Quap...",CREC-2017-04-06-pt1-PgH2768-6.txt,CREC-2017-04-06-pt1-PgH2768-6.xml


# Local Mods Parsing

In [425]:
def get_cong_member_tag_from_mods(last_name, mods_file_path):
    handler = open(mods_file_path).read()
    soup = Soup(handler, "lxml")
    cong_member_tags = soup.find_all("congmember")
    matched_cong_member_tag = None
    for i in range(len(cong_member_tags)):
        curr_last_name = get_last_name(cong_member_tags[i])
        if curr_last_name == last_name:
            matched_cong_member_tag = cong_member_tags[i]
            break
    return matched_cong_member_tag

In [426]:
def get_cong_member_info(last_name, mods_file_path):
    matched_cong_member_tag = get_cong_member_tag_from_mods(last_name, mods_file_path)
    if matched_cong_member_tag == None:
        return {'speaker_id': None, 
                'first_name': 'N/A',
                'last_name': last_name,
                'type': 'N/A',
                'chamber': 'N/A',
                'party': 'N/A',
                'state': 'N/A',
                'district': None,
                'bio_guide_id': 'N/A',
                'congress_id': 'N/A'}
    else:
        return {'speaker_id': get_authority_id(matched_cong_member_tag), 
                'first_name': get_first_name(matched_cong_member_tag),
                'last_name': last_name,
                'chamber': get_chamber(matched_cong_member_tag),
                'type': get_type(matched_cong_member_tag),
                'party': get_party(matched_cong_member_tag),
                'state': get_state(matched_cong_member_tag),
                'district': None,
                'bio_guide_id': get_bioguide_id(matched_cong_member_tag),
                'congress_id': get_congress_id(matched_cong_member_tag)}

In [427]:
def get_authority_id_from_mods(last_name, mods_file_path):
    matched_cong_member_tag = get_cong_member_tag_from_mods(last_name, mods_file_path)
    if matched_cong_member_tag == None:
        return 99999999999999
    authority_id = get_authority_id(matched_cong_member_tag)
    if authority_id == None:
        return 99999999999999
    else:
        return authority_id

In [502]:
def populate_speeches(count, folder):
    
    speakersDir = Path('/Users/halliday/projects/searchlight/parsing')
    speakers = pd.read_csv(speakersDir/'updatedspeakers.csv')
    
    speeches = pd.DataFrame(columns=["speech_id",
                                 "last_name",
                                 "speaker_id",
                                 "proceeding_id", 
                                 "topic_id", 
                                 "word_count", 
                                 "speech_text",
                                 "file_name",
                                 "mods_file"])

    def collect_pairs(folder):
        nonlocal speeches
        speech_count = 0
        list_of_files = os.listdir("/Users/halliday/projects/searchlight/parsing/" + folder)
        # list_of_files.remove('.DS_Store')
        for file in list_of_files:
            if file.endswith(".txt"):
                print(file)
                if file == 'CREC-2018-03-22-pt1-PgH1769-2.txt':
                    continue
                if file == 'CREC-2017-09-06-pt1-PgH6695.txt':
                    continue
                mods_file = file.replace('.txt', '.xml')
                separated = sep_speech("/Users/halliday/projects/searchlight/parsing/testparse" + "/" + file)
                i = 0
                while i < len(separated):
                    separated_surname = fix_surname_typos(separated[i])
                    text = separated[i+1]
                    text = text.replace('MrPresident', 'Mr. President')
                    if len(text) > 30:
                        row = {"speech_id": speech_count,
                               "last_name": separated_surname,
                               "speaker_id": 99999999999999,
                               "proceeding_id": "proceeding_id", 
                               "topic_id": "topic_id",
                               "word_count": len(text.split()), 
                               "speech_text": text,
                               "file_name": file,
                               "mods_file": mods_file}
                        speech_count += 1
                        speeches = speeches.append(row, ignore_index=True)     
                    i += 2
                print('finished with file ', speech_count)
    
    collect_pairs(folder)
        
    def get_speaker_id(last_name, mods_file_path):
        nonlocal speakers
        possible_speakers = speakers[speakers['last_name'] == last_name]
        if possible_speakers.shape[0] == 0:
            new_speaker = get_cong_member_info(last_name, mods_file_path)
            speakers = speakers.append(new_speaker, ignore_index=True)
            print(speakers.shape[0])
            speakers = speakers.sort_values('last_name')
            speakers.to_csv('updatedspeakers.csv', index=False)
            print("wrote in new speaker")
            print(new_speaker)
            return new_speaker['speaker_id']
        elif possible_speakers.shape[0] == 1:
            print('used existing row')
            return possible_speakers.iloc[0]['speaker_id']
        else:
            mods_speaker_id = get_authority_id_from_mods(last_name, mods_file_path)
            if int(mods_speaker_id) > 100000:
                print("speaker not found in mods: " + last_name)
                return 99999999999999
            matched_speaker = possible_speakers[possible_speakers['speaker_id'] == int(mods_speaker_id)]
            if matched_speaker.shape[0] == 1:
                print("speaker matched successfully, used existing row")
                return matched_speaker.iloc[0]['speaker_id']
            elif matched_speaker.shape[0] == 0:
                new_speaker = get_cong_member_info(last_name, mods_file_path)
                speakers = speakers.append(new_speaker, ignore_index=True)
                print("wrote in new speaker")
                print(new_speaker)
                speakers = speakers.sort_values('last_name')
                speakers.to_csv('updatedspeakers.csv', index=False)
                return new_speaker['speaker_id']
    
    #initialize new columns
    speeches['proceeding_title'], speeches['year'], speeches['month'], speeches['day'] = "", 0, 0, 0
    
    #collect speaker_ids, titles, dates, and bills
    for i in range(speeches.shape[0]):
        
        curr_row = speeches.iloc[i]
        last_name = curr_row['last_name']
        mods_file_path = "/Users/halliday/projects/searchlight/parsing/testparse" + "/" + curr_row['mods_file']
        text_file_path = "/Users/halliday/projects/searchlight/parsing/testparse" + "/" + curr_row['file_name']
        
        #collect speaker_ids, titles, dates, and bills
        speaker_id = get_speaker_id(last_name, mods_file_path)
        title = find_title(text_file_path)
                    #fix find title
        year, month, day = sep_date_from_file(curr_row['file_name'])
        
        speeches.loc[i, "speaker_id"] = speaker_id
        speeches.loc[i, "proceeding_title"] = title
        speeches.loc[i, "year"] = int(year)
        speeches.loc[i, "month"] = int(month)
        speeches.loc[i, "day"] = int(day)
        
    speeches.to_csv('speeches_' + str(count) + ".csv", index=False)
    print("                " + "saved " + "speeches_" + str(count))
    
    

In [503]:
count = 1
populate_speeches(count, "testparse")

CREC-2017-04-06-pt1-PgH2768-5.txt
finished with file  1
CREC-2017-04-06-pt1-PgH2792-20.txt
finished with file  1
CREC-2017-04-06-pt1-PgH2790-19.txt
finished with file  1
CREC-2017-04-06-pt1-PgH2790-25.txt
finished with file  1
CREC-2017-04-06-pt1-PgH2793-3.txt
finished with file  1
CREC-2017-04-06-pt1-PgH2790-31.txt
finished with file  1
CREC-2017-04-06-pt1-PgH2757-7.txt
finished with file  2
CREC-2017-04-06-pt1-PgH2789-10.txt
finished with file  2
CREC-2017-04-06-pt1-PgH2789-11.txt
finished with file  2
CREC-2017-04-06-pt1-PgH2757-6.txt
finished with file  3
CREC-2017-04-06-pt1-PgH2790-30.txt
finished with file  3
CREC-2017-04-06-pt1-PgH2790-24.txt
finished with file  3
CREC-2017-04-06-pt1-PgH2793-2.txt
finished with file  3
CREC-2017-04-06-pt1-PgH2790-18.txt
finished with file  3
CREC-2017-04-06-pt1-PgH2792-21.txt
finished with file  3
CREC-2017-04-06-pt1-PgH2768-4.txt
finished with file  4
CREC-2017-04-06-pt1-PgH2768-6.txt
finished with file  5
CREC-2017-04-06-pt1-PgH2789.txt
finish

CREC-2017-04-07-pt1-PgS2435-4.txt
finished with file  69
CREC-2017-04-06-pt1-PgH2779.txt
finished with file  70
CREC-2017-04-06-pt1-PgH2792.txt
finished with file  70
CREC-2017-04-06-pt1-PgH2792-10.txt
finished with file  70
CREC-2017-04-06-pt1-PgH2790-15.txt
finished with file  70
CREC-2017-04-06-pt1-PgH2790-29.txt
finished with file  70
CREC-2017-04-06-pt1-PgH2782-7.txt
finished with file  70
CREC-2017-04-06-pt1-PgH2782-6.txt
finished with file  70
CREC-2017-04-06-pt1-PgH2790-28.txt
finished with file  70
CREC-2017-04-06-pt1-PgH2790-14.txt
finished with file  70
CREC-2017-04-06-pt1-PgH2792-11.txt
finished with file  70
CREC-2017-04-06-pt1-PgH2793.txt
finished with file  70
CREC-2017-04-06-pt1-PgH2778.txt
finished with file  72
CREC-2017-04-07-pt1-PgS2435-5.txt
finished with file  72
used existing row
used existing row
used existing row
used existing row
speaker matched successfully, used existing row
speaker matched successfully, used existing row
used existing row
used existing row


In [491]:
speeches

Unnamed: 0,speech_id,last_name,speaker_id,proceeding_id,topic_id,word_count,speech_text,file_name,mods_file,proceeding_title,year,month,day
0,0,BEYER,2272,proceeding_id,topic_id,213,"Mr.Speaker, I rise today to express my constit...",CREC-2017-04-06-pt1-PgH2768-5.txt,CREC-2017-04-06-pt1-PgH2768-5.xml,1045,2017,4,6
1,1,RUIZ,2109,proceeding_id,topic_id,200,"Mr.Speaker, Air Force Chief Master Sergeant Sh...",CREC-2017-04-06-pt1-PgH2757-7.txt,CREC-2017-04-06-pt1-PgH2757-7.xml,HONORING THE MEMORY OF CHIEF MASTER SERGEANT S...,2017,4,6
2,2,STEFANIK,2263,proceeding_id,topic_id,174,"Mr.Speaker, I rise today to praise a school in...",CREC-2017-04-06-pt1-PgH2757-6.txt,CREC-2017-04-06-pt1-PgH2757-6.xml,HONORING PAUL SMITH'S COLLEGE FOR THEIR COMMIT...,2017,4,6
3,3,MESSER,2130,proceeding_id,topic_id,165,"Mr.Speaker, Congress is leaving for Easter bre...",CREC-2017-04-06-pt1-PgH2768-4.txt,CREC-2017-04-06-pt1-PgH2768-4.xml,OUR WORK ON HEALTH CARE IS UNDONE,2017,4,6
4,4,HILL,2223,proceeding_id,topic_id,130,"Mr.Speaker, today I rise to recognize the Quap...",CREC-2017-04-06-pt1-PgH2768-6.txt,CREC-2017-04-06-pt1-PgH2768-6.xml,QUAPAW AREA COUNCIL GOLD LEVEL,2017,4,6
5,5,SESSIONS,1525,proceeding_id,topic_id,78,"Mr.Speaker, pursuant to the order of the House...",CREC-2017-04-06-pt1-PgH2782-9.txt,CREC-2017-04-06-pt1-PgH2782-9.xml,ADJOURNMENT,2017,4,6
6,6,RUSSELL,2265,proceeding_id,topic_id,2090,"Mr.Speaker, today, exactly 100 years ago, on t...",CREC-2017-04-06-pt1-PgH2777.txt,CREC-2017-04-06-pt1-PgH2777.xml,REMEMBERING WORLD WAR I,2017,4,6
7,7,GARAMENDI,1973,proceeding_id,topic_id,84,"Mr.Speaker, I remember well the investigations...",CREC-2017-04-06-pt1-PgH2768-7.txt,CREC-2017-04-06-pt1-PgH2768-7.xml,INVESTIGATE RUSSIA'S INVOLVEMENT IN OUR NATION...,2017,4,6
8,8,PANETTA,2309,proceeding_id,topic_id,182,"Mr.Speaker, in the early morning hours on a da...",CREC-2017-04-06-pt1-PgH2768-3.txt,CREC-2017-04-06-pt1-PgH2768-3.xml,ATROCIOUS CHEMICAL WEAPONS ATTACK,2017,4,6
9,9,GARRETT,1737,proceeding_id,topic_id,130,"Mr.Speaker, I ask that you join me today as we...",CREC-2017-04-06-pt1-PgH2768-2.txt,CREC-2017-04-06-pt1-PgH2768-2.xml,MOMENT OF SILENCE IN HONOR OF JAIDEN BARTEE AN...,2017,4,6


In [437]:
speakers = pd.read_csv(speakersDir/'updatedspeakers.csv')
speakers[speakers['last_name'] == 'KIHUEN']

Unnamed: 0,speaker_id,first_name,last_name,chamber,type,party,state,district,bio_guide_id,congress_id
584,2340,Ruben,KIHUEN,HOUSE,,D,NV,,K000390,115


In [438]:
speakers[speakers['last_name'] == 'BACON']

Unnamed: 0,speaker_id,first_name,last_name,chamber,type,party,state,district,bio_guide_id,congress_id
32,2337,Don,BACON,HOUSE,,R,NE,,B001298,115


In [439]:
speeches.dtypes

speech_id        object
last_name        object
speaker_id       object
proceeding_id    object
topic_id         object
word_count       object
speech_text      object
file_name        object
mods_file        object
dtype: object

In [440]:
speakers.dtypes

speaker_id        int64
first_name       object
last_name        object
chamber          object
type             object
party            object
state            object
district        float64
bio_guide_id     object
congress_id       int64
dtype: object

In [463]:
df = pd.DataFrame(columns=['a', 'b'])

In [466]:
df = df.append({'a': 0, 'b': 1}, ignore_index=True)

In [469]:
df['c'] = [0 for i in range(3)]

In [471]:
df['d'], df['e'] = 1, 2

In [475]:
df.loc[0,'a'] = 4

In [476]:
df

Unnamed: 0,a,b,c,d,e
0,4,1,0,1,2
1,0,1,0,1,2
2,0,1,0,1,2
