## Convert bills and legislators to indices and construct the embedding matrix

In [15]:
import numpy as np
import pickle as pkl
import pandas as pd

In [16]:
def read_glove_vecs(glove_file):
    """Load word vectors from a whitespace-separated GloVe text file.

    Each non-blank line is parsed as a word followed by its vector
    components. Blank or whitespace-only lines are skipped instead of
    raising ``IndexError``.

    Args:
        glove_file: Path to the vector file; it is read as UTF-8.

    Returns:
        dict mapping each word to a 1-D ``np.float64`` array. If a word
        appears on several lines, the last one wins.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    word_to_vec_map = {}
    with open(glove_file, 'r', encoding="utf8") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Nothing to parse (e.g. an empty line at the end of the file).
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    return word_to_vec_map

# Load the pre-trained GloVe vectors into a word -> vector dict
# (presumably 50-dimensional, going by the file name -- TODO confirm).
word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt')

In [20]:
def bill_to_index(data1, data2, data3, max_len=400):
    """Encode each unique bill's summary as a fixed-length array of word indices.

    The ``natural_id`` and ``clip_summary`` columns of the three frames are
    stacked and de-duplicated on ``natural_id`` (first occurrence kept). Each
    summary is iterated element by element; every new element gets the next
    index, starting at 1, so that 0 is left free for padding.

    Summaries longer than ``max_len`` are truncated to their first ``max_len``
    elements; elements past the cut-off are neither indexed nor counted.

    Args:
        data1, data2, data3: DataFrames with ``natural_id`` and
            ``clip_summary`` columns. ``clip_summary`` is presumably a list
            of word tokens -- if it is a plain string, each character is
            treated as a "word".
        max_len: Length of every encoded array.

    Returns:
        Tuple ``(bill2idx, word2idx, idx2word, word2count)``:
        ``bill2idx`` maps natural_id to a zero-padded float array of length
        ``max_len``; ``word2idx`` / ``idx2word`` map words to indices and
        back; ``word2count`` holds the number of encoded occurrences per word.
    """
    data = pd.concat(
        [frame[["natural_id", "clip_summary"]] for frame in (data1, data2, data3)]
    )
    data = data.drop_duplicates(subset="natural_id", keep="first")

    bill2idx = {}
    word2idx = {}
    word2count = {}
    idx2word = {}
    next_index = 0
    for natural_id, text in zip(data["natural_id"], data["clip_summary"]):
        # Initialising with zeros gives the trailing padding for free.
        text2idx = np.zeros(max_len)
        for position, word in enumerate(text):
            if position >= max_len:
                # Truncate rather than index past the end of text2idx.
                break
            if word in word2idx:
                word2count[word] += 1
            else:
                next_index += 1
                word2idx[word] = next_index
                idx2word[next_index] = word
                word2count[word] = 1
            text2idx[position] = word2idx[word]
        bill2idx[natural_id] = text2idx
    return bill2idx, word2idx, idx2word, word2count

In [25]:
# Load the three pre-processed datasets (file names suggest year ranges
# 2005-2012, 2013-2014 and 2015-2016).
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open("data/full_data_20052012_processed.pkl", "rb") as file1:
    data1 = pkl.load(file1)
with open("data/full_data_20132014_processed.pkl", "rb") as file2:
    data2 = pkl.load(file2)
with open("data/full_data_20152016_processed.pkl", "rb") as file3:
    data3 = pkl.load(file3)

# Where a column label occurs more than once, keep only its first occurrence.
data1 = data1.loc[:, ~data1.columns.duplicated()]
data2 = data2.loc[:, ~data2.columns.duplicated()]
data3 = data3.loc[:, ~data3.columns.duplicated()]

In [26]:
# Drop rows without a summary from the two later datasets.
data2 = data2[data2['summary'].notnull()]
data3 = data3[data3['summary'].notnull()]

# Map every legislator id seen in any dataset to a unique 0-based index.
leg_ids = pd.concat([data1["leg_id"], data2["leg_id"], data3["leg_id"]]).unique()
leg_to_index = {leg_id: position for position, leg_id in enumerate(leg_ids)}

# `assign` returns a new frame, so the column is never written onto a
# filtered slice (which would trigger SettingWithCopyWarning).
data1 = data1.assign(leg_index=data1["leg_id"].map(leg_to_index))
data2 = data2.assign(leg_index=data2["leg_id"].map(leg_to_index))
data3 = data3.assign(leg_index=data3["leg_id"].map(leg_to_index))

# Keep only those legislators in the later sets that are present in data1
# (data1 is treated as the training set, data2/data3 as test sets).
data2 = data2[data2.leg_index.isin(data1.leg_index)]
data3 = data3[data3.leg_index.isin(data1.leg_index)]

# Map summary words to indices.
bill2idx, word2idx, idx2word, word2count = bill_to_index(data1, data2, data3, 400)

# Build the natural_id -> encoded-summary lookup frame once and reuse it for
# all three merges. list(...) keeps this working on pandas versions that
# reject a raw dict_items view in the DataFrame constructor.
_bill_index_frame = pd.DataFrame(
    list(bill2idx.items()), columns=["natural_id", "bill2idx"]
)
data1 = pd.merge(data1, _bill_index_frame, on="natural_id")
data2 = pd.merge(data2, _bill_index_frame, on="natural_id")
data3 = pd.merge(data3, _bill_index_frame, on="natural_id")

In [32]:
# Columns kept in the final datasets, in output order.
_final_columns = [
    "natural_id", "bill_type", "chamber", "pip", "d_perc", "i_perc", "r_perc",
    "total_sponsors", "summary", "party", "vote", "leg_index", "bill2idx",
]
data1 = data1[_final_columns]
data2 = data2[_final_columns]
data3 = data3[_final_columns]

# One 50-dimensional row per word index, plus row 0, which stays all-zero:
# bill_to_index starts word indices at 1 and uses 0 as padding.
emb_matrix = np.zeros((len(word2idx) + 1, 50))

for word, index in word2idx.items():
    # Use the GloVe vector when the word has one; otherwise fall back to a
    # random vector drawn uniformly from [0, 1).
    emb_matrix[index, :] = (
        word_to_vec_map[word] if word in word_to_vec_map else np.random.random(50)
    )

In [33]:
# Persist the final datasets, the lookup dictionaries and the embedding
# matrix as pickles in the current working directory, in this order.
_outputs = (
    ("data_Final_20052012.pkl", data1),
    ("data_Final_20132014.pkl", data2),
    ("data_Final_20152016.pkl", data3),
    ("dictionaries.pkl", [word2idx, idx2word, word_to_vec_map, leg_to_index]),
    ("emb_matrix.pkl", emb_matrix),
)
for _path, _payload in _outputs:
    with open(_path, "wb") as openfile:
        pkl.dump(_payload, openfile)