In [1]:
import os
import pandas as pd

In [2]:
# Switch the working directory to scripts/assembly, presumably so the local
# `document` and `constant` modules imported in the next cell resolve.
# NOTE(review): the path is relative to the current working directory, so this
# cell is not idempotent -- re-running it after another chdir lands elsewhere.
os.chdir("../../../scripts/assembly/")

In [9]:
from document import load_documents
from constant import DOC_ALL_PATH, DOCUMENT, MIN_SESSION, MAX_SESSION
# Every session number from MIN_SESSION through MAX_SESSION, inclusive.
sessions = list(range(MIN_SESSION, MAX_SESSION+1))

In [7]:
# Switch to the sibling modeling directory, presumably so the local
# `token_mapping`, `helper` and `embeddings` modules imported below resolve.
# NOTE(review): relative chdir -- only correct when the current directory is a
# sibling of modeling/ (e.g. scripts/assembly). The os.getcwd() outputs further
# down still report scripts/assembly, so the cells were likely run out of order.
os.chdir("../modeling/")

from token_mapping import *
from helper import pickle_object
from embeddings import *

In [10]:
# Load the documents for every session in `sessions` from DOC_ALL_PATH.
# The shape shown in the next cell is (6599686, 10), so this is presumably a
# slow, memory-heavy step -- TODO confirm and consider caching.
docs_df = load_documents(sessions, DOC_ALL_PATH)

In [11]:
docs_df.shape

(6599686, 10)

In [12]:
docs_df.head()

Unnamed: 0,speakerid,lastname,firstname,chamber,state,gender,party,document,subject,session
0,43042541,DENNIS,GEORGE,S,MD,M,D,the surplus and discourage their production in...,alcohol,43
1,43042101,WRIGHT,GEORGE,S,IA,M,R,to me a good feature of the bill that the same...,alcohol,43
2,43042580,ELDREDGE,CHARLES,H,WI,M,D,social evils which must be remitted to the dom...,alcohol,43
3,43041181,FRELINGHUYSEN,FREDERICK,S,NJ,M,R,they have accepted five dollars and they must ...,alcohol,43
4,43050520,POLAND,LUKE,H,VT,M,R,the relief of foster a hixon late a paymaster ...,alcohol,43


In [17]:
docs_df.dtypes

speakerid    object
lastname     object
firstname    object
chamber      object
state        object
gender       object
party        object
document     object
subject      object
session      object
dtype: object

In [13]:
# Columns of docs_df that are passed to build_metadata_dict as metadata features.
feature_columns = [
    'speakerid', 'chamber', 'state', 'gender',
    'party', 'session', 'subject',
]


In [16]:
# Fail fast if docs_df is missing any of the requested feature columns.
assert set(feature_columns).issubset(docs_df.columns)

In [18]:
# Build the tokenizer dict from the loaded documents. Later cells read its
# 'word_index' entry, a word -> integer mapping used to index rows of the
# embedding matrix. `build_tokenizer_dict` arrives via a star import
# (presumably from token_mapping -- verify).
tokenizer_dict = build_tokenizer_dict(docs_df)

In [19]:
# Build the metadata dict for the chosen feature columns, then display its keys
# as a sanity check (the output lists one key per entry of feature_columns).
metadata_dict = build_metadata_dict(docs_df, feature_columns)
metadata_dict.keys()

dict_keys(['speakerid', 'chamber', 'state', 'gender', 'party', 'session', 'subject'])

In [20]:
os.getcwd()

'/home/rocassius/w266_final/scripts/assembly'

In [21]:
# Persist the tokenizer dict under the name "global_tokenizer_dict".
# NOTE(review): where pickle_object writes is not visible in this notebook; if
# it resolves the name against the working directory, the artifact lands in
# scripts/assembly (see the os.getcwd() output above) -- confirm.
pickle_object(tokenizer_dict, "global_tokenizer_dict")

In [22]:
# Persist the metadata dict under the name "global_metadata_dict".
# NOTE(review): output location depends on pickle_object, which is defined
# outside this notebook -- confirm the destination.
pickle_object(metadata_dict, "global_metadata_dict")

In [25]:
# Fetch the pretrained word -> vector index. Outputs further down show 400,000
# entries and 50-column matrices built from it; presumably GloVe 50d, per the
# note below and the "_50d" pickle names -- confirm against fetch_embeddings.
embeddings_index = fetch_embeddings()

In [26]:
# Tiny hand-written embedding index (3-dimensional vectors) for quick experiments.
example_embeddings_index = {
    "the": [1, 2, 3],
    "my": [2, 3, -1],
    "constitution": [0, 4, 5],
}

In [27]:
def build_embedding_matrix(word_index, embeddings_index, stopwords=None):
    """Build an embedding lookup matrix aligned with a tokenizer word index.

    Row ``i`` of the result holds the pretrained vector of the word whose
    index in ``word_index`` is ``i``. Rows stay all-zero for words that are
    missing from ``embeddings_index`` and for words listed in ``stopwords``.

    Parameters
    ----------
    word_index : dict
        Mapping of word -> integer row index. The matrix has
        ``len(word_index) + 1`` rows, so indices are assumed to lie in
        ``0..len(word_index)`` (presumably 1-based, leaving row 0 for padding).
    embeddings_index : dict
        Mapping of word -> vector. All vectors are assumed to share one length.
    stopwords : iterable of str, optional
        Words whose rows are forced to zero even if an embedding exists.
        ``None`` (the default) or an empty iterable zeroes nothing.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index) + 1, embedding_dim)``, dtype float16.

    Raises
    ------
    ValueError
        If ``embeddings_index`` is empty, since the embedding dimension
        cannot be inferred.
    """
    # A set gives O(1) membership tests; the loop below runs once per
    # vocabulary word, so scanning a list each time would be quadratic.
    stopword_set = frozenset(stopwords) if stopwords is not None else frozenset()

    # Infer the dimension from any vector instead of requiring the key 'the'.
    try:
        embedding_dim = len(next(iter(embeddings_index.values())))
    except StopIteration:
        raise ValueError(
            "embeddings_index is empty; cannot infer the embedding dimension"
        ) from None

    # Allocate directly in the output dtype to avoid a float64 intermediate
    # that is four times the size of the final matrix.
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim), dtype='float16')

    for word, i in word_index.items():
        if word in stopword_set:
            continue  # stopword rows stay all-zero
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        # words not found in the embedding index stay all-zero

    return embedding_matrix

In [39]:
from preprocess import *

In [58]:
# Combine the two stopword collections into the single sequence that is passed
# to build_embedding_matrix below. Both names arrive via a star import
# (presumably from preprocess, and presumably lists of strings -- verify).
stopwords = english_stopwords + name_stopwords

In [59]:
# Embedding matrix in which the rows of stopwords are left as zeros.
embedding_matrix = build_embedding_matrix(
    word_index=tokenizer_dict['word_index'],
    embeddings_index=embeddings_index,
    stopwords=stopwords,
)

In [60]:
embedding_matrix.shape

(1628003, 50)

In [66]:
len(embeddings_index)

400000

About 150,000 tokens of the corpus vocabulary are represented (the count of non-zero rows is computed in the next cell). The GloVe embedding index itself represents 400,000 tokens.

In [67]:
# Count the rows that hold a pretrained vector. Each row is tested for *any*
# nonzero component rather than for a nonzero row sum: a real vector whose
# components happen to cancel out (or whose float16 sum misbehaves) would
# otherwise be miscounted.
np.count_nonzero(embedding_matrix.any(axis=1))

156275

Manual check that certain words are represented and others aren't.

In [91]:
# Words to spot-check in the embedding matrices.
sample_words = [
    'constitutional', 'blue', 'mom', 'the',
    'war', 'with', 'just', 'gesticulate',
    'civil', 'pelosi', 'nancy', 'blacks',
    'cash', 'economy', 'wasnt', 'female',
]

# Print each word followed by its row of the stopword-zeroed matrix.
for token in sample_words:
    row = tokenizer_dict['word_index'][token]
    print("=" * 20)
    print(token)
    print(embedding_matrix[row])

constitutional
[-0.617   -0.1227  -0.919    0.04483  0.2021   0.7324   0.919   -0.02875
 -0.1716  -0.3481  -0.3762  -0.0846   0.04965  0.2974   0.092    0.3423
  0.08093 -1.232    0.893   -0.08044  0.1415   0.1912  -0.5513  -0.723
  0.1267  -1.98     0.1202  -0.9253  -0.4568   0.725    2.309   -0.639
 -2.291   -0.988   -0.317   -0.5493   0.502   -0.01098 -0.2445   0.2715
 -0.9604   0.0947   0.263    0.986   -1.787    0.9033  -0.559    0.3342
  0.0872  -0.6    ]
blue
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0.]
mom
[-0.2659    0.5186    0.0656   -0.5605    0.7656    0.3455   -0.7646
  0.1807   -0.007324  0.07104  -0.3833    0.2467    0.0745    0.1843
  1.054     0.2292   -0.615     0.7207    0.9834    0.07794  -0.0778
  1.066     0.071     1.29      0.863    -1.42     -0.7812    0.2115
  1.086    -1.152     1.355     0.9326   -0.0638    0.707     0.01733
  0.129    -0.01277   0.0

In [89]:
# make one embedding matrix that does not have zeroed stopwords
embedding_matrix_nz = build_embedding_matrix(
    tokenizer_dict['word_index'], 
    embeddings_index)

In [93]:
# Print each sample word followed by its row of the non-zeroed matrix.
for token in sample_words:
    row = tokenizer_dict['word_index'][token]
    print("=" * 20)
    print(token)
    print(embedding_matrix_nz[row])

constitutional
[-0.617   -0.1227  -0.919    0.04483  0.2021   0.7324   0.919   -0.02875
 -0.1716  -0.3481  -0.3762  -0.0846   0.04965  0.2974   0.092    0.3423
  0.08093 -1.232    0.893   -0.08044  0.1415   0.1912  -0.5513  -0.723
  0.1267  -1.98     0.1202  -0.9253  -0.4568   0.725    2.309   -0.639
 -2.291   -0.988   -0.317   -0.5493   0.502   -0.01098 -0.2445   0.2715
 -0.9604   0.0947   0.263    0.986   -1.787    0.9033  -0.559    0.3342
  0.0872  -0.6    ]
blue
[-0.8374    0.696    -0.514     0.2369    0.592    -0.0275   -1.208
 -0.988    -0.2766   -0.462     0.4714    0.1307    0.504     0.5054
 -0.6675    0.0691   -0.61     -0.2278   -1.248    -1.353    -0.5605
 -0.1796    0.2289   -0.6924   -1.174    -0.988    -0.8154    1.552
  0.3652   -1.116     2.633     0.2198    0.10693   0.2844   -0.10345
 -0.2966   -0.1764   -0.7583    0.0855   -0.8364   -0.12177  -0.0632
 -0.0721   -0.3071    0.6187   -0.3086    0.012375 -1.196     0.04153
 -0.2396  ]
mom
[-0.2659    0.5186    0.0656  

In [94]:
# Persist the matrix that keeps stopword vectors, under a name carrying the
# 50-dimension suffix.
# NOTE(review): the destination is decided by pickle_object, which is defined
# outside this notebook -- confirm where the file is written.
pickle_object(embedding_matrix_nz, "global_embedding_matrix_nz_50d")

In [86]:
from helper import load_pickled_object
# Read back a previously pickled embedding matrix to inspect it.
# NOTE(review): no visible cell pickles anything under the name
# "global_embedding_matrix_50d" (only the "_nz_50d" variant is written), and
# this cell's execution count precedes that write -- it appears to rely on an
# artifact from a deleted cell or an earlier session. Confirm, or add the
# missing pickle_object(embedding_matrix, ...) step so Run All reproduces it.
e = load_pickled_object("global_embedding_matrix_50d")

In [87]:
e

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float16)

In [95]:
os.getcwd()

'/home/rocassius/w266_final/scripts/assembly'