# Training an RMN on all session documents

In [1]:
import os
import pandas as pd

In [2]:
os.chdir("../../../scripts/assembly/")

In [3]:
from document import load_documents
from constant import DOC_ALL_PATH, TOOLS_PATH, MIN_SESSION, MAX_SESSION

In [4]:
# run several times
os.chdir("../modeling")
from helper import load_pickled_object
from rmn import RMN

In [5]:
# load embedding tools
local_tools_path = '/home/rocassius/gen-data/tools'
tokenizer_dict = load_pickled_object(os.path.join(local_tools_path, "global_tokenizer_dict"))
metadata_dict = load_pickled_object(os.path.join(local_tools_path, "global_metadata_dict"))
embedding_matrix = load_pickled_object(os.path.join(local_tools_path, "global_embedding_matrix_50d"))

In [6]:
# load documents
docs_df = pd.read_csv(os.path.join(DOC_ALL_PATH, "documents_total.csv"), sep = "|")

In [113]:
d = docs_df.sample(8231)

In [8]:
rmn = RMN()
rmn.embedding_matrix = embedding_matrix
rmn.tokenizer_dict = tokenizer_dict
rmn.metadata_dict = metadata_dict
rmn.num_topics = 100
rmn.lamb = 1.0
rmn.build_model()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [152]:
# from keras.utils import Sequence
from tensorflow.python.keras.utils.data_utils import Sequence

class RMN_DataGenerator(Sequence):
    """Generates data for an RMN"""
    
    def __init__(self, rmn, data_df, batch_size=50):
        
        'Initialization'
        self.rmn = rmn
        self.data_df = data_df
        self.batch_size = batch_size
        self.indices = data_df.index.tolist()
        
        # shuffle indicies upon initialization
        np.random.shuffle(self.indices)
        
        self.on_epoch_end()
        
        
    def on_epoch_end(self):
        'Updates indexes after each epoch'
        np.random.shuffle(self.indices)
         
            
    def __data_generation(self, indices):
        'Generates data containing batch_size samples' 
        # generate data for indices
        X = self.rmn.prep_inputs(self.data_df.loc[indices])
        y = X[0]

        return X, y
    
    
    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.indices) / self.batch_size))


    def __getitem__(self, i):
        'Generate one batch of data'
        # Generate indexes of the batch
        indices = self.indices[i * self.batch_size:(i+1) * self.batch_size]

        # Generate data
        X, y = self.__data_generation(indices)

        return X, y

In [153]:
data_generator = RMN_DataGenerator(data_df=d,
                                   rmn=rmn, 
                                   batch_size=100)

In [154]:
rmn.model.fit_generator(data_generator, epochs=5, use_multiprocessing=True, workers=5)

Epoch 1/5
Epoch 2/5
Epoch 1/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f5a00ba6150>

In [117]:
rmn.build_topic_model()

In [None]:
rmn.model.predict

In [128]:
docs_df.shape

(6599686, 10)

In [146]:
d_samp = docs_df.sample(1000000)

In [155]:
BS = 100
dg = RMN_DataGenerator(data_df=d_samp, rmn=rmn, batch_size=BS)

In [156]:
import time 
start = time.time()
rmn.topic_model.predict_generator(dg, use_multiprocessing=True, workers=20)
end = time.time()
elapsed = end - start
print(round(elapsed/60, 3), "MINUTES")

0.63 MINUTES


In [149]:
import time 
start = time.time()
rmn.topic_model.predict_generator(dg, use_multiprocessing=True, workers=10)
end = time.time()
elapsed = end - start
print(round(elapsed/60, 3), "MINUTES")

0.542 MINUTES


In [148]:
import time 
start = time.time()
rmn.topic_model.predict_generator(dg, use_multiprocessing=True, workers=5)
end = time.time()
elapsed = end - start
print(round(elapsed/60, 3), "MINUTES")

0.636 MINUTES


In [139]:
import time 
start = time.time()
rmn.topic_model.predict_generator(dg, use_multiprocessing=True)
end = time.time()
elapsed = end - start
print(round(elapsed/60, 3), "MINUTES")

0.337 MINUTES


In [136]:
import time 
start = time.time()
rmn.predict_topics(d_samp)
end = time.time()
elapsed = end - start
print(round(elapsed/60, 3), "MINUTES")

0.403 SECONDS


In [118]:
rmn.topic_model.predict_generator(d)

array([[0.00964151, 0.00794159, 0.0080036 , ..., 0.01449753, 0.00748336,
        0.00985585],
       [0.01065519, 0.00841128, 0.00978033, ..., 0.01299322, 0.00835345,
        0.00929048],
       [0.0099263 , 0.00778961, 0.00782576, ..., 0.016422  , 0.0082397 ,
        0.00919981],
       ...,
       [0.01288762, 0.00774932, 0.00858825, ..., 0.0129737 , 0.00898047,
        0.00958539],
       [0.01265937, 0.00752051, 0.00768751, ..., 0.0152425 , 0.00959403,
        0.00908329],
       [0.01072115, 0.00845621, 0.00909524, ..., 0.01357337, 0.00882754,
        0.01090457]], dtype=float32)

In [40]:
d.loc[[1817194]]

Unnamed: 0,speakerid,lastname,firstname,chamber,state,gender,party,document,subject,session
1817194,82100361,AIKEN,GEORGE,S,VT,M,R,appliances at prices either below cost or just...,defense,82


In [41]:
d.iloc[list_IDs_temp]

Unnamed: 0,speakerid,lastname,firstname,chamber,state,gender,party,document,subject,session
1817194,82100361,AIKEN,GEORGE,S,VT,M,R,appliances at prices either below cost or just...,defense,82
2410324,87100320,YOUNGER,JESSE,H,CA,M,R,in cuba and in laos w continue to be concerned...,labor,87
3254612,92110800,SMITH,NEAL,H,IA,M,D,most exciting programs involves the use of a n...,minorities,92
1817194,82100361,AIKEN,GEORGE,S,VT,M,R,appliances at prices either below cost or just...,defense,82
2957006,91099231,HOLLAND,SPESSARD,S,FL,M,D,the from made some points about the fact that ...,defense,91


In [15]:
docs_df['speakerid'] = docs_df['speakerid'].astype(str)
docs_df['session'] = docs_df['session'].astype(str)

In [17]:
X = rmn.prep_inputs(d)
y = X[0]

In [22]:
import numpy as np
np.array(X).shape

ValueError: could not broadcast input array from shape (75,50) into shape (75)

In [19]:
y.shape

(75, 50)

In [None]:
rmn.model.fit(x=X, y=y, epochs = 7, batch_size = 200)