In [1]:
import pandas as pd
# Load the train and test splits from the shared data directory in one pass.
df_train, df_test = map(pd.read_csv, ('../data/mytrain.csv', '../data/mytest.csv'))

In [None]:
# Pre-trained ELMo checkpoints, keyed by model size.
# Each value is an (options JSON URL, weights HDF5 URL) pair.
ELMO_MODELS = {
    'small': (
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json',
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5',
    ),
    'medium': (
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json',
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5',
    ),
    # Labelled "original" in this notebook; the URLs point at the 5.5B variant.
    'original': (
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json',
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5',
    ),
}

# Pick the model explicitly rather than relying on "last assignment wins".
# 'original' reproduces the values this cell previously ended up with.
ELMO_MODEL_SIZE = 'original'
options_file, weight_file = ELMO_MODELS[ELMO_MODEL_SIZE]

### Only about 22 sentences are longer than 100 tokens, so remove them for computational efficiency

In [None]:
import re

# Prepare token lists for a later character-id conversion step: pull the raw
# question strings, escape embedded newlines, tokenize, and record token counts.
sentences = df_train['question_text'].values

# Replace each real newline with the literal two-character sequence "\n"
# so every question stays on a single line.
s_removed_newline = [question.replace('\n', '\\n') for question in sentences]
df_train['removed_newline'] = s_removed_newline

# Split into runs of word characters and single punctuation symbols.
# This step may take a little under half a minute on the full training set.
token_pattern = re.compile(r"\w+|[^\w\s]", re.UNICODE)
tokenized_sen = [token_pattern.findall(question) for question in s_removed_newline]
df_train['tokenized_sen'] = tokenized_sen

# Number of tokens per question, used later to filter out very long ones.
token_len = list(map(len, tokenized_sen))
df_train['token_len'] = token_len

### Sentences longer than 100 tokens --> LaTeX + math

In [None]:
# Show the raw text of every question whose token count exceeds 100.
df_train.loc[df_train['token_len'] > 100, 'question_text']

### Remove sentences containing a newline for the time being

In [None]:
# Flag (1/0) each raw sentence that contains a newline.
# sum(is_newline) == 6, i.e., there are 6 sentences containing \n
is_newline = [int('\n' in e) for e in sentences]

# Keep only the rows whose question text has no newline.
has_newline = df_train['question_text'].str.contains('\n')
df_train = df_train[~has_newline]

### Remove sentences longer than 100 tokens for the time being

In [None]:
# Keep only the questions with at most 100 tokens.
within_limit = df_train['token_len'].le(100)
df_train_filtered = df_train.loc[within_limit]

### Instead of embedding them in Python, use the ELMo command-line tool directly

In [None]:
# Persist the filtered frame as CSV (the row index is written as the first column).
filtered_csv_path = 'se100_newlineremoved_text'
df_train_filtered.to_csv(filtered_csv_path)

### Read filtered text and save into file

In [None]:
# Reload the filtered frame from disk. The file was written by `to_csv` with
# its default index=True, so the first CSV column holds the original row
# labels. Reading it with index_col=0 restores those labels as the index
# instead of producing a spurious "Unnamed: 0" column and a fresh RangeIndex.
df = pd.read_csv('se100_newlineremoved_text', index_col=0)

# Peek at the question strings (numpy truncates the repr of long arrays).
df.question_text.values

In [None]:
from allennlp.modules.elmo import Elmo, batch_to_ids


# Build the ELMo module with a single output representation and no dropout.
elmo = Elmo(options_file, weight_file, num_output_representations=1, dropout=0)

# batch_to_ids expects a list of token lists and returns character ids shaped
# (batch_size, max_sentence_len, 50). Only the first 10 sentences are used here.
sample_tokens = df_train['tokenized_sen'].tolist()[:10]
character_ids = batch_to_ids(sample_tokens)

# Forward pass through ELMo for the sample batch.
embeddings = elmo(character_ids)

### Running embeddings via ELMO

`allennlp elmo filtered_text elmo_layers.hdf5 --top --options-file /u/shawnlyu/projects/linguistics/downloads/elmo_2x2048_256_2048cnn_1xhighway_options.json --weight-file /u/shawnlyu/projects/linguistics/downloads/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5`