# Scratch book to try out different functions

## 1. Read lines in movie_lines.txt to build a dict of lines

In [None]:
from data import TextReader
from word2vec import word2vec

In [None]:
# Create the reader first, then point it at the Cornell movie_lines file.
# Later cells read the result from `reader.lines`.
reader = TextReader()
file_name = "./Data/cornell_data/movie_lines.txt"
reader.read_line_dict(file_name)

In [None]:
# The first line stored in movie_lines.txt
print(reader.lines[1045])

# Find the longest line (used in seq2seq for attention).
# Each line is tokenized exactly once and the materialized token list is kept,
# so `max_split` always prints as the actual tokens even if sentence_to_list
# returns a one-shot iterator (the original wrapped it in list() for len()).
splitter = word2vec()
max_length = 0
max_line   = None
max_split  = None
for line in reader.lines.values():
    tokens = list(splitter.sentence_to_list(line))
    # Strict '>' keeps the first line seen among equally long ones.
    if len(tokens) > max_length:
        max_length = len(tokens)
        max_line   = line
        max_split  = tokens

# With no lines at all this prints (0, None, None).
print((max_length, max_line, max_split))

## 2. Get vocabulary

In [None]:
from collections import defaultdict

# Tally how often each whitespace-separated token occurs over all lines.
vocab = defaultdict(int)
for line in reader.lines.values():
    for word in line.split():
        vocab[word] += 1

# Vocabulary size, then the count for one sample token.
# NOTE(review): the bytes key suggests the lines are stored as bytes - confirm.
# Looking up a missing key in a defaultdict inserts it with count 0.
print(len(vocab))
print(vocab[b'They'])

## 3. Read dialogues

In [None]:
# Default file path is movie_conversations
# read_dialogues() is called with no arguments, so the reader's own default
# path is used; the first entry of reader.dialogues is printed as a quick check.
# NOTE(review): the structure of a dialogue entry is not visible here - confirm
# against data.py.
reader.read_dialogues()
print(reader.dialogues[0])

## 4. Sanity Test on Word2Vec Prep

In [None]:
from data import TextReader
from word2vec import word2vec

In [None]:
# Sanity test on training
# Builds a fresh TextReader on shortened_lines.txt. This rebinds `reader`,
# replacing the instance that was loaded from movie_lines.txt in section 1.
# NOTE(review): the file name suggests a reduced corpus for a quick run - confirm.
reader = TextReader()
reader.read_line_dict("./Data/cornell_data/shortened_lines.txt")

# Prepare the word2vec model from the reader before any training happens.
# NOTE(review): presumably add_whole_corpus collects the words and
# generate_indices assigns them ids - confirm against word2vec.py.
word_model = word2vec()
word_model.add_whole_corpus(reader)
word_model.generate_indices()

In [None]:
# Train for a single epoch on the current reader, then merge the embeddings
# and save them under the name "word_embedding".
# NOTE(review): what merge_embeddings combines and where save_embedding writes
# are not visible from this notebook - confirm in word2vec.py.
word_model.train(reader, epochs=1)
word_model.merge_embeddings()
word_model.save_embedding("word_embedding")

In [None]:
# Load the embedding back using the same name passed to save_embedding above,
# as a round-trip check of the save/load pair.
word_model.load_embedding("word_embedding")

## 5. Sanity Test for EncoderRNN

In [1]:
from seq2seq import EncoderRNN, AttnDecoderRNN

Using CUDA
Using CUDA for Seq2Seq


In [2]:
import torch

# Sizes for the sanity test. SEQ_LENGTH is named explicitly so it is not
# confused with HIDDEN_SIZE, which happens to share the value 5.
SEED        = 0
HIDDEN_SIZE = 5
INPUT_SIZE  = 12
BATCH_SIZE  = 10
SEQ_LENGTH  = 5

# Seed PyTorch before the models are constructed and the random input is drawn
# so that re-running this cell reproduces the same numbers.
torch.manual_seed(SEED)

encoder = EncoderRNN(input_size=INPUT_SIZE, hidden_size=HIDDEN_SIZE)
decoder = AttnDecoderRNN(hidden_size=HIDDEN_SIZE, output_size=INPUT_SIZE, batch_size=BATCH_SIZE)

# Random input laid out as (SEQ_LENGTH, BATCH_SIZE, INPUT_SIZE).
input_tensor = torch.randn(SEQ_LENGTH, BATCH_SIZE, INPUT_SIZE)
# `output` is indexed with [0] by the decoder cell.
# NOTE(review): presumably an (outputs, hidden) pair - confirm in seq2seq.py.
output = encoder(input_tensor, encoder.initHidden(batch_size=BATCH_SIZE))

In [3]:
# Run one decoder step on a random start input, with a fresh decoder hidden
# state and the first element of the encoder result.
decoding_start = torch.randn(1, BATCH_SIZE, INPUT_SIZE)
# Call the decoder instance rather than .forward() directly, matching how the
# encoder is invoked; for an nn.Module this is what runs registered hooks.
# NOTE(review): assumes AttnDecoderRNN is an nn.Module like EncoderRNN - confirm.
decoded = decoder(decoding_start, decoder.initHidden(), output[0])

In [4]:
# Show the decoder result. The saved output is a tuple whose first element is a
# tensor with 12 values (= INPUT_SIZE) per row.
# NOTE(review): the values look like log-probabilities - confirm in seq2seq.py.
print(decoded)

(tensor([[-2.4986, -2.7907, -2.2938, -2.2136, -2.7745, -2.5925, -2.6909, -2.6268,
         -2.1018, -2.6232, -2.5195, -2.3690],
        [-2.5788, -2.7653, -2.3102, -2.3375, -2.8125, -2.5603, -2.5890, -2.5787,
         -2.0528, -2.6005, -2.4524, -2.4263],
        [-2.5695, -2.7711, -2.3112, -2.3230, -2.8060, -2.5636, -2.6005, -2.5809,
         -2.0564, -2.6024, -2.4600, -2.4207],
        [-2.5339, -2.7894, -2.3199, -2.2594, -2.7971, -2.5826, -2.6340, -2.5877,
         -2.0659, -2.5890, -2.4871, -2.4304],
        [-2.5752, -2.7189, -2.3072, -2.3212, -2.8537, -2.5738, -2.5613, -2.5821,
         -2.0614, -2.5451, -2.4526, -2.5047],
        [-2.5597, -2.7779, -2.3144, -2.3125, -2.7972, -2.5640, -2.6131, -2.5808,
         -2.0596, -2.6071, -2.4674, -2.4123],
        [-2.5618, -2.7771, -2.3156, -2.3169, -2.7978, -2.5624, -2.6104, -2.5788,
         -2.0582, -2.6069, -2.4656, -2.4136],
        [-2.5312, -2.7828, -2.2967, -2.2657, -2.7874, -2.5771, -2.6526, -2.6107,
         -2.0820, -2.6220, -2