In [1]:
import numpy as np
import re

In [2]:
data_path = 'dialogs.txt'

In [3]:
with open(data_path, 'r', encoding = 'utf-8') as file:
    lines = file.read().split('\n')

In [4]:
input_docs = []
target_docs = []

input_tokens = set()
target_tokens = set()

In [5]:
for line in lines:
    input_doc, target_doc = line.split('\t')
    input_docs.append(input_doc)
    
    target_doc = " ".join(re.findall(r"[\w']+|['^\w\s]", target_doc))
    target_doc = '<START> ' + target_doc + ' <END>'
    target_docs.append(target_doc)
    
    for token in re.findall(r"[\w']+|[^\s\w]", input_doc):
        if token not in input_tokens:
            input_tokens.add(token)
    for token in target_doc.split():
        if token not in target_tokens:
            target_tokens.add(token)
    

In [6]:
input_tokens = sorted(list(input_tokens))
target_tokens = sorted(list(target_tokens))

In [7]:
num_encoder_tokens = len(input_tokens)
num_decoder_tokens = len(target_tokens)

In [8]:
max_encoder_seq_length = max([len(re.findall(r"[\w']+|[^\w\s]", input_doc)) for input_doc in input_docs])
max_decoder_seq_length = max([len(re.findall(r"[\w']+|[^\w\s]", target_doc)) for target_doc in target_docs])

In [9]:
input_features_dict = dict(
    [(token, index) for index, token in enumerate(input_tokens)]
)

target_features_dict = dict(
    [(token, index) for index, token in enumerate(target_tokens)]
)

In [10]:
reverse_input_features_dict = dict(
    [(index, token) for token, index in input_features_dict.items()]
)

reverse_target_features_dict = dict(
    [(index, token ) for token, index  in target_features_dict.items()]
)

In [11]:
encoder_input_data = np.zeros(
    (len(input_docs), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32'
)

decoder_input_data = np.zeros(
    (len(input_docs), max_decoder_seq_length, num_decoder_tokens), 
    dtype='float32'
)

decoder_target_data = np.zeros(
    (len(input_docs), max_encoder_seq_length, num_decoder_tokens)
)

In [12]:
for line, (input_doc, target_doc) in enumerate(zip(input_docs, target_docs)):

  for timestep, token in enumerate(re.findall(r"[\w']+|[^\s\w]", input_doc)):
      
    encoder_input_data[line, timestep, input_features_dict[token]] = 1.

  for timestep, token in enumerate(target_doc.split()):

    decoder_input_data[line, timestep, target_features_dict[token]] = 1.
    if timestep > 0:

      decoder_target_data[line, timestep - 1, target_features_dict[token]] = 1.


In [13]:

# print out those value here:
print
print(list(input_features_dict.items())[450:500])
# print(list(input_features_dict.values())[450:500])
print(list(reverse_target_features_dict.items())[450:500])
# print(list(reverse_target_features_dict.values())[40:90])
print(list(input_tokens)[1000:1200])

[('cloth', 450), ('club', 451), ('coat', 452), ('code', 453), ('coffee', 454), ('coffin', 455), ('cold', 456), ('colder', 457), ('college', 458), ('collision', 459), ('color', 460), ('come', 461), ('comes', 462), ('comfortable', 463), ('coming', 464), ('commercials', 465), ('common', 466), ('company', 467), ('complain', 468), ('complained', 469), ('complainers', 470), ('complaining', 471), ('completely', 472), ('computer', 473), ('computers', 474), ('concrete', 475), ('conditioner', 476), ('cone', 477), ('cones', 478), ('conference', 479), ('congratulations', 480), ('considering', 481), ('constantly', 482), ('continued', 483), ('controls', 484), ('conversing', 485), ('converter', 486), ('cook', 487), ('cooking', 488), ("cooks'", 489), ('cool', 490), ('cop', 491), ('cops', 492), ('corner', 493), ('corporations', 494), ('correct', 495), ('cost', 496), ('costs', 497), ('coughing', 498), ('could', 499)]
[(450, 'climbing'), (451, 'clock'), (452, 'close'), (453, 'closed'), (454, 'closer'), (