In [None]:
import numpy as np
from types import SimpleNamespace
from collections import Counter
import os
import re
import pathlib
import array
import pickle
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import math

In [None]:
# Read the whole script into memory. The context manager closes the file
# handle as soon as the lines are read, even if reading raises.
with open('dataset.txt', 'r') as f:
    readfile = f.readlines()

# NOTE(review): this predicate is true for every line (no string can equal both
# '\n' and '<EOE>'), so blank lines and the '<EOE>' marker are all kept.
# It is left as-is because changing it would change the corpus; confirm
# whether `and` was intended before tightening it.
dataset = [x for x in readfile if '\n' != x or '<EOE>' != x]
dataset = np.array(dataset)
print(dataset)

['Michael: All right Jim. Your quarterlies look very good. How are things at the library?\n'
 '\n' "Jim: Oh, I told you. I couldn't close it. So...\n" ...
 "Pam: I thought it was weird when you picked us to make a documentary. But all in all...I think an ordinary paper company like Dunder Mifflin was a great subject for a documentary. There's a lot of beauty in ordinary things. Isn't that kind of the point?\n"
 '\n' '<EOE>']


In [None]:
# Number of lines read from the script file (one array entry per line).
dataset.shape

(110460,)

In [None]:
def tokenize(dataset):
  """
  Replace punctuation with word-like placeholder tokens and split on spaces.

  :param dataset: iterable of strings (e.g. the lines of the script).
  :return: flat np array holding the space-separated tokens of every line, in
           order. Consecutive spaces produce empty-string tokens.
  """
  # Each symbol becomes a space-padded placeholder so that splitting on " "
  # isolates it as its own token.
  placeholders = {
      '.': " ||period|| ",
      ",": " ||comma|| ",
      '"': " ||doublequotationmark|| ",
      "'": " ||quotationmark|| ",
      "?": " ||questionmark|| ",
      "-": " ||dash|| ",
      "\n": " ||return||",
      ")": " ||rightparentheses|| ",
      "(": " ||leftparentheses|| ",
      "[": " ||leftbracket|| ",
      "]": " ||rightbracket|| ",
      ";": " ||semicolon|| ",
      "!": " ||exclamationmark|| ",
      ":": " ||colon|| ",
      "%": " ||percent|| ",
      "$": " ||dollar|| ",
      "#": " ||hashtag|| ",
      "/": " ||forwardbar|| ",
  }

  tokens = []
  for line in dataset:
    # Dicts iterate in insertion order, so substitutions run in the order
    # the table lists them.
    for symbol, placeholder in placeholders.items():
      line = line.replace(symbol, placeholder)
    tokens.extend(line.split(" "))
  return np.array(tokens)

In [None]:
def vocab(text):
  """
  Build the vocabulary of a token sequence.

  :param text: array-like of tokens.
  :return: np array of the distinct tokens, as produced by np.unique
           (sorted, duplicates removed).
  """
  return np.unique(text)

In [None]:
def create_lookup_tables(text):
    """
    Create lookup tables for vocabulary
    :param text: The text of tv scripts split into words
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)

    Ids are assigned by descending frequency; words with equal counts keep
    their first-occurrence order. The mapping is therefore reproducible from
    one interpreter run to the next, unlike iteration over a ``set`` of
    strings, whose order depends on per-process hash randomisation.
    """
    word_counts = Counter(text)
    # sorted() is stable, and Counter preserves first-seen order, so ties are
    # broken deterministically.
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)

    vocab_to_int = {word: idx for idx, word in enumerate(sorted_vocab)}
    int_to_vocab = {idx: word for word, idx in vocab_to_int.items()}

    return (vocab_to_int, int_to_vocab)

In [None]:
def text_to_int(text, vocab_to_int):
  """
  Translate a token sequence into ids.

  :param text: array-like of tokens.
  :param vocab_to_int: dict mapping token -> id.
  :return: np array with the id looked up for each token (pandas ``map``
           semantics apply to tokens missing from the dict).
  """
  return pd.Series(text).map(vocab_to_int).to_numpy()

In [None]:
def int_to_text(int_text, int_to_vocab):
  """
  Translate an id sequence back into tokens.

  :param int_text: array-like of ids.
  :param int_to_vocab: dict mapping id -> token.
  :return: np array with the token looked up for each id.
  """
  return pd.Series(int_text).map(int_to_vocab).to_numpy()

In [None]:
def reduce_mem_usage(train_data):
    """
    Downcast an array to the narrowest candidate dtype that holds its range.

    The dtype is read from the first element. Dtypes whose name starts with
    'int' try int8/int16/int32/int64 in that order; every other non-object
    dtype tries float16, then float32, and otherwise becomes float64. Bounds
    are compared strictly, so a value sitting exactly on a type's limit is
    pushed to the next wider type. Object data is cast with
    ``astype('category')``. The resulting element dtype is printed.

    :param train_data: array-like supporting indexing, min(), max(), astype().
    :return: the (possibly) re-typed data.
    """
    first_dtype = train_data[0].dtype

    if first_dtype != object:
        lowest = train_data.min()
        highest = train_data.max()
        if str(first_dtype).startswith('int'):
            # First integer width whose open interval contains the data wins;
            # if none fits the data is left untouched.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if lowest > limits.min and highest < limits.max:
                    train_data = train_data.astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if lowest > limits.min and highest < limits.max:
                    train_data = train_data.astype(candidate)
                    break
            else:
                # Neither reduced-precision float can represent the range.
                train_data = train_data.astype(np.float64)
    else:
        train_data = train_data.astype('category')
    print(train_data[0].dtype)
    return train_data

In [None]:
# Encode the corpus: lines -> tokens -> vocabulary -> lookup tables -> ids.
tokenized_text = tokenize(dataset)
vocabulary = vocab(tokenized_text)
vocab_to_int, int_to_vocab = create_lookup_tables(vocabulary)
int_text = text_to_int(tokenized_text, vocab_to_int)
# Optional downcast of the ids, currently disabled:
#int_text = reduce_mem_usage(int_text)

# Only the encoded ids and the lookup tables are needed from here on, so
# release the raw lines and the token array.
del dataset, tokenized_text

In [None]:
# Total number of token ids in the encoded corpus.
print(int_text.shape)

(1400061,)


In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU and
# warn, since training on CPU is slow.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cpu':
    print("WARNING: Training without GPU can be very slow!")

In [None]:
from torch.utils.data import TensorDataset, DataLoader

def batch_data(words, sequence_length, batch_size, device=None):
    """
    Batch the neural network data using DataLoader
    :param words: The word ids of the TV scripts
    :param sequence_length: The sequence length of each batch
    :param batch_size: The size of each batch; the number of sequences in a batch
    :param device: Device the tensors are placed on. Defaults to CUDA when
        available, otherwise the CPU.
    :return: DataLoader with batched data

    Sample ``i`` is the pair ``(words[i:i + sequence_length],
    words[i + sequence_length])``, so there are
    ``len(words) - sequence_length`` samples (none when the input is too
    short). Samples are served in corpus order (``shuffle=False``).
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # np.array copies, so the dataset never aliases the caller's buffer.
    word_ids = torch.from_numpy(np.array(words))
    n_samples = max(len(word_ids) - sequence_length, 0)

    if n_samples == 0:
        # Not enough words for even one (window, target) pair.
        features = word_ids.new_empty((0, sequence_length))
        targets = word_ids.new_empty((0,))
    else:
        # unfold yields every length-`sequence_length` window as a strided
        # view, with no Python loop and no per-window copy. The final window
        # has no following word to predict, so it is dropped.
        features = word_ids.unfold(0, sequence_length, 1)[:n_samples]
        targets = word_ids[sequence_length:]

    data = TensorDataset(features.to(device), targets.to(device))
    data_loader = torch.utils.data.DataLoader(data, shuffle=False, batch_size=batch_size)

    return data_loader

In [None]:
def attention(query, key, value, mask=None, dropout=None):
    """Compute 'Scaled Dot Product Attention'.

    :param query: tensor of shape (..., L_q, d_k).
    :param key: tensor of shape (..., L_k, d_k).
    :param value: tensor of shape (..., L_k, d_v).
    :param mask: optional tensor broadcastable to (..., L_q, L_k). Positions
        where ``mask == 0`` are excluded from the softmax.
    :param dropout: optional callable applied to the attention weights.
    :return: tuple ``(output, p_attn)`` - the weighted sum of ``value`` and the
        attention weights.
    """
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) \
             / math.sqrt(d_k)
    if mask is not None:
        # Use the most negative finite value of the dtype rather than -inf, so
        # a fully masked row yields uniform weights instead of NaN.
        scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)
    p_attn = F.softmax(scores, dim = -1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

In [None]:
''' Data params '''
# Number of consecutive word ids in each input window
sequence_length = 12   # of words in a sequence
# Number of (window, next-word) samples per batch
batch_size = 400

# Build the DataLoader over the encoded corpus (runs once, at this point)
train_loader = batch_data(int_text, sequence_length, batch_size)

'''Training parameters'''
# Number of Epochs
num_epochs = 4
# Learning Rate
learning_rate = 0.0008

'''Model parameters'''
# Vocab size: number of distinct tokens in the corpus
vocab_size = len(vocabulary)
# Output size: one score per vocabulary entry
output_size = vocab_size
# Embedding Dimension
embedding_dim = 200

# Show stats for every n number of batches
show_every_n_batches = 20
# Report the vocabulary size and the number of batches per pass
print(vocab_size)
print(len(train_loader))

24531
3501


In [None]:
import torch.nn as nn

class SelfAttention(nn.Module):
    """Single-head self-attention with query/key/value/output projections.

    Every projection is an ``embed_dim -> embed_dim`` linear layer. The input
    and output of ``forward`` both have shape (B, W, E), where B is the batch
    size, W the number of context words and E the embedding dimension.
    """

    def __init__(self, embed_dim, bias=True):
        super().__init__()
        # Creation order (k, v, q, out) fixes both the state_dict key order
        # and the order in which the RNG is consumed.
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise the projection weights (and zero the output bias)."""
        # Empirically observed the convergence to be much better with the
        # scaled initialization on the k/v/q projections.
        scaled_gain = 1 / math.sqrt(2)
        for projection in (self.k_proj, self.v_proj, self.q_proj):
            nn.init.xavier_uniform_(projection.weight, gain=scaled_gain)
        nn.init.xavier_uniform_(self.out_proj.weight)
        out_bias = self.out_proj.bias
        if out_bias is not None:
            nn.init.constant_(out_bias, 0.)

    def forward(self, x):
        """Attend over the W axis of ``x`` (B, W, E) and return (B, W, E)."""
        queries = self.q_proj(x)   # (B, W, E)
        keys = self.k_proj(x)      # (B, W, E)
        values = self.v_proj(x)    # (B, W, E)
        attended, _ = attention(queries, keys, values)  # (B, W, E)
        return self.out_proj(attended)


In [None]:
class TransformerLayer(nn.Module):
    """Attention block followed by a position-wise feed-forward block.

    Each sub-block is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.

    :param d_model: feature size of the input and output.
    :param dim_feedforward: hidden size of the feed-forward block.
    :param dropout: dropout probability used by all three dropout layers.
    :param activation: feed-forward non-linearity, ``"relu"`` or ``"gelu"``.
    :raises ValueError: if ``activation`` is not one of the supported names.
    """

    _ACTIVATIONS = {"relu": F.relu, "gelu": F.gelu}

    def __init__(self, d_model, dim_feedforward=512, dropout=0.1, activation="relu"):
        super().__init__()
        if activation not in self._ACTIVATIONS:
            raise ValueError(
                f"activation must be one of {sorted(self._ACTIVATIONS)}, got {activation!r}")
        self.activation = self._ACTIVATIONS[activation]

        self.self_attn1 = SelfAttention(d_model)
        self.self_attn2 = SelfAttention(d_model)
        self.self_attn3 = SelfAttention(d_model)

        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, src):
        """Transform ``src``; the output has the same shape as the input."""
        # Split along dim 0 into three nearly equal parts, the same partition
        # np.array_split produces, but staying in torch. The tensors remain
        # on their device and inside the autograd graph, so gradients from
        # the attention branch reach whatever produced `src`.
        # NOTE(review): dim 0 is the batch axis for the caller in this
        # notebook, so each attention module sees a different slice of the
        # batch rather than acting as a head over the same samples. Confirm
        # this is intended.
        part1, part2, part3 = torch.tensor_split(src, 3)

        src2 = torch.cat((
            self.self_attn1(part1),
            self.self_attn2(part2),
            self.self_attn3(part3),
        ))
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src

In [None]:
class Predictor(nn.Module):
    """Next-word predictor: embeddings + positions -> transformer layer -> logits.

    Shape legend: B = batch size, W = number of context words,
    E = embedding_dim, V = num_embeddings (vocabulary size).

    :param num_embeddings: vocabulary size V.
    :param embedding_dim: embedding size E.
    :param context_words: window length W; sizes the position table.
    """

    def __init__(self, num_embeddings, embedding_dim, context_words=sequence_length):
        super().__init__()
        # padding_idx=0: the embedding row for id 0 receives no gradient.
        self.emb = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
        self.lin = nn.Linear(embedding_dim, num_embeddings, bias=False)
        self.att = TransformerLayer(embedding_dim)
        # Learned position table of shape (W, E); allocated uninitialised and
        # filled by the Xavier init right below.
        position_table = torch.empty(context_words, embedding_dim)
        self.position_embedding = nn.Parameter(position_table)
        nn.init.xavier_uniform_(self.position_embedding)

        # NOTE(review): lin2 and att2 are registered (and so appear in the
        # state_dict) but forward() never calls them.
        self.lin2 = nn.Linear(embedding_dim, num_embeddings, bias=False)
        self.att2 = TransformerLayer(embedding_dim)

    def forward(self, input):
        """Map word ids of shape (B, W) to vocabulary scores of shape (B, V)."""
        embedded = self.emb(input)                          # (B, W, E)
        positioned = embedded + self.position_embedding     # (B, W, E)
        attended = self.att(positioned)                     # (B, W, E)
        pooled = attended.sum(dim=1)                        # (B, E)
        return self.lin(pooled)                             # (B, V)


In [None]:
# Build the predictor over the full vocabulary and move its parameters to the selected device.
model = Predictor(len(vocabulary), embedding_dim).to(device)

In [None]:
def train(model, criterion, optimizer, n_epochs, device, show_every_n_batches=100):
    """Run one pass over the global ``train_loader`` and report the results.

    Only full batches are used: iteration stops after
    ``len(train_loader.dataset) // batch_size`` batches (``batch_size`` is the
    notebook-level global). Progress is printed after 200, 500 and every
    1000 updates.

    :param model: module mapping a batch of inputs to per-class scores.
    :param criterion: loss callable. The running loss is divided by the
        number of labels seen, so a sum-reduced loss gives a per-label mean.
    :param optimizer: optimizer stepping the model's parameters.
    :param n_epochs: not read by this function (kept for call compatibility).
    :param device: not read by this function (batches are used as yielded).
    :param show_every_n_batches: not read by this function.
    :return: tuple ``(accuracy_percent, mean_loss)``, or ``(0.0, 0.0)`` when
        no batch was processed.
    """
    model.train()
    total_loss = 0
    ncorrect = 0
    ntokens = 0
    niterations = 0
    # Loop-invariant, so computed once instead of on every iteration.
    n_batches = len(train_loader.dataset) // batch_size
    for batch_i, (inputs, labels) in enumerate(train_loader, 1):
        if batch_i > n_batches:
            break
        model.zero_grad()

        output = model(inputs)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        # Training statistics
        total_loss += loss.item()
        ncorrect += (torch.max(output, 1)[1] == labels).sum().item()
        ntokens += labels.numel()
        niterations += 1
        if niterations == 200 or niterations == 500 or niterations % 1000 == 0:
            print(f'Train: wpb={ntokens//niterations}, num_updates={niterations}, accuracy={100*ncorrect/ntokens:.1f}, loss={total_loss/ntokens:.2f}')

    if ntokens == 0:
        # Fewer samples than one full batch: nothing was trained, and the
        # averages below would divide by zero.
        return 0.0, 0.0

    total_loss = total_loss / ntokens
    accuracy = 100 * ncorrect / ntokens

    return accuracy, total_loss

In [None]:
# Adam over all model parameters, using the configured learning rate rather
# than the library default.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Sum-reduced loss: train() divides by the number of labels itself.
criterion = nn.CrossEntropyLoss(reduction='sum')

train_accuracy = []
wiki_accuracy = []
valid_accuracy = []
for epoch_i in range(1, num_epochs + 1):
    # Arguments follow train()'s signature:
    # (model, criterion, optimizer, n_epochs, device, show_every_n_batches).
    acc, loss = train(model, criterion, optimizer, num_epochs, device, show_every_n_batches)
    train_accuracy.append(acc)
    print(f'| epoch {epoch_i:03d} | train accuracy={acc:.1f}%, train loss={loss:.2f}')

# Save model
torch.save(model.state_dict(), 'the_office_scripts')
print('Model Trained and Saved')

Train: wpb=400, num_updates=200, accuracy=27.5, loss=5.51
Train: wpb=400, num_updates=500, accuracy=34.2, loss=4.71
Train: wpb=400, num_updates=1000, accuracy=38.9, loss=4.21
Train: wpb=400, num_updates=2000, accuracy=42.1, loss=3.83
Train: wpb=400, num_updates=3000, accuracy=43.3, loss=3.68
| epoch 001 | train accuracy=43.8%, train loss=3.64
Train: wpb=400, num_updates=200, accuracy=49.1, loss=2.96
Train: wpb=400, num_updates=500, accuracy=48.6, loss=2.97
Train: wpb=400, num_updates=1000, accuracy=48.5, loss=2.98
Train: wpb=400, num_updates=2000, accuracy=48.2, loss=2.97
Train: wpb=400, num_updates=3000, accuracy=47.9, loss=3.00
| epoch 002 | train accuracy=47.8%, train loss=3.00
Train: wpb=400, num_updates=200, accuracy=50.3, loss=2.76
Train: wpb=400, num_updates=500, accuracy=49.7, loss=2.76
Train: wpb=400, num_updates=1000, accuracy=49.5, loss=2.77
Train: wpb=400, num_updates=2000, accuracy=49.1, loss=2.77
Train: wpb=400, num_updates=3000, accuracy=48.7, loss=2.80
| epoch 003 | tra

In [None]:
import torch.nn.functional as F

def generate(model, prime_id, int_to_vocab, token_dict, pad_value, predict_len=100,
             seq_len=None, top_k=5):
    """
    Generate text using the neural network
    :param model: The PyTorch Module that holds the trained neural network
    :param prime_id: The word id to start the first prediction
    :param int_to_vocab: Dict of word id keys to word values
    :param token_dict: Dict of punctuation keys to punctuation token values
    :param pad_value: The value used to pad a sequence
    :param predict_len: The length of text to generate
    :param seq_len: Length of the input window; defaults to the notebook-level
        ``sequence_length``
    :param top_k: Number of most likely words to sample from at each step
    :return: The generated text
    """
    model.eval()

    if seq_len is None:
        seq_len = sequence_length

    # Run on whatever device the model's parameters live on.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    # create a sequence (batch_size=1) of padding that ends with the prime_id
    current_seq = torch.full((1, seq_len), int(pad_value), dtype=torch.long, device=run_device)
    current_seq[0, -1] = int(prime_id)
    predicted = [int_to_vocab[prime_id]]

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for _ in range(predict_len):
            # get the model's scores for the next word
            output = model(current_seq)

            # get the next word probabilities
            p = F.softmax(output, dim=1)

            # keep the top_k candidates (never more than the vocabulary holds)
            p, top_i = p.topk(min(top_k, p.size(1)))
            top_i = top_i.cpu().numpy().reshape(-1)
            p = p.cpu().numpy().reshape(-1)

            # select the likely next word index with some element of randomness
            word_i = int(np.random.choice(top_i, p=p / p.sum()))

            # retrieve that word from the dictionary
            predicted.append(int_to_vocab[word_i])

            # slide the window left by one and append the generated word
            current_seq = torch.roll(current_seq, -1, 1)
            current_seq[0, -1] = word_i

    gen_sentences = ' '.join(predicted)

    # Replace punctuation tokens (with their leading space) by the symbol
    for key, token in token_dict.items():
        gen_sentences = gen_sentences.replace(' ' + token.lower(), key)
    gen_sentences = gen_sentences.replace('\n ', '\n')
    gen_sentences = gen_sentences.replace('( ', '(')
    gen_sentences = gen_sentences.replace('  ', ' ')

    # return all the sentences
    return gen_sentences

In [None]:
# Number of words to generate after the prime word.
gen_length = 500 # modify the length to your preference
# First word of the generated script; it is looked up in vocab_to_int below.
prime_word = 'Michael' # name for starting the script

# Punctuation symbol -> placeholder token, passed to generate() to turn the
# placeholders in its output back into symbols.
token_dict= {
          '.':"||period||",
          ",":"||comma||",
          '"':"||doublequotationmark||",
          "'":"||quotationmark||",
          "?":"||questionmark||",
          "-":"||dash||",
          "\n":"||return||",
          ")":"||rightparentheses||",
          "(":"||leftparentheses||",
          "[":"||leftbracket||",
          "]":"||rightbracket||",
          ";":"||semicolon||",
          "!":"||exclamationmark||",
          ":": "||colon||",
          "%":"||percent||",
          "$":"||dollar||",
          "#":"||hashtag||",
          "/":"||forwardbar||"
      }

# The empty-string token is used as padding; its id is looked up in
# vocab_to_int, so it must be present in the vocabulary.
pad_word= ''
generated_script = generate(model, vocab_to_int[prime_word], int_to_vocab, token_dict, vocab_to_int[pad_word], gen_length)
print(generated_script)

Michael, I' d have to go to see it. I can do it for the altar. 

Pam: [ whispering] [ to hug] 

Kevin: [ laughing] I' m sorry. I' m gonna be happy. 

Dwight: I' m sorry, I' m gonna go ahead! 

Jim: I know what I' d say. 

Jim: I don' t know. 

Andy: [ dancing] ! 

Jim: Oh, I' m sorry, it' s really really really really. I' d love you to do anything but, and then I' ve been here. 

Dwight: Okay, well, it' s like a little dehydrated. It' s really good. 

Pam: [ dancing] I love your bride. 

Dwight: It' s just fine . . . . . . um. . . . . . . it' s Athleap. . . . . . . it was really nice. It' s like a fairy tale. It' s just like a fairy tale. [ applause] Oh, well, well, you' re gonna get it for me to the altar and I didn' t know. I just feel better. I can' t do anything. 

Jim: [ to hug] Oh my second? 

Dwight: No, no. 

Jim: I don' t want to see this. 

Dwight: [ laughs] I don' t wanna go to leave this. 

Jim: Oh, no, no. It was just so you had to be happy. 

Dwight: [ laughs] 

Pam: Okay