In [1]:
%load_ext autoreload
%autoreload 2
import os
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.optim import Adam
import re
import spacy

import warnings
warnings.filterwarnings("ignore")

import logging
logging.basicConfig(level=logging.DEBUG)

from tqdm import tqdm
import os

# Choose the compute device once and report which one is in use.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("running on gpu!!!" if device.type == "cuda" else "cpu :(")

# When the COLAB_GPU environment variable is present, mount Google Drive and
# put the project folder on sys.path so the local `utils` module can be found.
if 'COLAB_GPU' in os.environ:
    print("running on colab")
    import sys
    from google.colab import drive
    drive.mount('/content/drive')
    sys.path.append("/content/drive/MyDrive/Colab Notebooks/shakespeare")

# Project-local training helper; must be imported after the sys.path tweak above.
from utils import ModelJob

# Re-evaluated here exactly as in the original cell layout.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


class ShakespeareDataset(Dataset):
    """Character-level dataset built from a DataFrame with a text column ``name``.

    The text is lower-cased, newlines become spaces, and every character that
    is not a word character, ``+`` or whitespace is removed.  Characters are
    mapped to integer ids starting at 1; id 0 is left free for padding, which
    is why ``vocab_size`` is ``len(vocab) + 1``.

    Modes:
      * ``"train"``   - the vocabulary is built from ``df``.
      * ``"predict"`` - each row becomes a single id sequence (no windowing);
                        ``vocab`` / ``vocab_ctoi`` must be supplied.
      * anything else - windowed like training, using the supplied vocabulary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column ``name``.  It is copied, never modified.
    seq_len : int
        Number of input characters per window (each window stores
        ``seq_len + 1`` ids so inputs and shifted targets can be sliced out).
    mode : str or None
        See above.
    vocab : dict or None
        char -> id mapping, required unless ``mode == "train"``.
    vocab_ctoi : dict or None
        id -> char mapping that accompanies ``vocab``.

    Raises
    ------
    ValueError
        If ``mode`` is not ``"train"`` and no ``vocab`` is given.
    """

    def __init__(self, df, seq_len, mode=None, vocab=None, vocab_ctoi=None):
        # Work on a private copy: the caller's frame (often a slice of a larger
        # frame) must not gain columns or have its text rewritten.
        self.df = df.copy()
        self.seq_len = seq_len
        self.mode = mode

        # regex flags are explicit because the pandas default for
        # Series.str.replace changed from regex=True to regex=False.
        self.df["name"] = (
            self.df["name"]
            .str.lower()
            .str.replace("\n", " ", regex=False)
            .str.replace(r"[^\w+\s]", "", regex=True)
            .fillna("")  # missing / non-string rows become empty text
        )

        if self.mode == "train":
            self.vocab, self.vocab_ctoi = self.build_vocab()
        else:
            if vocab is None:
                raise ValueError(
                    "vocab must be provided when mode is not 'train'"
                )
            self.vocab = vocab
            self.vocab_ctoi = vocab_ctoi

        # Characters missing from the vocabulary are silently dropped.
        self.df["input_idx"] = self.df["name"].apply(
            lambda text: [self.vocab[ch] for ch in text if ch in self.vocab]
        )
        self.vocab_size = len(self.vocab) + 1

        if self.mode != "predict":
            self.df["sequences"] = self.df["input_idx"].apply(self.create_sequence)
            # explode() turns an empty list into NaN; drop those rows so that
            # len() below is only ever applied to real windows.
            self.df_sequences = (
                self.df.loc[:, ["sequences"]]
                .explode("sequences")
                .dropna(subset=["sequences"])
                .reset_index(drop=True)
            )
            # A window of k ids yields k - 1 (input, target) positions.
            self.df_sequences["len"] = self.df_sequences["sequences"].apply(len) - 1
        else:
            self.df = self.df.rename(columns={"input_idx": "sequences"})
            self.df_sequences = self.df
            self.df_sequences["len"] = self.df_sequences["sequences"].apply(len)

        # Zero-length entries cannot be packed by the RNN, so discard them.
        self.df_sequences = self.df_sequences.loc[self.df_sequences["len"] > 0]

    def create_sequence(self, row):
        """Return every window ``row[i:i + seq_len + 1]`` for ``i`` in ``range(len(row))``.

        Windows near the end of ``row`` are shorter than ``seq_len + 1``.
        """
        window = self.seq_len + 1
        return [row[start:start + window] for start in range(len(row))]

    def build_vocab(self):
        """Build ``(char -> id, id -> char)`` maps from the cleaned text.

        Ids start at 1 and follow the sorted order of the distinct characters.
        """
        chars = sorted(set("".join(self.df["name"])))
        vocab = {char: idx for idx, char in enumerate(chars, start=1)}
        vocab_ctoi = {idx: char for char, idx in vocab.items()}
        return vocab, vocab_ctoi

    def __len__(self):
        """Number of usable sequences."""
        return self.df_sequences.shape[0]

    def __getitem__(self, idx):
        """Return one sample.

        Train/test: a dict with inputs ``X``, their length ``len`` and the
        targets ``y`` (the same window shifted by one position).
        Predict: a ``(tensor on the module-level device, length)`` tuple.
        """
        row = self.df_sequences.iloc[idx]
        if self.mode != "predict":
            ids = row["sequences"]
            return {"X": ids[:-1],
                    "len": row["len"],
                    "y": ids[1:]}
        return torch.tensor(row["sequences"]).to(device), row["len"]

def collate_fn(batch, mode):
    """Collate dataset samples into a batch.

    Parameters
    ----------
    batch : list
        Samples from the dataset.  Each sample is either a dict with keys
        ``"X"``, ``"len"`` (and ``"y"`` outside predict mode), or a
        ``(sequence, length)`` tuple as produced by the dataset in predict mode.
    mode : str
        ``"predict"`` returns the un-padded inputs; any other value returns
        padded inputs and flattened padded targets.

    Returns
    -------
    predict : ``(list of 1-D tensors, list of lengths)``
    otherwise : ``(padded_X on device, list of lengths, padded_y flattened on device)``
        Padding uses id 0 for both inputs and targets.
    """
    def _split(row):
        # Dict rows come from train/test mode, tuple rows from predict mode.
        if isinstance(row, dict):
            return row["X"], row["len"]
        return row[0], row[1]

    pairs = [_split(row) for row in batch]
    # as_tensor leaves existing tensors untouched instead of copying them.
    X = [torch.as_tensor(seq) for seq, _ in pairs]
    lens = [length for _, length in pairs]

    if mode == "predict":
        return X, lens

    padded_X = pad_sequence(X, batch_first=True, padding_value=0)
    y = [torch.as_tensor(row["y"]) for row in batch]
    padded_y = pad_sequence(y, batch_first=True, padding_value=0)
    # pad_sequence already returns a tensor; flatten it so it lines up with
    # logits that have been flattened to one row per time step.
    return padded_X.to(device), lens, padded_y.to(device).reshape(-1)

class LanguageModel(nn.Module):
    """Embedding -> single-layer unidirectional GRU -> dropout -> two linear layers.

    The final layer emits one logit per vocabulary id.
    """

    def __init__(self, vocab_size, embedding_dim,
                 hidden_size):
        super().__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        half_hidden = hidden_size // 2

        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embedding_dim)
        self.rnn = nn.GRU(input_size=embedding_dim,
                          hidden_size=hidden_size,
                          bidirectional=False,
                          batch_first=True)
        self.dropout = nn.Dropout(p=0.2)
        self.fc_1 = nn.Linear(in_features=hidden_size, out_features=half_hidden)
        self.fc = nn.Linear(in_features=half_hidden, out_features=vocab_size)

    def forward(self, x, h_t_1=None, mode="train"):
        """Run the model on a padded batch.

        ``x`` is ``(seq, lens)`` or ``(seq, lens, state)``; the third element
        is accepted but not used.  ``h_t_1`` is an optional initial hidden
        state for the GRU.

        Returns ``(logits, h_t, out_lengths)``.  With ``mode == "train"`` the
        logits have one row per (batch, step) position; for any other mode
        they are reshaped back to (batch, step, vocab).
        """
        if len(x) != 2:
            seq, lens, _unused_state = x
        else:
            seq, lens = x

        packed = pack_padded_sequence(self.embedding(seq),
                                      lengths=lens,
                                      batch_first=True,
                                      enforce_sorted=False)
        # Passing None as the initial state is the same as omitting it.
        rnn_out, h_t = self.rnn(packed, h_t_1)
        output, out_lengths = pad_packed_sequence(rnn_out, batch_first=True)
        output = self.dropout(output)

        n_rows, n_steps = output.shape[0], output.shape[1]
        hidden = F.relu(self.fc_1(output.reshape(n_rows * n_steps, -1)))
        logits = self.fc(hidden)

        if mode == "train":
            return logits, h_t, out_lengths
        return logits.contiguous().view(n_rows, n_steps, -1), h_t, out_lengths


if __name__ == "__main__":
    # Data and checkpoint locations differ between a local run and Colab.
    if "COLAB_GPU" not in os.environ:
        logging.info("Running on local")
        data_path = "data/shakespeare/sonnet.txt"
        model_save_path = "models/"
    else:
        logging.info("Running on colab")
        data_path = "/content/drive/MyDrive/Colab Notebooks/shakespeare/sonnet.txt"
        model_save_path = "/content/drive/MyDrive/Colab Notebooks/shakespeare/models/"

    # One text line per row, shuffled with a fixed seed, then split 80/20.
    df = pd.read_table(data_path, header=None, names=["name"])
    df = df.sample(n=len(df), random_state=9)
    df_train = df.iloc[:int(0.8 * len(df))]
    df_test = df.iloc[int(0.8 * len(df)):]
    logging.info(f"Dataframe shapes: {df_train.shape}, {df_test.shape}")

    # The test split reuses the vocabulary learned from the training split.
    train_ds = ShakespeareDataset(df=df_train, seq_len=10, mode="train")
    test_ds = ShakespeareDataset(
        df=df_test,
        seq_len=10,
        mode="test",
        vocab=train_ds.vocab,
        vocab_ctoi=train_ds.vocab_ctoi,
    )

train_dl = DataLoader(
    dataset=train_ds,
    batch_size=256,
    shuffle=True,
    collate_fn=lambda batch: collate_fn(batch, mode="train"),
)
test_dl = DataLoader(
    dataset=test_ds,
    batch_size=256,
    shuffle=True,
    collate_fn=lambda batch: collate_fn(batch, mode="test"),
)

model = LanguageModel(vocab_size=train_ds.vocab_size, embedding_dim=100, hidden_size=128)
model = model.to(device)

# Targets equal to 0 are padding and are excluded from the loss.
loss_func = nn.CrossEntropyLoss(ignore_index=0)
optimizer = Adam(params=model.parameters(), lr=0.001)

model_run = ModelJob(
    model=model,
    dataloaders={"train": train_dl, "test": test_dl},
    model_save_path=model_save_path,
    criterion=loss_func,
    optimizer=optimizer,
    n_epochs=30,
    phases=["train", "test"],
    model_save_name="language_model_sp_gru.pth",
)
model_run.train_step()

INFO:root:Running on local
INFO:root:Dataframe shapes: (3694, 1), (924, 1)


cpu :(
cpu :(
EPOCH: 1 out of 30
||||||||||||||||||||||||||||
	MODE: train : LOSS: 2.0143234729766846 : ACCURACY: 0.3481147289276123
|||||||
	MODE: test : LOSS: 1.7622320652008057 : ACCURACY: 0.4081234335899353
Best epoch : 1
Saving model : language_model_sp_gru.pth at models/
EPOCH: 2 out of 30
||||||||||||||||||||||||||||
	MODE: train : LOSS: 1.730285882949829 : ACCURACY: 0.4083976745605469
|||||||
	MODE: test : LOSS: 1.6741187572479248 : ACCURACY: 0.42755380272865295
Best epoch : 2
Saving model : language_model_sp_gru.pth at models/
EPOCH: 3 out of 30
||||||||||||||||||||||||||||
	MODE: train : LOSS: 1.6652040481567383 : ACCURACY: 0.42180824279785156
|||||||
	MODE: test : LOSS: 1.6366883516311646 : ACCURACY: 0.4350336492061615
Best epoch : 3
Saving model : language_model_sp_gru.pth at models/
EPOCH: 4 out of 30
||||||||||||||||||||||||||||
	MODE: train : LOSS: 1.628257155418396 : ACCURACY: 0.4302244186401367
|||||||
	MODE: test : LOSS: 1.6115983724594116 : ACCURACY: 0.44062280654907

In [2]:
def generate_text(model, seed_text, seq_len, temperature):
    """Extend ``seed_text`` by sampling ``seq_len`` characters, one at a time.

    Relies on the notebook-level ``train_ds`` (for the vocabulary) and
    ``model_run`` (for ``predict_step``).

    Parameters
    ----------
    model : torch.nn.Module
        Model to switch to eval mode before sampling.
    seed_text : str
        Starting text; it is cleaned and encoded by ``ShakespeareDataset``.
    seq_len : int
        Number of characters to append.
    temperature : float
        Divisor applied to the logits before the softmax; must be > 0.
        Lower values make sampling greedier.

    Returns
    -------
    str
        ``seed_text`` followed by the generated characters.

    Raises
    ------
    ValueError
        If ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    model.eval()
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for idx in range(seq_len):
            df_text = pd.DataFrame([seed_text], columns=["name"])

            # seq_len is not used for windowing in predict mode; the whole
            # text becomes a single sequence.
            predict_ds = ShakespeareDataset(df=df_text, seq_len=10,
                                            mode="predict",
                                            vocab=train_ds.vocab,
                                            vocab_ctoi=train_ds.vocab_ctoi)

            # Single-row dataset: shuffling would only consume RNG state.
            predict_dl = DataLoader(dataset=predict_ds,
                                    batch_size=1,
                                    shuffle=False)

            # NOTE(review): from the second step on, the previous hidden state
            # is passed while the full text is fed again, so earlier characters
            # may be consumed twice - confirm against ModelJob.predict_step.
            if idx == 0:
                logits, state, out_lens = model_run.predict_step(predict_dl, if_y=False)
            else:
                logits, state, out_lens = model_run.predict_step(predict_dl, h_t_1=state,
                                                                 if_y=False, mode="predict")

            # Id 0 is the padding slot and has no entry in vocab_ctoi, so
            # sample only over ids 1.. and shift the result back by one.
            char_logits = logits[0, -1][1:] / temperature
            probs = F.softmax(char_logits, dim=-1)
            predicted_char_idx = torch.multinomial(probs, num_samples=1).item() + 1
            predicted_char = train_ds.vocab_ctoi[predicted_char_idx]
            seed_text = seed_text + predicted_char
    return seed_text

# Sample 50 characters at temperature 0.9 for each seed word and print the result.
for seed in ("time", "bless", "think", "thou", "tender"):
    generated_text = generate_text(
        model=model_run.model,
        seed_text=seed,
        seq_len=50,
        temperature=0.9,
    )
    print(f"seed: {seed}, generated: {generated_text}")

seed: time, generated: times foll rest sail features nights bear alleath thee
seed: bless, generated: blessest restand that lays i perfumes eye thee than hat
seed: think, generated: think do delight which i snot nor the selffied i have s
seed: thou, generated: thou perpose wishd all that tis are diseasd from my lo
seed: tender, generated: tender a true md against the thing more still are eyes u
