# Task1 Dataset Acquisition

The dataset that I choose is about Liverpool F.C. from wikipedia

In [1]:
import wikipediaapi #pip install wikipedia-api
import re

In [2]:
# en for English
wiki = wikipediaapi.Wikipedia('A2 (st124482@ait.asia)','en')
# load Liverpool F.C. content
page = wiki.page('Liverpool F.C.')
# page.text

In [3]:
# clean data
cleaned_text = page.text.replace('\n\n', ' ')
cleaned_text = cleaned_text.replace('\n', ' ')
cleaned_text = cleaned_text.replace('.  ', '.')
cleaned_text = cleaned_text.replace('.', '. ')



# delete some text that it not be a sentence
# find the index of "Minor titles"
start_index = cleaned_text.find("Minor titles")

# If "Minor titles" is found, slice the string to remove the content from that point onward
if start_index != -1:
    cleaned_text = cleaned_text[:start_index]

    # Print the cleaned text
    print(cleaned_text)
else:
    print("String 'Minor titles' not found.")




Liverpool Football Club is a professional football club based in Liverpool, England.  The club competes in the Premier League, the top tier of English football.  Founded in 1892, the club joined the Football League the following year and has played its home games at Anfield since its formation. Domestically, the club has won 19 league titles, eight FA Cups, a record nine League Cups and 16 FA Community Shields.  In international competitions, the club has won six European Cups, three UEFA Cups, four UEFA Super Cups—all English records—and one FIFA Club World Cup.  The club established itself as a major force in domestic and European football in the 1970s and 1980s, when Bill Shankly, Bob Paisley, Joe Fagan and Kenny Dalglish, led the club to a combined 11 League titles and four European Cups.  Liverpool won two further European Cups in 2005 and 2019 under the management of Rafael Benítez and Jürgen Klopp, respectively; the latter led Liverpool to a 19th league title in 2020, the club's

# Task2 Model Training

In [4]:
# import the neccessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import torchtext, math

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# check that I have cpu for train or not
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# same pattern when I restart the kernel
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True


cpu


### Task2.1 Preprocessing


Step1: Tokenize the text dataset to sentence and split the dataset on train, test, and validation set


In [6]:
import nltk
from sklearn.model_selection import train_test_split

In [7]:
# download pubkt which is pre-trained for tokenizing text into sentences
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\earth\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
sentences =nltk.sent_tokenize(cleaned_text)
# sentences

In [9]:
# set the random seed for reproducibility
random_seed = 42

# split the data into training, testing, and validation sets
train_data, test_data = train_test_split(sentences, test_size=0.15, random_state=random_seed)
train_data, val_data = train_test_split(train_data, test_size=0.15, random_state=random_seed)

# Print the sizes of the sets
print(f"Number of samples in training set: {len(train_data)}")
print(f"Number of samples in validation set: {len(val_data)}")
print(f"Number of samples in test set: {len(test_data)}")

Number of samples in training set: 187
Number of samples in validation set: 33
Number of samples in test set: 39


In [10]:
train_data[2]

'The last player to be transferred between the two clubs was Phil Chisnall, who moved to Liverpool from Manchester United in 1964.'

Step2: Tokenize the sentence on train, test, and validation dataset to tokens (individual) 

In [11]:
# tokenize to tranform sentence to tokens
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

# create function to tokenize the text
tokenize_data = lambda example, tokenizer: {'tokens': tokenizer(example)}  

# map the function to each example in the list
tokenized_train_data = list(map(lambda example: tokenize_data(example, tokenizer), train_data))
tokenized_test_data = list(map(lambda example: tokenize_data(example, tokenizer), test_data))
tokenized_val_data = list(map(lambda example: tokenize_data(example, tokenizer), val_data))
# tokenized_train_data 

Step3: Numericalizing <br>
create list that keep all vocab and add `unk` and `eos` to that list

In [12]:
# create nested list that is suitable format for build_vocab_from_iterator function 
tokenized_train_dataset = [entry['tokens'] for entry in tokenized_train_data]
tokenized_test_dataset = [entry['tokens'] for entry in tokenized_test_data]
tokenized_val_dataset = [entry['tokens'] for entry in tokenized_val_data]
# tokenized_train_dataset

In [13]:
vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_train_dataset)
vocab.insert_token('<unk>', 0)
vocab.insert_token('<eos>', 1) #for print next word
vocab.set_default_index(vocab['<unk>']) #word that not in vocab tranfer to <unk>

In [14]:
# save the vocabulary to a file
torch.save(vocab, 'vocab.pt')

In [15]:
# print the number of vocab in total
print(len(vocab))

1211


In [16]:
# print 10 vocabs
print(vocab.get_itos()[:10])

['<unk>', '<eos>', 'the', ',', '.', 'in', 'and', 'of', 'liverpool', 'to']


Step4: Prepare data by separate data to batch

For example, I have 2 sentences which are "We have to go now" and "we will go to supermarket" . I assign batch size = 3, we will get three batches of data "We have to go", "now `<eos>` we will", "go to supermarket `<eos>`".  

In [17]:
# function for split the dataset on batch
def get_data(dataset, vocab, batch_size):
    data = []
    for example in dataset:
        if example:
            tokens = example.append('<eos>') # add <eos> to the end of each sentence
            tokens = [vocab[token] for token in example] # change each word to number
            data.extend(tokens)
    data = torch.LongTensor(data)
    num_batches = data.shape[0] // batch_size
    data = data[:num_batches * batch_size] # to make sure that every batch is equal
    data = data.view(batch_size, num_batches) #reshape 
    return data #[batch size, seq len]

In [18]:
batch_size = 16
train_data = get_data(tokenized_train_dataset, vocab, batch_size)
valid_data = get_data(tokenized_val_dataset, vocab, batch_size)
test_data  = get_data(tokenized_test_dataset,  vocab, batch_size)

In [19]:
train_data

tensor([[   5,  494,    3,  ...,   98,  110,  483],
        [   4,    1,    5,  ...,  461,    4,    1],
        [   8,   14,   16,  ...,   77,  918,   17],
        ...,
        [ 113,   12,  591,  ...,    2,  288,   18],
        [   3,   43,  438,  ...,   17,    2,   10],
        [   9, 1012,    6,  ...,  898,   17,  712]])

In [20]:
train_data.shape

torch.Size([16, 297])

### Task2.2 Model architecture and the training process

<img src="LM.png" width=600>

For Language Model assignment, I create the LSTMLanguageModel class for build model. it consists of embedding layer to tranform the input tokens/words into number vector, multiple stacked LSTM layers to learn long-range dependencied in sequential data, dropout layers to cut out some data to prevent overfitting, and linear layer to tranforms the output from LSTM layer into prediction of next word of the sequence.

In [21]:
# create model 
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
        super().__init__()
        self.num_layers = num_layers
        self.hid_dim    = hid_dim
        self.emb_dim    = emb_dim
        
        self.embedding  = nn.Embedding(vocab_size, emb_dim)
        self.lstm       = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, dropout=dropout_rate, batch_first=True)
        self.dropout    = nn.Dropout(dropout_rate)
        self.fc         = nn.Linear(hid_dim, vocab_size) # fc is the last layer for 
        
        self.init_weights()
    
    # function for assigning the initial weight of W_e, W_h
    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hid_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_emb)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.emb_dim,
                self.hid_dim).uniform_(-init_range_other, init_range_other) #W_e
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hid_dim,   
                self.hid_dim).uniform_(-init_range_other, init_range_other) #W_h
    
    # reset hidden
    def init_hidden(self, batch_size, device): 
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        return hidden, cell
        
    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach() 
        cell   = cell.detach()
        return hidden, cell
        
    def forward(self, src, hidden):
        #src: [batch_size, seq len]
        embedding = self.dropout(self.embedding(src)) #Liverpool is
        #embedding: [batch-size, seq len, emb dim]
        output, hidden = self.lstm(embedding, hidden)
        #ouput: [batch size, seq len, hid dim]
        #hidden: [num_layers * direction, seq len, hid_dim]
        output = self.dropout(output)
        prediction =self.fc(output)
        #prediction: [batch_size, seq_len, vocab_size]
        return prediction, hidden

In [22]:
# assign the parameters
vocab_size = len(vocab)
emb_dim = 1024                
hid_dim = 50                
num_layers = 1               
dropout_rate = 0.5             
lr = 1e-3                     

In [23]:
model      = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer  = optim.Adam(model.parameters(), lr=lr)
criterion  = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')



The model has 1,517,025 trainable parameters


In [24]:
# the function is used for getting the input and output batch for training process
def get_batch(data, seq_len, idx):
    #data #[batch size, bunch of tokens]
    src    = data[:, idx:idx+seq_len]                   
    target = data[:, idx+1:idx+seq_len+1]  #target simply is ahead of src by 1            
    return src, target

In [25]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    
    epoch_loss = 0
    model.train()

    # drop all batches that are not a multiple of seq_len
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]
    
    #reset the hidden every epoch
    hidden = model.init_hidden(batch_size, device)
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):
        optimizer.zero_grad() #clear all gradient
        
        #hidden does not need to be in the computational graph for efficiency
        hidden = model.detach_hidden(hidden)

        #get the input and output batch
        src, target = get_batch(data, seq_len, idx) 
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        #put it on LSTM model that I created and printthe prediction
        prediction, hidden = model(src, hidden)               

        #need to reshape because criterion expects pred to be 2d and target to be 1d
        prediction = prediction.reshape(batch_size * seq_len, -1)
        target = target.reshape(-1)
        loss = criterion(prediction, target)
        
        loss.backward()
        #clipping to make gradient smaller to prevent exploding gradient 
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        #update the model parameter
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    # average the training loss
    return epoch_loss / num_batches

In [26]:
# function for evaluate the model with validation dataset
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len] 
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    # average the validation loss
    return epoch_loss / num_batches

In [27]:
# training
n_epochs = 50
seq_len  = 30 #<----decoding length
clip    = 0.25

# to reduce the learning rate when the loss is not improve
lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

best_valid_loss = float('inf')

for epoch in range(n_epochs):
    train_loss = train(model, train_data, optimizer, criterion, 
                batch_size, seq_len, clip, device)
    valid_loss = evaluate(model, valid_data, criterion, batch_size, 
                seq_len, device)

    lr_scheduler.step(valid_loss)

    # save the model if the validation loss of model is improve
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best-val-lstm_lm.pt')

    #print the train and validation Perplexity (lower, better)
    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')

                                                       

	Train Perplexity: 1142.536
	Valid Perplexity: 901.910


                                                       

	Train Perplexity: 1032.475
	Valid Perplexity: 809.273


                                                       

	Train Perplexity: 817.584
	Valid Perplexity: 614.836


                                                       

	Train Perplexity: 544.780
	Valid Perplexity: 449.423


                                                       

	Train Perplexity: 404.232
	Valid Perplexity: 375.537


                                                       

	Train Perplexity: 334.085
	Valid Perplexity: 350.076


                                                       

	Train Perplexity: 304.907
	Valid Perplexity: 343.530


                                                       

	Train Perplexity: 283.143
	Valid Perplexity: 342.620


                                                       

	Train Perplexity: 266.303
	Valid Perplexity: 340.896


                                                       

	Train Perplexity: 256.834
	Valid Perplexity: 334.361


                                                       

	Train Perplexity: 241.427
	Valid Perplexity: 327.129


                                                       

	Train Perplexity: 223.366
	Valid Perplexity: 319.450


                                                       

	Train Perplexity: 216.301
	Valid Perplexity: 311.721


                                                       

	Train Perplexity: 200.075
	Valid Perplexity: 304.044


                                                       

	Train Perplexity: 190.555
	Valid Perplexity: 297.324


                                                       

	Train Perplexity: 178.566
	Valid Perplexity: 292.302


                                                       

	Train Perplexity: 168.387
	Valid Perplexity: 286.887


                                                       

	Train Perplexity: 159.368
	Valid Perplexity: 282.324


                                                       

	Train Perplexity: 149.912
	Valid Perplexity: 279.102


                                                       

	Train Perplexity: 146.034
	Valid Perplexity: 274.596


                                                       

	Train Perplexity: 135.602
	Valid Perplexity: 270.942


                                                       

	Train Perplexity: 130.962
	Valid Perplexity: 266.916


                                                       

	Train Perplexity: 125.794
	Valid Perplexity: 262.821


                                                       

	Train Perplexity: 119.092
	Valid Perplexity: 260.614


                                                       

	Train Perplexity: 114.907
	Valid Perplexity: 259.756


                                                       

	Train Perplexity: 110.593
	Valid Perplexity: 257.258


                                                       

	Train Perplexity: 106.864
	Valid Perplexity: 254.948


                                                       

	Train Perplexity: 101.389
	Valid Perplexity: 252.320


                                                       

	Train Perplexity: 98.313
	Valid Perplexity: 249.898


                                                       

	Train Perplexity: 94.306
	Valid Perplexity: 249.145


                                                       

	Train Perplexity: 90.349
	Valid Perplexity: 246.677


                                                       

	Train Perplexity: 86.481
	Valid Perplexity: 246.584


                                                       

	Train Perplexity: 85.191
	Valid Perplexity: 245.587


                                                       

	Train Perplexity: 84.559
	Valid Perplexity: 244.960


                                                       

	Train Perplexity: 82.153
	Valid Perplexity: 245.129


                                                       

	Train Perplexity: 81.158
	Valid Perplexity: 245.129


                                                       

	Train Perplexity: 80.718
	Valid Perplexity: 245.104


                                                       

	Train Perplexity: 79.249
	Valid Perplexity: 245.013


                                                       

	Train Perplexity: 79.796
	Valid Perplexity: 244.972


                                                       

	Train Perplexity: 79.440
	Valid Perplexity: 244.943


                                                       

	Train Perplexity: 78.488
	Valid Perplexity: 244.927


                                                       

	Train Perplexity: 79.227
	Valid Perplexity: 244.921


                                                       

	Train Perplexity: 79.550
	Valid Perplexity: 244.917


                                                       

	Train Perplexity: 79.859
	Valid Perplexity: 244.914


                                                       

	Train Perplexity: 79.737
	Valid Perplexity: 244.914


                                                       

	Train Perplexity: 78.715
	Valid Perplexity: 244.913


                                                       

	Train Perplexity: 79.719
	Valid Perplexity: 244.913


                                                       

	Train Perplexity: 79.447
	Valid Perplexity: 244.913


                                                       

	Train Perplexity: 79.074
	Valid Perplexity: 244.913


                                                       

	Train Perplexity: 78.796
	Valid Perplexity: 244.913




In [28]:
# test model with test dataset

model.load_state_dict(torch.load('best-val-lstm_lm.pt',  map_location=device))
test_loss = evaluate(model, test_data, criterion, batch_size, seq_len, device)
print(f'Test Perplexity: {math.exp(test_loss):.3f}')

Test Perplexity: 247.795


# Task 3 Demo function before creating deployment

In [29]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens] # tranforms word to number (index in vocabs)
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)
            
            #prediction: [batch size, seq len, vocab size]
            #prediction[:, -1]: [batch size, vocab size] #probability of last vocab
            
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  
            prediction = torch.multinomial(probs, num_samples=1).item()    
            
            while prediction == vocab['<unk>']: #if it is unk, we sample again
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:    #if it is eos, we stop
                break

            indices.append(prediction)

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [30]:
# load saved model
loaded_model = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate)
loaded_model.load_state_dict(torch.load('best-val-lstm_lm.pt'))
loaded_model.eval()



LSTMLanguageModel(
  (embedding): Embedding(1211, 1024)
  (lstm): LSTM(1024, 50, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=50, out_features=1211, bias=True)
)

In [36]:
prompt = 'Liverpool is'
max_seq_len = 30
seed = 0
# temperature = 1 # since we want the most make-sense sentence, temperature must highest which is 1

#smaller the temperature, more diverse tokens but comes 
#with a tradeoff of less-make-sense sentence
temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, loaded_model, tokenizer, 
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

0.5
liverpool is the club .

0.7
liverpool is the club and suspicion of the club , and , and overcrowding owner of the club ' s and everton .

0.75
liverpool is the club and suspicion of the fifa club has , and overcrowding owner of the club ' s and everton .

0.8
liverpool is the club and suspicion of his men , reuben , and overcrowding featuring could george increase of colour and everton also multiple-winner league days , as the all he to

1.0
liverpool is the club also suspicion of his men , reuben , and overcrowding featuring could george rivalries addorsed colour and everton also multiple-winner league days , as on all he football



In web interface, it need to show only one answer that is the sentence with highest temparature which is 1