In [1]:
!pip install rouge-score
!pip install evaluate
# Comment out below if on Colab
!pip install torch --index-url https://download.pytorch.org/whl/cu118
!pip install -U scikit-learn


Looking in indexes: https://download.pytorch.org/whl/cu118


In [2]:
import numpy as np
import pandas as pd
import random
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, Dataset, DataLoader, RandomSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import ConfusionMatrixDisplay as cmd
import matplotlib.pyplot as plt
import re
from rouge_score import rouge_scorer
#Reactivate if on Colab
#from google.colab import drive

In [3]:
gpu_available = torch.cuda.is_available()
if gpu_available:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [4]:
#Data Load
#Change or switch as required by your setup
#drive.mount('/content/drive')
#data = pd.read_csv('drive/My Drive/CS_539/bbc-news-data.csv', delimiter='\t')
#data = pd.read_csv('drive/My Drive/COMP SCI 539/bbc-news-data.csv', delimiter='\t')
data = pd.read_csv('./bbc-news-data.csv', delimiter='\t')

In [5]:
data.head(10)

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...
5,business,006.txt,Japan narrowly escapes recession,Japan's economy teetered on the brink of a te...
6,business,007.txt,Jobs growth still slow in the US,The US created fewer jobs than expected in Ja...
7,business,008.txt,India calls for fair trade rules,"India, which attends the G7 meeting of seven ..."
8,business,009.txt,Ethiopia's crop production up 24%,Ethiopia produced 14.27 million tonnes of cro...
9,business,010.txt,Court rejects $280bn tobacco case,A US government claim accusing the country's ...


In [6]:
print(data.isnull().sum())

# Data to lowercase
data["title"] = data["title"].str.lower()
data["content"] = data["content"].str.lower()

# List of contractions and acronyms to replace
contraction_dict = {"can't":"cannot","didn't":"did not","aren't":"are not","she'd":"she would","he'd":"he would","they'd":"they would","they've":"they have",
  "shouldn't":"should not","shouldn't've":"should not have","she'll":"she will","he'll":"he will","they'll":"they will", "ba's":"british airways",
  "g7":"group of seven"
}

category    0
filename    0
title       0
content     0
dtype: int64


In [7]:
# Preprocessing
def data_preprocessing(string, contraction_dict=contraction_dict):
  for word, replace in contraction_dict.items():
    string = string.replace(word, replace)
  string = re.sub(r"([.!?])", r" \1", string)
  string = re.sub(r"[^a-zA-Z!?]+", r" ", string)
  string = re.sub(r"\b(s )\b", r"", string)
  return string

max_len_content = 0
max_len_title = 0
for index in range(len(data)):
  data.loc[index,'title'] = data_preprocessing(data.loc[index,'title'])
  if len(data.loc[index,'title'].split()) > max_len_title:
    max_len_title = len(data.loc[index,'title'].split())
  data.loc[index,'content'] = data_preprocessing(data.loc[index,'content'])
  if len(data.loc[index,'content'].split()) > max_len_content:
    max_len_content = len(data.loc[index,'content'].split())


data.head(10)

Unnamed: 0,category,filename,title,content
0,business,001.txt,ad sales boost time warner profit,quarterly profits at us media giant timewarne...
1,business,002.txt,dollar gains on greenspan speech,the dollar has hit its highest level against ...
2,business,003.txt,yukos unit buyer faces loan claim,the owners of embattled russian oil giant yuk...
3,business,004.txt,high fuel prices hit british airways profits,british airways has blamed high fuel prices f...
4,business,005.txt,pernod takeover talk lifts domecq,shares in uk drinks and food firm allied dome...
5,business,006.txt,japan narrowly escapes recession,japan economy teetered on the brink of a tech...
6,business,007.txt,jobs growth still slow in the us,the us created fewer jobs than expected in ja...
7,business,008.txt,india calls for fair trade rules,india which attends the group of seven meetin...
8,business,009.txt,ethiopia crop production up,ethiopia produced million tonnes of crops in ...
9,business,010.txt,court rejects bn tobacco case,a us government claim accusing the country bi...


In [8]:
print(max_len_title, max_len_content)

9 4453


In [9]:
# Based on Transformer Tutorial
class convert:
  def __init__(self, category):
    self.category = category #title or content
    self.word_to_index = {"PAD": 0, "SOS": 1, "EOS": 2, "UNK": 3}
    self.index_to_word = {0: "PAD", 1: "SOS", 2: "EOS", 3: "UNK"}
    self.word_to_count = {}
    self.n_words = 4  # Count SOS and EOS


  def add_sentence(self, sentence):
    for word in sentence.split(' '):
      self.add_word(word)

  def add_word(self, word):
    if word not in self.word_to_index:
      self.word_to_index[word] = self.n_words
      self.word_to_count[word] = 1
      self.index_to_word[self.n_words] = word
      self.n_words += 1
    else:
      self.word_to_count[word] += 1

  def tokenize(self, sentence, seq_len=None):
    tokens_indexed = [self.word_to_index["SOS"]]

    for tkn in sentence.split():
      tokens_indexed.append(self.word_to_index[tkn if tkn in self.word_to_index else "UNK"])

    tokens_indexed.append(self.word_to_index["EOS"])

    # Pad or trim to desired lengh
    if seq_len is not None:
      if len(tokens_indexed) < seq_len:
        tokens_indexed += [self.word_to_index["PAD"]] * (seq_len - len(tokens_indexed))
      else:
         tokens_indexed = tokens_indexed[:seq_len]

    return tokens_indexed

  def list_to_sentence(self, seq_ids):
    return " ".join([self.index_to_word[idx] for idx in seq_ids])


title_vocab = convert("title")
content_vocab = convert("content")

for index in range(len(data)):
  title_vocab.add_sentence(data.loc[index,'title'])
  content_vocab.add_sentence(data.loc[index,'content'])

print(f"Title vocab contains {title_vocab.n_words} words.")
print(f"Content vocab contains {content_vocab.n_words} words.")

Title vocab contains 3686 words.
Content vocab contains 27771 words.


In [10]:
def data_loader(batch_size):
  n = 2225
  title_seqs_ids = torch.zeros((n, max_len_title)).long()
  content_seqs_ids = torch.zeros((n, max_len_content)).long()

  for index in range(len(data)):
    title_seqs_ids[index] = torch.tensor(title_vocab.tokenize(data.loc[index,'title'],seq_len=max_len_title))
    content_seqs_ids[index] = torch.tensor(content_vocab.tokenize(data.loc[index,'content'],seq_len=max_len_content))

  #Train_test_split
  X_train, X_test, y_train, y_test = train_test_split(content_seqs_ids,title_seqs_ids, train_size=0.6)


  train_dataset = TensorDataset(X_train.to(device), y_train.to(device))
  test_dataset = TensorDataset(X_test.to(device), y_test.to(device))

  training_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
  return training_loader, test_dataset


training_loader, test_data = data_loader(32)
for x,y in training_loader:
  print('Batch | Content =', x.shape, '| title =', y.shape)
  print('First sentence in contents: ', content_vocab.list_to_sentence(x[0].tolist()))
  print('First sentence in titles:', title_vocab.list_to_sentence(y[0].tolist()))
  break

Batch | Content = torch.Size([32, 4453]) | title = torch.Size([32, 9])
First sentence in contents:  SOS a group of mps and peers has called for a tightening of regulations controlling betting on sport the parliamentary group on betting and gaming held a substantial inquiry into betting last year it followed fears that a massive increase in betting on sport such as that done using the internet and mobile phones has led to more cheating the all party group recommended ways to protect punters and improve the integrity of sports betting they include a proposal for raising the maximum jail sentence for gambling cheats above the current two years lord condon head of the international cricket council anti corruption unit who originally made the call for longer prison sentences said the two year penalty was derisory you could get a bigger sentence for failing to pay your hotel bill criminally than you could for corruption in major sports symbolically a higher penalty perhaps as the bill passes

In [11]:
#Borrowed from Transformer Tutorial
def positional_encoding(length, depth):
  depth = depth/2

  positions = np.arange(length)[:, np.newaxis]     # (seq, 1)
  depths = np.arange(depth)[np.newaxis, :]/depth   # (1, depth)

  angle_rates = 1 / (10000**depths)         # (1, depth)
  angle_rads = positions * angle_rates      # (pos, depth)

  pos_encoding = np.concatenate(
    [np.sin(angle_rads), np.cos(angle_rads)],
    axis=-1)

  return pos_encoding

class word_pos_embedding(nn.Module):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        nn.init.normal_(self.embedding.weight, mean=0, std=0.01)
        self.pos_encoding = torch.Tensor(positional_encoding(length=2048, depth=d_model)).float().to(device)
        self.pos_encoding.requires_grad = False

    def compute_mask(self, *args, **kwargs):
        return self.embedding.compute_mask(*args, **kwargs)

    def forward(self, x):
        length = x.shape[1]
        x = self.embedding(x)
        # This factor sets the relative scale of the embedding and positonal_encoding.
        x *= (self.d_model ** 0.5)
        x = x + self.pos_encoding[None, :length, :]
        return x


embed_content = word_pos_embedding(vocab_size=content_vocab.n_words, d_model=512).to(device)
embed_title = word_pos_embedding(vocab_size=title_vocab.n_words, d_model=512).to(device)

#Testing Pos Embedding
title_sen = data.loc[1,'title']
title_seq = torch.tensor([title_vocab.word_to_index[w] for w in title_sen.split()]).unsqueeze(0)
print(title_seq.shape)
title_tkn_seq = embed_title(title_seq.to(device))
print(title_tkn_seq.shape)

content_sen = data.loc[1,'content']
content_seq = torch.tensor([content_vocab.word_to_index[w] for w in content_sen.split()]).unsqueeze(0)
print(content_seq.shape)
content_tkn_seq = embed_content(content_seq.to(device))
print(content_tkn_seq.shape)

torch.Size([1, 5])
torch.Size([1, 5, 512])
torch.Size([1, 382])
torch.Size([1, 382, 512])


In [12]:
#Encoder
"""
class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, dropout_rate=0.1):
        super().__init__()

        #self.embedding = word_pos_embedding(vocab_size, hidden_dim)
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(hidden_dim, hidden_dim, n_layers, batch_first=True, dropout=dropout_rate)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, input_batch):
        embed = self.dropout(self.embedding(input_batch))
        outputs, hidden = self.rnn(embed)

        return hidden """

'\nclass Encoder(nn.Module):\n    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, dropout_rate=0.1):\n        super().__init__()\n\n        #self.embedding = word_pos_embedding(vocab_size, hidden_dim)\n        self.embedding = nn.Embedding(vocab_size, embedding_dim)\n        self.rnn = nn.LSTM(hidden_dim, hidden_dim, n_layers, batch_first=True, dropout=dropout_rate)\n        self.dropout = nn.Dropout(dropout_rate)\n\n    def forward(self, input_batch):\n        embed = self.dropout(self.embedding(input_batch))\n        outputs, hidden = self.rnn(embed)\n\n        return hidden '

In [13]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, dropout_rate=0.1):
        super().__init__()

        self.hid_dim = hidden_dim
        self.num_layers = n_layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, batch_first=True, dropout=dropout_rate)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, inpt):
        embed = self.embedding(inpt)
        embed = self.dropout(embed)
        outputs, (hidden, cell) = self.rnn(embed)
        return hidden, cell

In [14]:
#Decoder
"""class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, dropout_rate=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        #self.embedding = word_pos_embedding(vocab_size, hidden_dim)
        self.rnn = nn.LSTM(hidden_dim, hidden_dim, n_layers, batch_first=True, dropout=dropout_rate)
        #self.rnn = nn.LSTM(hidden_dim, vocab_size, n_layers, batch_first=True, dropout=dropout_rate)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, target, hidden):
        x = self.embedding(target)
        x = self.dropout(x)
        x, (hidden, cell)= self.rnn(x, hidden)
        return x"""

'class Decoder(nn.Module):\n    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, dropout_rate=0.1):\n        super().__init__()\n        self.embedding = nn.Embedding(vocab_size, embedding_dim)\n        #self.embedding = word_pos_embedding(vocab_size, hidden_dim)\n        self.rnn = nn.LSTM(hidden_dim, hidden_dim, n_layers, batch_first=True, dropout=dropout_rate)\n        #self.rnn = nn.LSTM(hidden_dim, vocab_size, n_layers, batch_first=True, dropout=dropout_rate)\n        self.dropout = nn.Dropout(dropout_rate)\n\n    def forward(self, target, hidden):\n        x = self.embedding(target)\n        x = self.dropout(x)\n        x, (hidden, cell)= self.rnn(x, hidden)\n        return x'

In [15]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, n_layers, dropout_rate=0.1):
        super().__init__()
        
        self.output_dim = vocab_size
        self.hid_dim = hid_dim
        self.num_layers = n_layers
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, batch_first=True, dropout=dropout_rate)
        self.fc = nn.Linear(hid_dim, vocab_size)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, target, hidden, cell):
        #Shape of target: N to 1,N
        target = target.unsqueeze(0)
        x = self.embedding(target)
        x = self.dropout(x)
        x, hidden = self.rnn(x, (hidden, cell))
        out = self.fc(x)
        out = out.squeeze(0)
        return out, hidden

In [16]:
#Complete Model
class base_rnn(nn.Module):
    def __init__(self, hid_dim, embedding_dim, num_layers, input_vocab_size,
                 target_vocab_size, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = num_layers
        self.target_size = target_vocab_size
        self.input_size = input_vocab_size
        self.encoder = Encoder(input_vocab_size, embedding_dim, hid_dim,num_layers, dropout)
        self.decoder = Decoder(target_vocab_size, embedding_dim, hid_dim, num_layers, dropout)
        

    def forward(self, source, target, tfr = 0.5):
        
      batch_size = source.shape[0]
      target_len = target.shape[1]
      target_size = self.target_size
      hidden, cell = self.encoder(source)
      
      outputs = []
      x = target[0,0].unsqueeze(0)
      for i in range(0, target_len):
        output, (hidden,cell) = self.decoder(x, hidden, cell)
        outputs.append(output)#.argmax(1))
        #outputs.append(output)
        best_guess = output.argmax(1)
        target[0,i] if random.random() < tfr else best_guess

      return torch.transpose(torch.stack(outputs),0,1)

model = base_rnn( hid_dim = 512 ,embedding_dim = 512, num_layers=3,
                 input_vocab_size = content_vocab.n_words,
                 target_vocab_size= title_vocab.n_words,
                  dropout = 0.1).to(device)

for seq in test_data:
    title_seq = torch.reshape(seq[1],[1,9])
    content_seq = torch.reshape(seq[0],[1,4453])
    break

print('Batch of title Sentences:', title_seq.shape)
print('Batch of content paragraphs:', content_seq.shape)
#print('Output of Causal Self-Attention:', model(content_seq.to(device), title_seq.to(device)).shape)
mod_out = model(content_seq.to(device), title_seq.to(device))
print('Network shape: ', mod_out.shape)
print(mod_out)

Batch of title Sentences: torch.Size([1, 9])
Batch of content paragraphs: torch.Size([1, 4453])
Network shape:  torch.Size([1, 9, 3686])
tensor([[[ 0.0288,  0.0216,  0.0322,  ..., -0.0018, -0.0163,  0.0025],
         [ 0.0259,  0.0094,  0.0307,  ..., -0.0015, -0.0158,  0.0043],
         [ 0.0238,  0.0005,  0.0295,  ...,  0.0015, -0.0152,  0.0056],
         ...,
         [ 0.0235,  0.0032,  0.0333,  ...,  0.0061, -0.0066,  0.0101],
         [ 0.0276,  0.0009,  0.0328,  ...,  0.0092, -0.0099,  0.0135],
         [ 0.0326,  0.0027,  0.0380,  ...,  0.0090, -0.0110,  0.0155]]],
       device='cuda:0', grad_fn=<TransposeBackward0>)


In [17]:
out_seq = []
for i in range(len(mod_out[0])):
    out_seq.append(mod_out[0][i].argmax(0).tolist())
print(title_vocab.list_to_sentence(title_seq.tolist()[0]))
title_vocab.list_to_sentence(out_seq)


SOS bad e mail habits sustains spam EOS PAD


'williams increase increase buffy buffy buffy buffy buffy buffy'

In [18]:
# Counting end of sequence and start of sequence
title_seq_len = max_len_title + 2
content_seq_len = max_len_content + 2

In [19]:
for seq in training_loader:
  print(type(seq))
  break

<class 'list'>


In [40]:
#Training
def train_sequence(seq, model, optimizer):
    
    loss = nn.MSELoss()
    total_loss = 0
    X = seq[0]
    y = seq[1]
    for i in range(len(X)):
        
        batch_X = torch.reshape(X[i],[1,4453])
        batch_y = torch.reshape(y[i],[1,9])
        
        y_hat = Variable(model(batch_X,batch_y).float(), requires_grad=True)
        #print(y_hat)
        #break
        batch_y = F.one_hot(batch_y,title_vocab.n_words)
        optimizer.zero_grad()
        l = loss(y_hat, batch_y.float())
        total_loss += l.item()
        
        l.backward()
        optimizer.step()
    return total_loss


def fit(model, loader, lr, opt, num_epochs=100):
  optimizer = opt
  for epoch in range(num_epochs):
    total_loss = 0
    for sequence in loader:
      total_loss += train_sequence(sequence, model, optimizer)
      total_loss /= len(loader)
    print(f'Epoch {epoch} | perplexity {np.exp(total_loss):.2f}. Loss: {total_loss:.4f}')

    


hid_dim = 128
embed_dim = 512
n_layer = 3
batch_size = 30
num_epochs = 15
lr = 0.05
drop = 0.1
model = base_rnn( hid_dim = hid_dim ,embedding_dim = embed_dim, num_layers=n_layer,
                 input_vocab_size = content_vocab.n_words,
                 target_vocab_size= title_vocab.n_words,
                  dropout = drop).to(device)


training_loader, test_data = data_loader(batch_size)

In [41]:
optim = torch.optim.Adam(model.parameters(), lr)
fit(model, training_loader, lr, opt=optim, num_epochs=num_epochs)

Epoch 0 | Loss: 0.0012
Epoch 1 | Loss: 0.0012
Epoch 2 | Loss: 0.0012
Epoch 3 | Loss: 0.0012
Epoch 4 | Loss: 0.0012
Epoch 5 | Loss: 0.0012
Epoch 6 | Loss: 0.0012
Epoch 7 | Loss: 0.0012
Epoch 8 | Loss: 0.0012
Epoch 9 | Loss: 0.0012
Epoch 10 | Loss: 0.0012
Epoch 11 | Loss: 0.0012
Epoch 12 | Loss: 0.0012
Epoch 13 | Loss: 0.0012
Epoch 14 | Loss: 0.0012


In [42]:
#Rouge Testing
def rouge_test(model, test_data):
    rouge = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    
    #rouge = evaluate.load('rouge')
    pred = []
    real = []
    scores = []
    for seq in test_data:
        X = torch.reshape(seq[0],[1,4453])
        y = torch.reshape(seq[1],[1,9])
        real.append(y[0].cpu())
        y_pred = model(X,y).argmax(1)
        #print(y_pred.argmax(1))
        pred.append(y_pred[0].cpu())
        scores.append(rouge.score(title_vocab.list_to_sentence(y[0].tolist())
                                 ,title_vocab.list_to_sentence(y_pred[0].tolist())))
    #results = rouge.compute(predictions=predictions, references=references)
    #results = 
    return scores, pred, real
        

In [43]:
rouge_results, pred, real = rouge_test(model, test_data)
rouge_results

[{'rouge1': Score(precision=0.0016277807921866521, recall=0.6666666666666666, fmeasure=0.003247631935047361)},
 {'rouge1': Score(precision=0.0010851871947911015, recall=0.4444444444444444, fmeasure=0.0021650879566982406)},
 {'rouge1': Score(precision=0.0010851871947911015, recall=0.4444444444444444, fmeasure=0.0021650879566982406)},
 {'rouge1': Score(precision=0.0010851871947911015, recall=0.4444444444444444, fmeasure=0.0021650879566982406)},
 {'rouge1': Score(precision=0.0013564839934888768, recall=0.5555555555555556, fmeasure=0.0027063599458728013)},
 {'rouge1': Score(precision=0.0005425935973955507, recall=0.2222222222222222, fmeasure=0.0010825439783491203)},
 {'rouge1': Score(precision=0.0010851871947911015, recall=0.4444444444444444, fmeasure=0.0021650879566982406)},
 {'rouge1': Score(precision=0.0013564839934888768, recall=0.5555555555555556, fmeasure=0.0027063599458728013)},
 {'rouge1': Score(precision=0.0010851871947911015, recall=0.4444444444444444, fmeasure=0.0021650879566982

In [44]:
print("Size of test set: ", len(rouge_results))
avg_precision = 0
avg_recall = 0
avg_f1 = 0
for i in range(len(rouge_results)):
    avg_precision += rouge_results[i]['rouge1'][0]
    avg_recall += rouge_results[i]['rouge1'][1]
    avg_f1 += rouge_results[i]['rouge1'][2]

avg_precision = avg_precision /len(rouge_results)
avg_recall = avg_recall /len(rouge_results)
avg_f1 = avg_f1 /len(rouge_results)
print(f'Average Test Precision: {avg_precision:.4f}, Average Test recall: {avg_recall:.4f}, Average Test F1 score {avg_f1:.4f}')

Size of test set:  890
Average Test Precision: 0.0010, Average Test recall: 0.4233, Average Test F1 score 0.0021
