In [1]:
!pip install rouge-score
!pip install evaluate
# Comment out below if on Colab
!pip install torch --index-url https://download.pytorch.org/whl/cu118
!pip install -U scikit-learn


Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/1c/49/30ffcac5af06d08dfdd27da322ce31a373b733711bb272941877c1e4794a/scikit_learn-1.3.2-cp39-cp39-win_amd64.whl.metadata
  Downloading scikit_learn-1.3.2-cp39-cp39-win_amd64.whl.metadata (11 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Obtaining dependency information for threadpoolctl>=2.0.0 from https://files.pythonhosted.org/packages/81/12/fd4dea011af9d69e1cad05c75f3f7202cdcbeac9b712eea58ca779a72865/threadpoolctl-3.2.0-py3-none-any.whl.metadata
  Downloading threadpoolctl-3.2.0-py3-none-any.whl.metadata (10.0 kB)
Downloading scikit_learn-1.3.2-cp39-cp39-win_amd64.whl (9.3 MB)
   ---------------------------------------- 0.0/9.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.3 MB 991.0 kB/s eta 0:00:10
    --------------------------------------- 0.2/9.3 MB 2.8 MB/s eta 0:00:04


In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, Dataset, DataLoader, RandomSampler
from sklearn.model_selection import train_test_split
import re
import evaluate
#Reactivate if on Colab
#from google.colab import drive

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
gpu_available = torch.cuda.is_available()
if gpu_available:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [4]:
#Data Load
#Change or switch as required by your setup
#drive.mount('/content/drive')
#data = pd.read_csv('drive/My Drive/CS_539/bbc-news-data.csv', delimiter='\t')
#data = pd.read_csv('drive/My Drive/COMP SCI 539/bbc-news-data.csv', delimiter='\t')
data = pd.read_csv('./bbc-news-data.csv', delimiter='\t')

In [5]:
data.head(10)

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...
5,business,006.txt,Japan narrowly escapes recession,Japan's economy teetered on the brink of a te...
6,business,007.txt,Jobs growth still slow in the US,The US created fewer jobs than expected in Ja...
7,business,008.txt,India calls for fair trade rules,"India, which attends the G7 meeting of seven ..."
8,business,009.txt,Ethiopia's crop production up 24%,Ethiopia produced 14.27 million tonnes of cro...
9,business,010.txt,Court rejects $280bn tobacco case,A US government claim accusing the country's ...


In [6]:
print(data.isnull().sum())

# Data to lowercase
data["title"] = data["title"].str.lower()
data["content"] = data["content"].str.lower()

# List of contractions and acronyms to replace
contraction_dict = {"can't":"cannot","didn't":"did not","aren't":"are not","she'd":"she would","he'd":"he would","they'd":"they would","they've":"they have",
  "shouldn't":"should not","shouldn't've":"should not have","she'll":"she will","he'll":"he will","they'll":"they will", "ba's":"british airways",
  "g7":"group of seven"
}

category    0
filename    0
title       0
content     0
dtype: int64


In [7]:
# Preprocessing
def data_preprocessing(string, contraction_dict=contraction_dict):
  for word, replace in contraction_dict.items():
    string = string.replace(word, replace)
  string = re.sub(r"([.!?])", r" \1", string)
  string = re.sub(r"[^a-zA-Z!?]+", r" ", string)
  string = re.sub(r"\b(s )\b", r"", string)
  return string

max_len_content = 0
max_len_title = 0
for index in range(len(data)):
  data.loc[index,'title'] = data_preprocessing(data.loc[index,'title'])
  if len(data.loc[index,'title'].split()) > max_len_title:
    max_len_title = len(data.loc[index,'title'].split())
  data.loc[index,'content'] = data_preprocessing(data.loc[index,'content'])
  if len(data.loc[index,'content'].split()) > max_len_content:
    max_len_content = len(data.loc[index,'content'].split())


data.head(10)

Unnamed: 0,category,filename,title,content
0,business,001.txt,ad sales boost time warner profit,quarterly profits at us media giant timewarne...
1,business,002.txt,dollar gains on greenspan speech,the dollar has hit its highest level against ...
2,business,003.txt,yukos unit buyer faces loan claim,the owners of embattled russian oil giant yuk...
3,business,004.txt,high fuel prices hit british airways profits,british airways has blamed high fuel prices f...
4,business,005.txt,pernod takeover talk lifts domecq,shares in uk drinks and food firm allied dome...
5,business,006.txt,japan narrowly escapes recession,japan economy teetered on the brink of a tech...
6,business,007.txt,jobs growth still slow in the us,the us created fewer jobs than expected in ja...
7,business,008.txt,india calls for fair trade rules,india which attends the group of seven meetin...
8,business,009.txt,ethiopia crop production up,ethiopia produced million tonnes of crops in ...
9,business,010.txt,court rejects bn tobacco case,a us government claim accusing the country bi...


In [8]:
print(max_len_title, max_len_content)

9 4453


In [9]:
# Based on Transformer Tutorial
class convert:
  def __init__(self, category):
    self.category = category #title or content
    self.word_to_index = {"PAD": 0, "SOS": 1, "EOS": 2, "UNK": 3}
    self.index_to_word = {0: "PAD", 1: "SOS", 2: "EOS", 3: "UNK"}
    self.word_to_count = {}
    self.n_words = 4  # Count SOS and EOS


  def add_sentence(self, sentence):
    for word in sentence.split(' '):
      self.add_word(word)

  def add_word(self, word):
    if word not in self.word_to_index:
      self.word_to_index[word] = self.n_words
      self.word_to_count[word] = 1
      self.index_to_word[self.n_words] = word
      self.n_words += 1
    else:
      self.word_to_count[word] += 1

  def tokenize(self, sentence, seq_len=None):
    tokens_indexed = [self.word_to_index["SOS"]]

    for tkn in sentence.split():
      tokens_indexed.append(self.word_to_index[tkn if tkn in self.word_to_index else "UNK"])

    tokens_indexed.append(self.word_to_index["EOS"])

    # Pad or trim to desired lengh
    if seq_len is not None:
      if len(tokens_indexed) < seq_len:
        tokens_indexed += [self.word_to_index["PAD"]] * (seq_len - len(tokens_indexed))
      else:
         tokens_indexed = tokens_indexed[:seq_len]

    return tokens_indexed

  def list_to_sentence(self, seq_ids):
    return " ".join([self.index_to_word[idx] for idx in seq_ids])


title_vocab = convert("title")
content_vocab = convert("content")

for index in range(len(data)):
  title_vocab.add_sentence(data.loc[index,'title'])
  content_vocab.add_sentence(data.loc[index,'content'])

print(f"Title vocab contains {title_vocab.n_words} words.")
print(f"Content vocab contains {content_vocab.n_words} words.")

Title vocab contains 3686 words.
Content vocab contains 27771 words.


In [10]:
def data_loader(batch_size):
  n = 2225
  title_seqs_ids = torch.zeros((n, max_len_title)).long()
  content_seqs_ids = torch.zeros((n, max_len_content)).long()

  for index in range(len(data)):
    title_seqs_ids[index] = torch.tensor(title_vocab.tokenize(data.loc[index,'title'],seq_len=max_len_title))
    content_seqs_ids[index] = torch.tensor(content_vocab.tokenize(data.loc[index,'content'],seq_len=max_len_content))

  #Train_test_split
  X_train, X_test, y_train, y_test = train_test_split(content_seqs_ids,title_seqs_ids, train_size=0.6)


  train_dataset = TensorDataset(X_train.to(device), y_train.to(device))
  test_dataset = TensorDataset(X_test.to(device), y_test.to(device))

  training_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
  return training_loader, test_dataset


training_loader, test_data = data_loader(32)
for x,y in training_loader:
  print('Batch | Content =', x.shape, '| title =', y.shape)
  print('First sentence in contents: ', content_vocab.list_to_sentence(x[0].tolist()))
  print('First sentence in titles:', title_vocab.list_to_sentence(y[0].tolist()))
  break

Batch | Content = torch.Size([32, 4453]) | title = torch.Size([32, 9])
First sentence in contents:  SOS another front in the on going battle between microsoft and google is about to be opened by the end of microsoft aims to launch search software to find any kind of file on a pc hard drive the move is in answer to google release of its own search tool that catalogues data on desktop pcs the desktop search market is becoming increasingly crowded as google aol yahoo and many smaller firms tout programs that help people find files microsoft made the announcement about its forthcoming search software during a call to financial analysts to talk about its first quarter results john connors microsoft chief financial officer said a test version of its desktop search software should be available for download by the end of the year we re going to have a heck of a great race in search between google microsoft and yahoo he said it going to be really fun to follow microsoft is coming late to the de

In [11]:
#Borrowed from Transformer Tutorial
def positional_encoding(length, depth):
  depth = depth/2

  positions = np.arange(length)[:, np.newaxis]     # (seq, 1)
  depths = np.arange(depth)[np.newaxis, :]/depth   # (1, depth)

  angle_rates = 1 / (10000**depths)         # (1, depth)
  angle_rads = positions * angle_rates      # (pos, depth)

  pos_encoding = np.concatenate(
    [np.sin(angle_rads), np.cos(angle_rads)],
    axis=-1)

  return pos_encoding

class word_pos_embedding(nn.Module):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        nn.init.normal_(self.embedding.weight, mean=0, std=0.01)
        self.pos_encoding = torch.Tensor(positional_encoding(length=2048, depth=d_model)).float().to(device)
        self.pos_encoding.requires_grad = False

    def compute_mask(self, *args, **kwargs):
        return self.embedding.compute_mask(*args, **kwargs)

    def forward(self, x):
        length = x.shape[1]
        x = self.embedding(x)
        # This factor sets the relative scale of the embedding and positonal_encoding.
        x *= (self.d_model ** 0.5)
        x = x + self.pos_encoding[None, :length, :]
        return x


embed_content = word_pos_embedding(vocab_size=content_vocab.n_words, d_model=512).to(device)
embed_title = word_pos_embedding(vocab_size=title_vocab.n_words, d_model=512).to(device)

#Testing Pos Embedding
title_sen = data.loc[1,'title']
title_seq = torch.tensor([title_vocab.word_to_index[w] for w in title_sen.split()]).unsqueeze(0)
print(title_seq.shape)
title_tkn_seq = embed_title(title_seq.to(device))
print(title_tkn_seq.shape)

content_sen = data.loc[1,'content']
content_seq = torch.tensor([content_vocab.word_to_index[w] for w in content_sen.split()]).unsqueeze(0)
print(content_seq.shape)
content_tkn_seq = embed_content(content_seq.to(device))
print(content_tkn_seq.shape)

torch.Size([1, 5])
torch.Size([1, 5, 512])
torch.Size([1, 382])
torch.Size([1, 382, 512])


In [12]:
#Encoder
class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, dropout_rate=0.1):
        super().__init__()

        #self.embedding = word_pos_embedding(vocab_size, hidden_dim)
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(hidden_dim, hidden_dim, n_layers, batch_first=True, dropout=dropout_rate)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, input_batch):
        embed = self.dropout(self.embedding(input_batch))
        outputs, hidden = self.rnn(embed)

        return hidden

In [13]:
#Decoder
class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, dropout_rate=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        #self.embedding = word_pos_embedding(vocab_size, hidden_dim)
        self.rnn = nn.LSTM(hidden_dim, hidden_dim, n_layers, batch_first=True, dropout=dropout_rate)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, target, hidden):
        x = self.embedding(target)
        x = self.dropout(x)
        x, (hidden, cell)= self.rnn(x, hidden)
        return x

In [14]:
#Complete Model
class base_rnn(nn.Module):
    def __init__(self, hid_dim, embedding_dim, num_layers, input_vocab_size,
                 target_vocab_size, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = num_layers
        self.encoder = Encoder(input_vocab_size, embedding_dim, hid_dim,num_layers,dropout)
        #self.layer = nn.RNN(hid_dim, hid_dim, num_layers, batch_first=True, dropout=dropout)
        self.decoder = Decoder(target_vocab_size, embedding_dim, hid_dim, num_layers, dropout)
        self.final_layer = nn.Linear(hid_dim, target_vocab_size)


    def forward(self, source, target):
      hidden = self.encoder(source)
      output = self.decoder(target, hidden)
      output = self.final_layer(output)
      return(output)


model = base_rnn( hid_dim = 512 ,embedding_dim = 512, num_layers=3,
                 input_vocab_size = content_vocab.n_words,
                 target_vocab_size= title_vocab.n_words,
                  dropout = 0.1).to(device)

print('Batch of title Sentences:', title_seq.shape)
print('Batch of content paragraphs:', content_seq.shape)
print('Output of Causal Self-Attention:', model(content_seq.to(device), title_seq.to(device)).shape)

Batch of title Sentences: torch.Size([1, 5])
Batch of content paragraphs: torch.Size([1, 382])
Output of Causal Self-Attention: torch.Size([1, 5, 3686])


In [15]:
# Counting end of sequence and start of sequence
title_seq_len = max_len_title + 2
content_seq_len = max_len_content + 2

In [16]:
for seq in training_loader:
  print(type(seq))
  break

<class 'list'>


In [17]:
#Training
def train_sequence(seq, model, optimizer):
    loss = nn.CrossEntropyLoss()
    total_loss = 0
    for batch in range(len(seq[0])):
      X = seq[0][batch]
      y = seq[1][batch]

      y_hat = model(X,y)
      l = loss(y_hat, y.long())

      total_loss += l.item()

      optimizer.zero_grad()
      l.backward()
      optimizer.step()
    return total_loss


def fit(model, loader, lr, num_epochs=100):
  optimizer = torch.optim.Adagrad(model.parameters(), lr)
  for epoch in range(num_epochs):
    total_loss = 0
    for sequence in loader:
      total_loss += train_sequence(sequence, model, optimizer)
      total_loss /= len(loader)
    print(f'Epoch {epoch} | Perplexity {np.exp(total_loss):.1f}. Loss: {total_loss:.3f}')


In [18]:
hid_dim = 64
embed_dim = 64
n_layer = 3
batch_size = 20
num_epochs = 100
lr = 0.01
model = base_rnn( hid_dim = hid_dim ,embedding_dim = embed_dim, num_layers=n_layer,
                 input_vocab_size = content_vocab.n_words,
                 target_vocab_size= title_vocab.n_words,
                  dropout = 0.1).to(device)


training_loader, test_data = data_loader(batch_size)

In [19]:
fit(model, training_loader, lr, num_epochs=num_epochs)

Epoch 0 | Perplexity 2.8. Loss: 1.017
Epoch 1 | Perplexity 2.5. Loss: 0.927
Epoch 2 | Perplexity 2.3. Loss: 0.836
Epoch 3 | Perplexity 2.3. Loss: 0.839
Epoch 4 | Perplexity 2.4. Loss: 0.872
Epoch 5 | Perplexity 2.3. Loss: 0.843
Epoch 6 | Perplexity 2.2. Loss: 0.800
Epoch 7 | Perplexity 2.2. Loss: 0.802
Epoch 8 | Perplexity 2.1. Loss: 0.756
Epoch 9 | Perplexity 2.1. Loss: 0.731
Epoch 10 | Perplexity 2.3. Loss: 0.825
Epoch 11 | Perplexity 2.3. Loss: 0.833
Epoch 12 | Perplexity 2.3. Loss: 0.821
Epoch 13 | Perplexity 2.3. Loss: 0.822
Epoch 14 | Perplexity 2.2. Loss: 0.779
Epoch 15 | Perplexity 2.3. Loss: 0.816
Epoch 16 | Perplexity 2.2. Loss: 0.790
Epoch 17 | Perplexity 2.2. Loss: 0.801
Epoch 18 | Perplexity 2.2. Loss: 0.780
Epoch 19 | Perplexity 2.2. Loss: 0.789
Epoch 20 | Perplexity 2.1. Loss: 0.742
Epoch 21 | Perplexity 2.1. Loss: 0.737
Epoch 22 | Perplexity 2.2. Loss: 0.787
Epoch 23 | Perplexity 2.1. Loss: 0.764
Epoch 24 | Perplexity 2.1. Loss: 0.759
Epoch 25 | Perplexity 2.1. Loss: 0.

In [22]:
#Rouge Testing
rouge = evaluate.load('rouge')
#y_pred = 
#results = rouge.compute(predictions=, references=references)