Contributors:
Peter Harmer

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import re
from google.colab import drive

In [3]:
#Initializing
gpu_available = torch.cuda.is_available()
if gpu_available:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [4]:
#Data Load
 #Change or switch as you need to load your own verson of the file
drive.mount('/content/drive')
# data = pd.read_csv('drive/My Drive/CS_539/bbc-news-data.csv', delimiter='\t')
data = pd.read_csv('drive/My Drive/COMP SCI 539/bbc-news-data.csv', delimiter='\t')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
data.head(5)

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...


In [6]:
#Data Preprocessing

# Find and remove nulls
print(data.isnull().sum())

# Data to lowercase
data["title"] = data["title"].str.lower()
data["content"] = data["content"].str.lower()
# Remove and replace contractions
    #Find more contraction in text and add
contraction_dict = {"can't":"cannot","didn't":"did not","aren't":"are not","she'd":"she would","he'd":"he would","they'd":"they would","they've":"they have",
"shouldn't":"should not","shouldn't've":"should not have","she'll":"she will","he'll":"he will","they'll":"they will"
}

category    0
filename    0
title       0
content     0
dtype: int64


In [7]:
data.head(5)

Unnamed: 0,category,filename,title,content
0,business,001.txt,ad sales boost time warner profit,quarterly profits at us media giant timewarne...
1,business,002.txt,dollar gains on greenspan speech,the dollar has hit its highest level against ...
2,business,003.txt,yukos unit buyer faces loan claim,the owners of embattled russian oil giant yuk...
3,business,004.txt,high fuel prices hit ba's profits,british airways has blamed high fuel prices f...
4,business,005.txt,pernod takeover talk lifts domecq,shares in uk drinks and food firm allied dome...


In [8]:
#Corpus Creation

def corpus_processing(pandas_dataset, column):
  # Processes the input pandas string columns into a corpus
    # NEEDS: A line to replace contractions
  corpus = [line.strip() for line in pandas_dataset[column] if line.strip()]
  # New Line Here
  corpus = [re.sub('[^A-Za-z0-9]+', ' ', line).lower() for line in corpus]
  corpus = [re.sub(' +', ' ', line) for line in corpus]
  corpus = [word for line in corpus for word in line.split()]
  #corpus =
  return corpus


# Testing on Title column
corpus = corpus_processing(data[data.index == 5],'content')
print(corpus)


vocab_size = len(np.unique(corpus)) + 2
tkn_counter = Counter([word for word in corpus])
vocab = {word: idx for idx, (word, _) in enumerate(tkn_counter.most_common(vocab_size))}
vocab["/UNK"] = len(vocab)
vocab["/PAD"] = len(vocab)
print(len(vocab))
print(vocab_size)

['japan', 's', 'economy', 'teetered', 'on', 'the', 'brink', 'of', 'a', 'technical', 'recession', 'in', 'the', 'three', 'months', 'to', 'september', 'figures', 'show', 'revised', 'figures', 'indicated', 'growth', 'of', 'just', '0', '1', 'and', 'a', 'similar', 'sized', 'contraction', 'in', 'the', 'previous', 'quarter', 'on', 'an', 'annual', 'basis', 'the', 'data', 'suggests', 'annual', 'growth', 'of', 'just', '0', '2', 'suggesting', 'a', 'much', 'more', 'hesitant', 'recovery', 'than', 'had', 'previously', 'been', 'thought', 'a', 'common', 'technical', 'definition', 'of', 'a', 'recession', 'is', 'two', 'successive', 'quarters', 'of', 'negative', 'growth', 'the', 'government', 'was', 'keen', 'to', 'play', 'down', 'the', 'worrying', 'implications', 'of', 'the', 'data', 'i', 'maintain', 'the', 'view', 'that', 'japan', 's', 'economy', 'remains', 'in', 'a', 'minor', 'adjustment', 'phase', 'in', 'an', 'upward', 'climb', 'and', 'we', 'will', 'monitor', 'developments', 'carefully', 'said', 'econo

In [9]:
class CorpusConversion(Dataset):
  def __init__(self, summary_corpus, title_corpus, vocab, max_summary_len=100):
    super().__init__()

    self.max_summary_len = max_summary_len
    self.summary_corpus = summary_corpus
    self.title_corpus = title_corpus
    self.vocab = vocab
    self.inv_vocab = {idx: word for word, idx in self.vocab.items()}

  def convert2idx(self, word_sequence):
    return [self.vocab[word if word in self.vocab else "/UNK"] for word in word_sequence]

  def convert2word(self, idx_sequence):
    return [self.inv_vocab[idx] for idx in idx_sequence]

  def __getitem__(self, idx):
    summary = self.summary_corpus[idx]
    title = self.title_corpus[idx]

    # Convert summary to indices, truncate/pad to max_summary_len
    summary_idx = self.convert2idx(summary.split()[:self.max_summary_len])
    summary_idx += [self.vocab["/PAD"]] * (self.max_summary_len - len(summary_idx))  # Padding
    summary_idx = torch.tensor(summary_idx)

    # Convert title to indices
    title_idx = torch.tensor(self.convert2idx(title.split()))

    return summary_idx, title_idx

  def __len__(self):
    return min(len(self.summary_corpus), len(self.title_corpus))

In [10]:
class Encoder(nn.Module):
  def __init__(self, input_size, output_size, hid_dim, emb_dim, n_layers, dropout):
    super().__init__()

    self.hid_dim = hid_dim
    self.n_layers = n_layers

    self.embedding = nn.Embedding(input_size,emb_dim)

    self.rnn = nn.RNN(input_size, hid_dim, n_layers, batch_first=True, dropout=dropout)

    self.dropout = nn.Dropout(dropout)

  def forward(self, input):
    print(input.shape)
    embedded = self.embedding(input)
    print(input.shape)
    print(input.dtype)
    embedded = self.dropout(input)
    print(embedded.shape)
    outputs, (hid_state, cell_state) = self.rnn(embedded)
    return hid_state , cell_state


class Decoder(nn.Module):
  def __init__(self, input_size, output_size, hid_dim, emb_dim, n_layers, dropout):
    super().__init__()

    self.out_size = output_size
    self.emb_dim = emb_dim
    self.hid_dim = hid_dim
    self.n_layers = n_layers

    self.embedding = self.embedding = nn.Embedding(output_size, emb_dim)

    self.rnn = nn.RNN(input_size, hid_dim, n_layers, batch_first=True, dropout=dropout)
    self.out = nn.Linear(hid_dim, output_size)

    self.dropout = nn.Dropout(dropout)

  def forward(self, input, hid_state, cell_state):

    input = input.unsqueeze(0)
    embedded = self.dropout(self.embedding(input))

    output, (hid_state, cell_state) = self.rnn(embedded,(hid_state, cell_state))

    result = self.out(output.squeeze(0))

    return result, hid_state, cell_state



In [11]:
#Recurrent Network Frame


""" Removed For Error Identification
#Encoder(input_size, hid_dim, hid_dim, emb_dim, n_layers, dropout),
      #nn.LSTM(hid_dim, hid_dim, n_layers, batch_first=True, dropout=dropout),
      self.layers = nn.Sequential(
      ,
      ,
      ,
      #nn.LSTM(hid_dim, hid_dim, n_layers, batch_first=True, dropout=dropout),
      #Decoder(hid_dim, output_size, hid_dim, emb_dim, n_layers, dropout),
    )

"""
class base_rnn(nn.Module):
    def __init__(self, input_size, output_size, hid_dim, n_layers, emb_dim, dropout, device):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        #Initial setup
        self.device = device
        self.RNN_one = nn.LSTM(input_size, hid_dim, n_layers, batch_first=True, dropout=dropout).to(self.device)
        self.RNN_two = nn.LSTM(hid_dim, hid_dim, n_layers, batch_first=True, dropout=dropout).to(self.device)
        self.RNN_three = nn.LSTM(hid_dim, output_size, n_layers, batch_first=True, dropout=dropout).to(self.device)

    def forward(self, x):
      #batch_size = len(x)
      x = x.float()
      x = x.to(self.device)
      out, states  = self.RNN_one(x)
      out, states = self.RNN_two(out)
      out, states = self.RNN_three(out)
      return out

In [12]:
#Hyperparameters
dropout = 0.1
hidden_dim = 512
lr= 0.01
n_epochs = 20
summation_length = 10

# Test Values

#Input size
input_size = 10
#Output size
output_size = 10

In [13]:
#Testing for creation errors
model = base_rnn(input_size, output_size, hidden_dim, summation_length, vocab_size, dropout, device)
model

base_rnn(
  (RNN_one): LSTM(10, 512, num_layers=10, batch_first=True, dropout=0.1)
  (RNN_two): LSTM(512, 512, num_layers=10, batch_first=True, dropout=0.1)
  (RNN_three): LSTM(512, 10, num_layers=10, batch_first=True, dropout=0.1)
)

In [14]:
#output testing

#Hyperparameters
dropout = 0.1
hidden_dim = 512
lr= 0.01
n_epochs = 200
summation_length = 10

# test row
test_row = 6

#Input Data
input_corpus = corpus_processing(data, "content")

input_vocab_size = len(input_corpus) + 2
tkn_counter = Counter([word for word in input_corpus])
vocab = {word: idx for idx, (word, _) in enumerate(tkn_counter.most_common(input_vocab_size))}
vocab["/UNK"] = len(vocab)
vocab["/PAD"] = len(vocab)


# Output Data
output_corpus = corpus_processing(data, "title")

train_ratio = 0.8
split_idx = int(len(input_corpus) * train_ratio)

train_corpus_sum = input_corpus[:split_idx]
val_corpus_sum = input_corpus[split_idx:]

train_corpus_tar = output_corpus[:split_idx]
val_corpus_tar = output_corpus[split_idx:]


train_dataset = CorpusConversion(train_corpus_sum, train_corpus_tar , vocab, summation_length)
val_dataset = CorpusConversion(val_corpus_sum, train_corpus_tar, vocab, summation_length)

#Target Data
#target_corpus = corpus_processing(data[data.index == test_row], "title")


#vocab_size = len(np.unique(corpus)) + 1
#tkn_counter = Counter([word for word in corpus])
#vocab = {word: idx for idx, (word, _) in enumerate(tkn_counter.most_common(vocab_size))}
#vocab["/UNK"] = len(vocab)

In [15]:
input = torch.tensor(train_dataset.convert2idx(data['content'][5]))
input = F.one_hot(input,input_vocab_size).float()
print(input.shape)
#input = input.reshape(187,1, (len(input)))
model = base_rnn(len(input), 10, hidden_dim, summation_length, input_vocab_size, dropout)
result = model(input)

test_sentence = result.argmax(dim=1).tolist()
print(test_sentence[:10])
train_dataset.convert2word(test_sentence[:10])

KeyboardInterrupt: ignored

In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

def fit(model, dataloader, optimizer, criterion, epochs, device):
    model.train()
    loss_history = []

    for epoch in range(epochs):
        total_loss = 0
        for batch in dataloader:
            inputs, targets = batch
            inputs, targets = inputs.to(device), targets.to(device)


            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)

            # Reshape or align outputs and targets if necessary
            # Example: outputs might need to be reshaped or targets might need to be converted
            # This depends on the specific output format of your model and the criterion requirements
            # loss = criterion(outputs.view(-1, outputs.size(-1)), targets.view(-1))

            print(targets.shape, outputs.shape)


            loss = criterion(outputs.view(-1, outputs.size(-1)), targets.view(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        loss_history.append(avg_loss)
        print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}')

    return loss_history

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in dataloader:
            inputs, targets = batch
            inputs, targets = inputs.to(device), targets.to(device)

            outputs = model(inputs)

            # Same as in fit function, adjust loss calculation based on output and target format
            # loss = criterion(outputs.view(-1, outputs.size(-1)), targets.view(-1))


            loss = criterion(outputs, targets)
            total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    return avg_loss


# Example usage
# Assuming you have a dataloader for training and validation
batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

model = base_rnn(input_size, output_size, hidden_dim, summation_length, vocab_size, dropout, device) # Your LSTM model
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

epochs = 10
train_loss_history = fit(model, train_dataloader, optimizer, criterion, epochs, device)
val_loss = evaluate(model, val_dataloader, criterion)

# Plotting
plt.plot(train_loss_history, label='Training Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training Loss Over Time')
plt.legend()
plt.show()


torch.Size([32, 1]) torch.Size([32, 10])


IndexError: ignored