# Classificatie van nieuwsartikelen

In deze notebook gaan we verder werken op de AG-news nieuwsartikelen dataset.
In de vorige notebook hebben we bekeken hoe we tekstuele data kunnen preprocessen.
In deze notebook gaan we classificatie uitvoeren door gebruik te maken van recurrente neurale netwerken.

In [4]:
# Import necessary libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from keras_preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import opendatasets as od


od.download("https://www.kaggle.com/datasets/amananandrai/ag-news-classification-dataset")

# Check for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load the dataset
def read_csv(filename):
    df = pd.read_csv(filename)
    df.columns = ["label", "title", "description"]
    df["text"] = df['title'] + ' ' + df['description']
    df['label'] = df['label'] - 1
    return df

df_train = read_csv('./ag-news-classification-dataset/train.csv')
display(df_train.head())

df_test = read_csv('./ag-news-classification-dataset/test.csv')

# Parameters
MAX_NUM_WORDS = 20000  # Maximum number of unique words to keep
MAX_SEQUENCE_LENGTH = 50  # Maximum length of input sequences
EMBEDDING_DIM = 32  # Dimensionality of the embedding layer

# Tokenizer
def preprocess(df, tokenizer=None):
    if tokenizer is None:
        tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
        tokenizer.fit_on_texts(df['text'])
        
    sequences = tokenizer.texts_to_sequences(df['text'])
    X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    # Labels (one-hot encoding)
    y = to_categorical(df['label'], num_classes=4)

    return X, y, tokenizer

X_train, y_train, tokenizer = preprocess(df_train)
X_test, y_test, _ = preprocess(df_test, tokenizer)

Skipping, found downloaded files in "./ag-news-classification-dataset" (use force=True to force download)
Using device: cuda


Unnamed: 0,label,title,description,text
0,2,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...",Wall St. Bears Claw Back Into the Black (Reute...
1,2,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,Carlyle Looks Toward Commercial Aerospace (Reu...
2,2,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,Oil and Economy Cloud Stocks' Outlook (Reuters...
3,2,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,Iraq Halts Oil Exports from Main Southern Pipe...
4,2,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...","Oil prices soar to all-time record, posing new..."


In [6]:
print(type(X_train)) # X_train is een numpy array, kan perfect gebruikt worden in de analoge functies van pytorch

# Dataset + dataloader
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return torch.tensor(self.texts[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.float)

train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False) # shuffle false -> niet trainen dus maar 1 epoch dus is shuffle niet belangrijk

<class 'numpy.ndarray'>


## Opbouwen, trainen en evalueren van een RNN

In [19]:
# RNN model
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim) # bereken de embedding
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True) # verwerk de sequentie (of nn.LSTM of nn.GRU)
        self.fc = nn.Linear(hidden_dim, output_dim) # bereken output uit de rnn-laag

    def forward(self, x):
        x = self.embedding(x)
        x, hidden = self.rnn(x) # voer de recurrente laag uit
        # geen activatiefunctie hier want we werken enkel met de hidden -> die heeft reeds een tanh uitgevoerd
        hidden = hidden.squeeze(0) # laat de eerste dimensie weg
        x = self.fc(hidden) # hidden state van de laatste tijdstap gebruiken voor classficatie
        return x, hidden

hidden_dim = 128
output_dim = 4

model = RNNModel(MAX_NUM_WORDS, EMBEDDING_DIM, 128, 4)
print(model)

RNNModel(
  (embedding): Embedding(20000, 32)
  (rnn): RNN(32, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=4, bias=True)
)


In [20]:
for features,label in train_loader:
    print(features.shape) # (batch_size, sequence_length)
    x, hidden = model(features)
    print(x.shape, hidden.shape)
    break

torch.Size([64, 50])
torch.Size([64, 4]) torch.Size([64, 128])


In [23]:
# Train het Model
criterion = nn.CrossEntropyLoss() # dit moet je nog kunnen beantwoorden voor de tweede type A evaluatie
# hier cross entropy want multi-class classification probleem
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 5
model.train()

for epoch in range(num_epochs):
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs, _ = model(inputs)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}: loss {running_loss/len(train_loader)}")
print("Done")

Epoch 1/5: loss 0.9162078396002452
Epoch 2/5: loss 0.6060804185390473
Epoch 3/5: loss 0.4600883013486862
Epoch 4/5: loss 0.3739034271876017
Epoch 5/5: loss 0.38062629988193514
Done


In [26]:
# Evalueer het Model

model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        
        _, labels = torch.max(labels, 1) # labels heeft shape (64, 4) -> zoek per input in de batch naar de grootste klasse
        outputs, _ = model(inputs)
        _, predicted = torch.max(outputs, 1) # torch.max geeft   max, argmax

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Accuracy: {100* correct/total}")

Accuracy: 86.72368421052632


## Oefeningen

* Voeg een extra Linear-laag toe na de RNN-laag. Experimenteer met het aantal neuronen in deze laag en analyseer hoe de prestaties veranderen.
* Pas het model aan om in plaats van een SimpleRNN-laag een LSTM of GRU-laag te gebruiken. Vergelijk de prestaties van de drie typen recurrente netwerken.

In [None]:
# Oefening 1
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim) # bereken de embedding
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True) # verwerk de sequentie (of nn.LSTM of nn.GRU)
        self.fc = nn.Linear(hidden_dim, 16) # bereken output uit de rnn-laag
        self.fc2 = nn.Linear(16, output_dim) # bereken output uit de rnn-laag

    def forward(self, x):
        x = self.embedding(x)
        x, hidden = self.rnn(x) # voer de recurrente laag uit
        # geen activatiefunctie hier want we werken enkel met de hidden -> die heeft reeds een tanh uitgevoerd
        hidden = hidden.squeeze(0) # laat de eerste dimensie weg
        x = nn.functional.relu(self.fc(hidden)) # hidden state van de laatste tijdstap gebruiken voor classficatie
        x = self.fc2(x)
        return x, hidden

# fc2 toegevoegd
# we hebben ervoor gekozen om fc 16 neuronen breed te maken
# fc2 bevat 4 neuronen
# activatiefunctie toegevoegd in de forward

In [30]:
# Oefening 2

class RNNModel2(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(RNNModel2, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim) # bereken de embedding
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True) # verwerk de sequentie (of nn.LSTM of nn.GRU)
        self.fc = nn.Linear(hidden_dim, 16) # bereken output uit de rnn-laag
        self.fc2 = nn.Linear(16, output_dim) # bereken output uit de rnn-laag

    def forward(self, x):
        x = self.embedding(x)
        x, (hidden, _) = self.rnn(x) # voer de recurrente laag uit
        # geen activatiefunctie hier want we werken enkel met de hidden -> die heeft reeds een tanh uitgevoerd
        hidden = hidden.squeeze(0) # laat de eerste dimensie weg
        x = nn.functional.relu(self.fc(hidden)) # hidden state van de laatste tijdstap gebruiken voor classficatie
        x = self.fc2(x)
        return x, hidden


model = RNNModel2(MAX_NUM_WORDS, EMBEDDING_DIM, 128, 4)

criterion = nn.CrossEntropyLoss() # dit moet je nog kunnen beantwoorden voor de tweede type A evaluatie
# hier cross entropy want multi-class classification probleem
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 5
model.train()

for epoch in range(num_epochs):
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs, _ = model(inputs)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}: loss {running_loss/len(train_loader)}")
print("Done")

Epoch 1/5: loss 0.6616245873530706
Epoch 2/5: loss 0.34016635310649873
Epoch 3/5: loss 0.2685256950259209
Epoch 4/5: loss 0.22716335246960323
Epoch 5/5: loss 0.19350809991161028
Done


**Oefening 3**

Volg de tutorial op de volgende link: https://www.tensorflow.org/text/tutorials/text_generation
Werk hieronder het gelijkaardige probleem uit maar maak het door gebruik te maken van pytorch in plaats van tensorflow voor het model op te bouwen.
In deze tutorial wordt er tekst gegenereerd die lijkt op tekst geschreven door shakespeare.
Let op dat dit een vereenvoudigde versie is waarbij karakter per karakter wordt gegenereerd en niet woord per woord. Er is dus geen garantie dat er echte woorden gemaakt worden.

In [33]:
import keras
import tensorflow as tf
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, Subset
import random

path_to_file = keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')
print(f'Length of text: {len(text)} characters')
print(text[:250])
# The unique characters in the file
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')

# Character to index mapping
char_to_idx = {char: idx for idx, char in enumerate(vocab)}
idx_to_char = {idx: char for idx, char in enumerate(vocab)}

# TODO: Encodeer elk karakter in tekst naar een nummer, uitkomst is een list ipv een string
encoded_text = [char_to_idx[char] for char in text] 
print(encoded_text[:250])

# TODO: Maak een dataset aan waarbij de tekst (uit voorgaande todo) omzet naar een reeks sequenties
# input 100 aaneensluitende karakters, output is het karakter erop volgende
class TextDataset(Dataset):
    def __init__(self, text, seq_length):
        self.text = text
        self.seq_length = seq_length

    def __len__(self):
        return len(self.text) - self.seq_length # zodat we niet in de laatste karakters kijken -> we kunnen geen volledige sequentie maken

    def __getitem__(self, idx):
        x = self.text[idx:idx+self.seq_length]
        y = self.text[idx+self.seq_length]
        
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.float)

seq_length = 100
dataset = TextDataset(encoded_text, seq_length)

# TODO: indien nodig maak een subset tot 10 of 1% van de dataset
subset_size=int(0.01 * len(dataset))
random_indices = random.sample(range(len(dataset)), subset_size) # selecteer 1% examples/inputs
dataset = Subset(dataset, random_indices)

# Check a single example
sample_x, sample_y = dataset[0]
print("Input (x):", sample_x)
print("Target (y):", sample_y)
print("Decoded Input:", ''.join(idx_to_char[idx] for idx in sample_x.numpy()))
print("Decoded Target:", idx_to_char[sample_y.item()])
print('Rows', len(dataset))

Length of text: 1115394 characters
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

65 unique characters
[18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14, 43, 44, 53, 56, 43, 1, 61, 43, 1, 54, 56, 53, 41, 43, 43, 42, 1, 39, 52, 63, 1, 44, 59, 56, 58, 46, 43, 56, 6, 1, 46, 43, 39, 56, 1, 51, 43, 1, 57, 54, 43, 39, 49, 8, 0, 0, 13, 50, 50, 10, 0, 31, 54, 43, 39, 49, 6, 1, 57, 54, 43, 39, 49, 8, 0, 0, 18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 37, 53, 59, 1, 39, 56, 43, 1, 39, 50, 50, 1, 56, 43, 57, 53, 50, 60, 43, 42, 1, 56, 39, 58, 46, 43, 56, 1, 58, 53, 1, 42, 47, 43, 1, 58, 46, 39, 52, 1, 58, 53, 1, 44, 39, 51, 47, 57, 46, 12, 0, 0, 13, 50, 50, 10, 0, 30, 43, 57, 53, 50, 60, 43, 42, 8, 1, 56, 43, 57, 53, 50, 60, 43, 42, 8, 0, 0, 18, 47, 56, 57, 58, 1, 15, 47,

In [39]:
# TODO: Maak een rnn model bestaande uit een embedding layer, gru layer en linear layer
# Maak het mogelijk om aan de forward funtie een parameter toe te voegen om ook de hidden state terug te geven en om de hidden state mee te geven voor de gru laag
# 
vocab_size = len(idx_to_char)
print(vocab_size)
embedding_dim = 50
rnn_units = 60

class ShakespeareModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(ShakespeareModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim) # bereken de embedding
        self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True) # verwerk de sequentie (of nn.LSTM of nn.GRU)
        self.fc = nn.Linear(hidden_dim, vocab_size) # bereken output uit de rnn-laag

    def forward(self, x, hidden=None):
        x = self.embedding(x)
        if hidden is None:
            x, hidden = self.rnn(x) # voer de recurrente laag uit
        else:
            x, hidden = self.rnn(x, hidden)
        x = self.fc(hidden.squeeze(0)) # hidden state van de laatste tijdstap gebruiken voor classficatie
        return x, hidden

shakespeare = ShakespeareModel(vocab_size, embedding_dim, rnn_units)

# RNN vervangen door GRU
# output_dim = vocab_size
# in de forward functie de hidden meegeven met de GRU-laag
# de squeeze niet uitvoeren op de hidden die in de return zit

65


In [40]:
# test 1 sample om door het model te sturen
# kijk of je dimensies correct aan elkaar gekoppeld zijn
for input_example_batch, target_example_batch in dataset:
    print(input_example_batch.shape)
    example_batch_predictions, _ = shakespeare(input_example_batch.unsqueeze(0))
    print(example_batch_predictions.shape, "# (sequence_length, vocab_size), 100 char as input")
    break

torch.Size([100])
torch.Size([1, 65]) # (sequence_length, vocab_size), 100 char as input


In [43]:
from torch.utils.data import DataLoader
import torch.optim as optim
import os
import math

batch_size = 64
seq_length = 100
epochs = 5
vocab_size = len(vocab)
embedding_dim = 50
rnn_units = 60

shakespeare = ShakespeareModel(
    vocab_size,
    embedding_dim,
    rnn_units)

# TODO: train het rnn model
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(shakespeare.parameters(), lr=0.001)

# Set the path to save/load the model
model_path = "rnns.pth"

# Check if the model file exists
if os.path.exists(model_path):
    print("Loading existing model...")
    shakespeare.load_state_dict(torch.load(model_path))
    shakespeare.eval()  # Set model to evaluation mode if only inference is required
else:
    # Training loop
    for epoch in range(epochs):
        shakespeare.train()
        total_loss = 0
        
        for batch, (inputs, targets) in enumerate(dataloader):
            # Zero the gradient
            optimizer.zero_grad()

            # Forward pass
            outputs = shakespeare(inputs)  # Shape: (batch_size, seq_length, vocab_size)
            
            # Reshape outputs and targets for loss computation
            outputs = outputs.view(-1, vocab_size)  # Shape: (batch_size * seq_length, vocab_size)
            targets = targets.view(-1)  # Shape: (batch_size * seq_length)
            
            # Compute the loss
            loss = criterion(outputs, targets)
            
            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()

            if batch % int((len(dataloader)/10)) == 0:
                print(f"Epoch {epoch + 1}/{epochs}: {math.floor(batch/len(dataloader)*100)}")
        
        # Print epoch loss
        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}%")
    
    # Save the model after training
    torch.save(shakespeare.state_dict(), model_path)
    print(f"Model saved to {model_path}")

Loading existing model...


RuntimeError: Error(s) in loading state_dict for ShakespeareModel:
	Missing key(s) in state_dict: "rnn.weight_ih_l0", "rnn.weight_hh_l0", "rnn.bias_ih_l0", "rnn.bias_hh_l0", "fc.weight", "fc.bias". 
	Unexpected key(s) in state_dict: "gru.weight_ih_l0", "gru.weight_hh_l0", "gru.bias_ih_l0", "gru.bias_hh_l0", "output.weight", "output.bias". 

In [None]:
import torch.nn.functional as F

def generate_text(model, start_string, char_to_idx, idx_to_char, vocab_size, generation_length=100, temperature=1.0):
    model.eval()  # Set model to evaluation mode
    
    # Convert start_string to indices
    input_indices = torch.tensor([char_to_idx[char] for char in start_string], dtype=torch.long).unsqueeze(0)
    
    generated_text = start_string
    states = None  # Initial state (None means it will be initialized automatically)
    
    for _ in range(generation_length):
        # Genereer opeenvolgend nieuwe tokens
        with torch.no_grad():
            outputs, states = model(input_indices, states)

        logits = outputs
        logits = logits/temperature

        probabilities = F.softmax(logits, dim=1) # bepaal de kans voor elk karakter
        next_index = torch.multinomial(proabilities, num_samples=1).item() # neem 1 karaketer op basis van bovenstaande kansen

        generated_text += idx_to_char[next_index]
        input_indices = torch.tensor([[next_index]], dtype=torch.long)
    
    return generated_text


In [None]:
# Example start string and generation parameters
start_string = "ROMEO: "
generation_length = 200
temperature = 0.8

# Generate text
generated_text = generate_text(
    model=shakespeare,
    start_string=start_string,
    char_to_idx=char_to_idx,
    idx_to_char=idx_to_char,
    vocab_size=vocab_size,
    generation_length=generation_length,
    temperature=temperature
)

print("Generated Text:")
print(generated_text)
