# Classificatie van nieuwsartikelen

In deze notebook gaan we verder werken op de AG-news nieuwsartikelen dataset.
In de vorige notebook hebben we bekeken hoe we tekstuele data kunnen preprocessen.
In deze notebook gaan we classificatie uitvoeren door gebruik te maken van recurrente neurale netwerken.

In [None]:
# Import necessary libraries
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from collections import Counter
import re
import kagglehub

# Parameters
MAX_NUM_WORDS = 20000  # Maximum number of unique words to keep
MAX_SEQUENCE_LENGTH = 100  # Maximum length of input sequences
EMBEDDING_DIM = 50  # Dimensionality of the embedding layer

# Check for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device='cpu'
print(f"Using device: {device}")

# get the dataset
path = kagglehub.dataset_download("amananandrai/ag-news-classification-dataset")

# Load the dataset
def read_csv(filename):
    df = pd.read_csv(filename)
    df.columns = ["label", "title", "description"]
    df["text"] = df['title'] + ' ' + df['description']
    df['label'] = df['label'] - 1
    return df

df_train = read_csv(f'{path}/train.csv')
df_test = read_csv(f'{path}/test.csv')
display(df_train.head())

# tokenizer
def simple_tokenizer(text):
    return re.findall(r"\b\w+\b", text.lower())
    
# Build vocab from training set
counter = Counter()
for text in df_train["text"]:
    counter.update(simple_tokenizer(text)) # splits de tekst kolom in woorden, tel hoeveel keer elk woord voorkomt

vocab = {word: idx+2 for idx, (word, _) in enumerate(counter.most_common(MAX_NUM_WORDS))}
vocab["<PAD>"] = 0 # special token for padding
vocab["<UNK>"] = 1 # special token for unknown words

print("Size of vocab:", len(vocab))

def encode(text, vocab, max_len=MAX_SEQUENCE_LENGTH):
    tokens = simple_tokenizer(text) # splits text in woorden
    ids = [vocab.get(tok, vocab["<UNK>"]) for tok in tokens] # zet elk woord om naar een token (unknown als het niet bestaat)
    ids = ids[:max_len] # truncate de text
    ids += [vocab["<PAD>"]] * (max_len - len(ids)) # voeg padding toe als de tekst te kort is
    return ids

num_samples = 30000
X_train = [encode(text, vocab) for text in df_train["description"][:num_samples]]  # neem subset voor demo
y_train = df_train["label"][:num_samples]
X_train = torch.tensor(X_train)
print("X_train shape:", X_train.shape)
print('Highest token idx train', torch.max(X_train))

X_test = [encode(text, vocab) for text in df_test["description"][:num_samples]]  # neem subset voor demo
X_test = torch.tensor(X_test)
y_test = df_test["label"][:num_samples]
print('Highest token idx test', torch.max(X_test))

class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return torch.tensor(self.texts[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)

train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

Using device: cuda


Unnamed: 0,label,title,description,text
0,2,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...",Wall St. Bears Claw Back Into the Black (Reute...
1,2,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,Carlyle Looks Toward Commercial Aerospace (Reu...
2,2,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,Oil and Economy Cloud Stocks' Outlook (Reuters...
3,2,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,Iraq Halts Oil Exports from Main Southern Pipe...
4,2,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...","Oil prices soar to all-time record, posing new..."


Size of vocab: 20002
X_train shape: torch.Size([30000, 100])
Highest token idx train tensor(20001)
Highest token idx test tensor(19998)


## Opbouwen, trainen en evalueren van een RNN

In [11]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        #print(x.shape, torch.max(x))
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded) # 1 laag met rnn-cellen (woord per woord wordt hierdoor gestuurd)
        last_output = hidden[-1]
        out = self.fc(last_output)
        return out

# Parameters
hidden_dim = 128
output_dim = 4  # AG-News heeft 4 klassen

model = RNNModel(MAX_NUM_WORDS+2, EMBEDDING_DIM, hidden_dim, output_dim)
print(model)

texts, labels = next(iter(train_loader))
print(texts.shape, labels.shape)

RNNModel(
  (embedding): Embedding(20002, 50)
  (rnn): RNN(50, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=4, bias=True)
)
torch.Size([64, 100]) torch.Size([64])


  return torch.tensor(self.texts[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)


In [13]:

# Definieer de Loss en Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)

# Train het Model
num_epochs = 2
model.train()
print("Training started.")

for epoch in range(num_epochs):
    running_loss = 0.0
    for inputs, labels in train_loader:
        #inputs, labels = inputs.to(device), labels.to(device)
        
        # Forward pass
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

print("Training complete.")

Training started.


  return torch.tensor(self.texts[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)


Epoch [1/2], Loss: 1.3925
Epoch [2/2], Loss: 1.3940
Training complete.


In [14]:
# Evalueer het Model
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        
        outputs = model(inputs)
        predicted = torch.argmax(outputs, 1)
        
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Test Accuracy: {100 * correct / total:.2f}%")

# Opslaan van het Model
torch.save(model.state_dict(), 'rnn_ag_news_model.pth')

  return torch.tensor(self.texts[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)


Test Accuracy: 24.46%


## Oefeningen

* Voeg een extra Dense-laag toe na de RNN-laag. Experimenteer met het aantal neuronen in deze laag en analyseer hoe de prestaties veranderen.
* Pas het model aan om in plaats van een basis RNN-laag een LSTM of GRU-laag te gebruiken. Vergelijk de prestaties van de drie typen recurrente netwerken.

In [None]:
# Oefening 1

In [None]:
# Oefening 2

**Oefening 3**

Volg de tutorial op de volgende link: https://www.tensorflow.org/text/tutorials/text_generation
Werk hieronder het gelijkaardige probleem uit maar maak het door gebruik te maken van pytorch in plaats van tensorflow voor het model op te bouwen.
In deze tutorial wordt er tekst gegenereerd die lijkt op tekst geschreven door shakespeare.
Let op dat dit een vereenvoudigde versie is waarbij karakter per karakter wordt gegenereerd en niet woord per woord. Er is dus geen garantie dat er echte woorden gemaakt worden.

In [None]:
import keras_core as keras
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, Subset
import random

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

path_to_file = keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')
print(f'Length of text: {len(text)} characters')
print(text[:250])
# The unique characters in the file
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')

# Character to index mapping
char_to_idx = {char: idx for idx, char in enumerate(vocab)}
idx_to_char = {idx: char for idx, char in enumerate(vocab)}

# Encode the text as integers
encoded_text = [char_to_idx[char] for char in text]

class TextDataset(Dataset):
    def __init__(self, encoded_text, seq_length):
        self.encoded_text = encoded_text
        self.seq_length = seq_length
    
    def __len__(self):
        # Total sequences in the dataset
        return len(self.encoded_text) - self.seq_length-1

    def __getitem__(self, idx):
        # Input sequence
        x = self.encoded_text[idx:idx + self.seq_length]
        # Target (next character)
        y = self.encoded_text[idx + 1: idx + self.seq_length+1]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

# Example usage
seq_length = 100
dataset = TextDataset(encoded_text, seq_length)


subset_size = int(0.1 * len(dataset))
random_indices = random.sample(range(len(dataset)), subset_size)
dataset = Subset(dataset, random_indices)

# Check a single example
sample_x, sample_y = dataset[0]
print("Input (x):", sample_x)
print("Target (y):", sample_y)
print("Decoded Input:", ''.join(idx_to_char[idx] for idx in sample_x.numpy()))
print("Decoded Target:", ''.join([idx_to_char[x] for x in sample_x.numpy()]))
print('Rows', len(dataset))

In [None]:
vocab_size = len(idx_to_char)
embedding_dim = 50
rnn_units = 150

class ShakespeareModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, rnn_units):
        super(ShakespeareModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, rnn_units, batch_first=True)
        self.output = nn.Linear(rnn_units, vocab_size)

    def forward(self, inputs, states=None, return_state=False):
        # Embedding layer
        x = self.embedding(inputs)
        
        # GRU layer
        if states is None:
            x, states = self.gru(x)
        else:
            x, states = self.gru(x, states)
        
        # Dense layer
        x = self.output(x)
        
        return (x, states[:, -1:, :]) if return_state else x

shakespeare = ShakespeareModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [None]:
for input_example_batch, target_example_batch in dataset:
    print(input_example_batch.shape)
    example_batch_predictions, states = shakespeare(input_example_batch.unsqueeze(0), return_state=True)
    print(example_batch_predictions.shape, "# (sequence_length, vocab_size), 100 char as input")
    print(states.shape)
    break

In [None]:
from torch.utils.data import DataLoader
import torch.optim as optim
import os
import math

batch_size = 64
seq_length = 100
epochs = 5


# Create the dataset and DataLoader
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Initialize the model, loss function, and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(shakespeare.parameters(), lr=0.005)

# Set the path to save/load the model
model_path = "rnns.pth"

# Check if the model file exists
if os.path.exists(model_path):
    print("Loading existing model...")
    shakespeare.load_state_dict(torch.load(model_path))
    shakespeare.eval()  # Set model to evaluation mode if only inference is required
    shakespeare = shakespeare.to(device)
    print("Model loaded...")
else:
    shakespeare = ShakespeareModel(
        vocab_size=vocab_size,
        embedding_dim=embedding_dim,
        rnn_units=rnn_units).to(device)
    # Training loop
    for epoch in range(epochs):
        shakespeare.train()
        total_loss = 0
        
        for batch, (inputs, targets) in enumerate(dataloader):
            # Move data to the appropriate device (CPU/GPU)
            inputs, targets = inputs.to(device), targets.to(device)
            
            
            # Zero the gradient
            optimizer.zero_grad()

            # Forward pass
            outputs = shakespeare(inputs)  # Shape: (batch_size, seq_length, vocab_size)
            # Reshape outputs and targets for loss computation
            outputs = outputs.view(-1, vocab_size)  # Shape: (batch_size * seq_length, vocab_size)
            targets = targets.view(-1)  # Shape: (batch_size * seq_length)
             
            # Compute the loss
            loss = criterion(outputs, targets)
            
            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()

            if batch % int((len(dataloader)/10)) == 0:
                print(f"Epoch {epoch + 1}/{epochs}: {math.floor(batch/len(dataloader)*100)}")
        
        # Print epoch loss
        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")
    
    # Save the model after training
    torch.save(shakespeare.state_dict(), model_path)
    print(f"Model saved to {model_path}")

In [None]:
def generate_text(model, start_string, char_to_idx, idx_to_char, vocab_size, generation_length=100, temperature=1.0):
    model.eval()
    input_indices = torch.tensor([char_to_idx[char] for char in start_string], dtype=torch.long).unsqueeze(0).to(device)

    generated_text = start_string
    
    # Prime hidden state with the start string
    with torch.no_grad():
        outputs, states = model(input_indices, states=None, return_state=True)

    # Start with the last character
    input_indices = input_indices[:, -1:]
    print(states.shape)

    for _ in range(generation_length):
        with torch.no_grad():
            outputs, states = model(input_indices, states=states, return_state=True)

        # take only the last timestep
        logits = outputs[:, -1, :]  # (1, vocab_size)

        # om te bepalen hoe random je model is (moet je niet kennen voor theorie/praktijk)
        logits = logits / temperature
        probabilities = F.softmax(logits, dim=-1)

        next_index = torch.multinomial(probabilities.squeeze(), num_samples=1)
        generated_text += idx_to_char[next_index.item()]

        # feed next char back in
        input_indices = next_index.unsqueeze(0).to(device)

    return generated_text


In [None]:
# Example start string and generation parameters
start_string = "ROMEO: "
generation_length = 200
temperature = 0.8

# Generate text
generated_text = generate_text(
    model=shakespeare,
    start_string=start_string,
    char_to_idx=char_to_idx,
    idx_to_char=idx_to_char,
    vocab_size=vocab_size,
    generation_length=generation_length,
    temperature=temperature
)

print("Generated Text:")
print(generated_text)
