In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from utils import *
from preprocessing import *
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import  classification_report

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\urbi1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# RNN Model Definition

In [3]:
class RNN(nn.Module):
    """Text classifier: embedding -> (stacked) vanilla RNN -> dropout -> linear layer.

    Class logits are computed from the RNN output at the last time step of
    each sequence.
    """

    def __init__(self, vocabulary_size, num_classes, embedding_dim, hidden_dim, n_layers, dropout=0.5, weight_decay=1e-5):
        """
        Args:
        vocabulary_size (int): The size of the vocabulary (number of unique tokens in the input text).
        num_classes (int): The number of output classes (labels).
        embedding_dim (int): The dimension of the word embeddings (the vector size representing each word).
        hidden_dim (int): The number of units in the hidden state of the RNN.
        n_layers (int): The number of RNN layers to stack.
        dropout (float, optional): The probability for dropout regularization (default is 0.5).
        weight_decay (float, optional): Stored unchanged as ``self.weight_decay``; the module
            itself never reads it (default is 1e-5).
        """
        super().__init__()
        self.embedding = nn.Embedding(vocabulary_size, embedding_dim)
        # nn.RNN applies its dropout only *between* stacked layers. With a single
        # layer a non-zero value has no effect and raises a UserWarning, so pass 0.
        inter_layer_dropout = dropout if n_layers > 1 else 0.0
        self.rnn = nn.RNN(embedding_dim, hidden_dim, n_layers, dropout=inter_layer_dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.weight_decay = weight_decay

    def forward(self, x):
        """Return class logits of shape (batch, num_classes).

        Args:
        x (torch.Tensor): Integer token indices, batch first (the RNN is built with batch_first=True).
        """
        embedded = self.embedding(x)
        # The final hidden state returned by nn.RNN is not needed here.
        out, _ = self.rnn(embedded)
        out = self.dropout(out)
        # Classify from the last time step only.
        # NOTE(review): if tokenize_and_pad pads on the right, this step is a padding
        # position for short sequences -- confirm the padding side.
        out = self.fc(out[:, -1])
        return out

# Yangswei_85

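To use the Yangswei_85 dataset, run the cell below and skip the T5 cell. Both cells assign the same variables (`df_train`, `test_df`, `dataset_name`), so running both leaves the T5 data loaded.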

In [4]:
# Yangswei_85: read the training split ...
df_train = pd.read_csv('data/train_yangswei_85.csv')
# ... and the held-out test split
test_df = pd.read_csv('data/test_yangswei_85.csv')
# Name handed to plot_losses later in the notebook
dataset_name = 'Yangswei_85'

# T5 
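To use the T5 dataset, run the cell below instead of the Yangswei_85 cell. It overwrites `df_train`, `test_df` and `dataset_name`, so under "Run All" this is the dataset that ends up being used.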

In [5]:
# T5: read the training split ...
df_train = pd.read_csv('data/train_t5.csv')
# ... and the held-out test split
test_df = pd.read_csv('data/test_t5.csv')
# Name handed to plot_losses later in the notebook
dataset_name = 'T5'

# Training and Validation

## Training Data Preprocessing

In [6]:
# Preprocess training data.
# Series.apply returns a NEW Series and leaves df_train untouched, so the result
# has to be kept and passed on. The previous version discarded it and tokenized
# the raw text instead.
train_text = df_train['text'].apply(preprocess).to_frame()

# Tokenize and pad the preprocessed text (single-column frame named 'text').
# train_vocabulary is reused later to tokenize the test set.
padded_sequences, train_vocabulary, vocab_size = tokenize_and_pad(train_text)

# Set training data
train_data = padded_sequences

# Encode labels and save the classes in an npy file so the test section can
# rebuild the same encoder.
label_encoder = LabelEncoder()
label_encoder.fit(df_train['label'])
np.save('data/label_classes.npy', label_encoder.classes_)
train_labels = torch.tensor(label_encoder.transform(df_train['label']), dtype=torch.long)

## Tune Hyperparameters

In [15]:
# Set fixed model parameters
epochs = 500
patience = 100
num_classes = len(label_encoder.classes_)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Constructor arguments that are the same for every candidate model
model_args = {
    'vocabulary_size': vocab_size,
    'num_classes': num_classes
}

# Set the hyperparameters to tune (every combination of the listed values)
param_grid = {
    'embedding_dim': [300],  #200,300
    'hidden_dim': [5], #128,256
    'n_layers': [1], #1,2
    'dropout': [0.7],  #0.4, 0.5, 0.6
    'learning_rate': [0.001],  #0.0001, 0.001, 0.01
    'weight_decay': [1e-3]
}

# Tune the model hyperparameters
best_params, best_accuracy = tune_hyperparams(RNN, model_args, train_data, train_labels, param_grid, epochs, device, patience)
print(f"Best Parameters: {best_params}")
print(f"Best Accuracy: {best_accuracy}")

# Set the best hyperparameters.
# Read each value by key: the grid has six entries, so unpacking
# best_params.values() into five names raises ValueError, and positional
# unpacking silently depends on dict ordering.
# NOTE(review): assumes best_params uses the same keys as param_grid -- confirm in tune_hyperparams.
embedding_dim = best_params['embedding_dim']
hidden_dim = best_params['hidden_dim']
n_layers = best_params['n_layers']
dropout = best_params['dropout']
learning_rate = best_params['learning_rate']

Fold 1/5, Params: {'embedding_dim': 300, 'hidden_dim': 5, 'n_layers': 1, 'dropout': 0.7, 'learning_rate': 0.001, 'weight_decay': 0.001}


TypeError: RNN.__init__() got an unexpected keyword argument 'weight_decay'

## Set model with best hyperparameters

In [None]:
# Build the final model the same way tune_hyperparams builds its candidates:
# it passes weight_decay to the RNN constructor, so forward the tuned value
# instead of silently falling back to the constructor default. If the key is
# absent, 1e-5 (the constructor default) reproduces the previous behaviour.
tuned_weight_decay = best_params.get('weight_decay', 1e-5)
rnn_model = RNN(vocab_size, num_classes, embedding_dim, hidden_dim, n_layers, dropout, tuned_weight_decay).to(device)
criterion = nn.CrossEntropyLoss()
# NOTE(review): the optimizer is created without weight_decay. If tune_hyperparams
# applies it through the optimizer, pass weight_decay=tuned_weight_decay here too -- confirm.
optimizer = optim.Adam(rnn_model.parameters(), lr=learning_rate)

## Train and Validate best model

In [None]:
# Announce the final training run
print("Start Training and Validation:\n")

# Carve a validation loader out of the training tensors
train_loader, val_loader = train_val_split(train_data, train_labels)

# Run training with validation on the selected model and unpack everything
# train_and_validate hands back
train_losses, val_losses, val_accs, model = train_and_validate(rnn_model, optimizer, criterion, train_loader, val_loader, epochs, device)

# Draw the training and validation loss curves for this model / dataset
plot_losses("RNN", dataset_name, train_losses, val_losses, epochs)


In [None]:
# Bundle the constructor arguments together with the trained weights so the
# model can be rebuilt from this single checkpoint file.
model_config = dict(
    vocab_size=vocab_size,
    num_classes=num_classes,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
    dropout=dropout,
    state_dict=model.state_dict(),
)

# Write the checkpoint to disk
torch.save(model_config, "models/RNN.pth")

# Test

## Test Data Preprocessing

In [None]:
# Preprocess test data.
# Series.apply returns a NEW Series, so keep the result and tokenize it. The
# previous version discarded it and tokenized the raw text instead.
test_text = test_df['text'].apply(preprocess).to_frame()

# Tokenize and pad test data with the vocabulary built from the training set
padded_sequences, _, _ = tokenize_and_pad(test_text, train_vocabulary)

# Set test data
test_data = padded_sequences

# Encode test labels by rebuilding the encoder used for the training labels.
# NOTE: allow_pickle=True unpickles the file's contents; only load a
# label_classes.npy that this notebook wrote itself.
label_classes = np.load('data/label_classes.npy', allow_pickle=True)
label_encoder = LabelEncoder()
label_encoder.classes_ = label_classes
# Explicit int64 dtype, matching how the training labels are built
test_labels = torch.tensor(label_encoder.transform(test_df['label']), dtype=torch.long)

## Test best model on test data

In [None]:
# Wrap the test data in a DataLoader (no shuffling, so predictions stay in dataset order)
test_loader = DataLoader(TensorDataset(test_data, test_labels), batch_size=128, shuffle=False)

# Load trained model.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU machine can still be loaded on a CPU-only one.
model_config = torch.load("models/RNN.pth", map_location=device)
model = RNN(model_config["vocab_size"], model_config["num_classes"], model_config["embedding_dim"], model_config["hidden_dim"], model_config["n_layers"], model_config["dropout"])
model.load_state_dict(model_config["state_dict"])
model.to(device)
# Switch dropout off for inference. Harmless if test() already does this.
model.eval()

# Test model on test set
predictions, true_labels = test(model, test_loader, device)

## Compute metrics on model performance

In [None]:

# Score the predictions once, then print accuracy followed by the macro and
# weighted precision / recall / F1 sections.
metrics = compute_metrics(predictions, true_labels)
print(f"Test Accuracy: {metrics['accuracy']:.4f}")

# Each entry: (section heading, label prefix, suffix of the metrics key)
for heading, prefix, key_suffix in (
    ('Macro Metrics', 'Macro', ''),
    ('Weighted Metrics', 'Weighted', '_weighted'),
):
    print('\n')
    print(heading)
    print(f"{prefix} Precision: {metrics['precision' + key_suffix]:.4f}")
    print(f"{prefix} Recall: {metrics['recall' + key_suffix]:.4f}")
    print(f"{prefix} F1 Score: {metrics['f1' + key_suffix]:.4f}")

# Confusion matrix with the decoded class names on the axes
plot_confusion_matrix(true_labels, predictions, label_classes)

In [None]:

# Per-class precision / recall / F1 table, labelled with the decoded class names
print('Classification Report:\n')
report_text = classification_report(true_labels, predictions, target_names=label_classes)
print(report_text)
