In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# The Task
Train several models of recurrent neural networks, for example LSTM, GRU, Bidirectional-LSTM.
Calculate the value of the metric that you proposed in Part 1 and compare the results for the different RNNs, heuristics, and classical ML.

### Read the preprocessed data

In [None]:
# Load the preprocessed tweets from the Kaggle input dataset (parquet, read via pyarrow).
tweets_path = '/kaggle/input/normalized-tweets/Tweets.parquet'
data = pd.read_parquet(tweets_path, engine='pyarrow')

### Check the loaded data

In [None]:
# Print a summary of the loaded frame: columns, dtypes, non-null counts and memory usage.
data.info()

In [None]:
# Show 10 randomly drawn rows as a quick sanity check (no seed is passed, so the rows differ between runs).
data.sample(10)

Everything looks fine, so we can start training our models.

# Simple Linear Model
Let's start with a simple model built from linear (fully connected) layers in PyTorch.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data import get_tokenizer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import torch.nn.functional as F
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

## Check device, and save the information about it

In [None]:
# Prefer the first CUDA GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

## Tokenization and text preprocessing

In [None]:
# English stopwords from NLTK, kept as a set for fast membership tests in preprocess_text.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Lowercase a text and drop stopwords and non-alphabetic tokens.

    Args:
        text: Text to clean. Non-string values (e.g. ``None`` or the NaN of a
            missing cell) are treated as empty text.

    Returns:
        str: The kept tokens, lowercased and joined by single spaces; an
        empty string when no token is kept.
    """
    # A missing value is not tokenizable; map it to empty text instead of
    # letting the tokenizer fail in the middle of a column-wide ``apply``.
    if not isinstance(text, str):
        return ''
    # Tokenize the text
    tokens = word_tokenize(text)
    # Keep purely alphabetic tokens, lowercased once, ...
    lowered = (word.lower() for word in tokens if word.isalpha())
    # ... and remove stopwords
    return ' '.join(word for word in lowered if word not in stop_words)

## Apply preprocessing to the 'Normalized_Text' column

In [None]:
# Clean every normalized tweet with preprocess_text and store the result in a new 'Processed_Text' column.
data['Processed_Text'] = data['Normalized_Text'].apply(preprocess_text)

## Encode labels using LabelEncoder

In [None]:
# Learn the string-label -> integer-id mapping from the 'Sentiment' column ...
label_encoder = LabelEncoder().fit(data['Sentiment'])
# ... and store the integer ids next to the original labels.
data['Sentiment_encoded'] = label_encoder.transform(data['Sentiment'])

## Split the data into train, test, and validation sets

In [None]:
# Hold out 20% of all rows as the test set ...
train_valid_df, test_df = train_test_split(data, test_size=0.2, random_state=42)
# ... then hold out 20% of the remaining rows for validation.
train_df, valid_df = train_test_split(train_valid_df, test_size=0.2, random_state=42)


## Vectorize the text data

In [None]:
# Bag-of-words counts. The vocabulary is fitted on the training split only and
# then applied unchanged to the test and validation splits.
# The cleaned 'Processed_Text' column (stopwords and non-alphabetic tokens
# removed) is vectorized; reading 'Normalized_Text' here left that
# preprocessing step computed but unused.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(train_df['Processed_Text'])
X_test_vec = vectorizer.transform(test_df['Processed_Text'])
X_valid_vec = vectorizer.transform(valid_df['Processed_Text'])

## Convert data to PyTorch tensors

In [None]:
def _dense_float_tensor(sparse_counts):
    """Densify a sparse count matrix into a float32 tensor moved to `device`."""
    return torch.tensor(sparse_counts.toarray(), dtype=torch.float32).to(device)


X_train_tensor = _dense_float_tensor(X_train_vec)
X_test_tensor = _dense_float_tensor(X_test_vec)
X_valid_tensor = _dense_float_tensor(X_valid_vec)



## Convert labels to PyTorch tensors

In [None]:
def _label_tensor(frame):
    """Return the encoded sentiment labels of `frame` as an int64 tensor moved to `device`."""
    return torch.tensor(frame['Sentiment_encoded'].values, dtype=torch.long).to(device)


y_train_tensor = _label_tensor(train_df)
y_test_tensor = _label_tensor(test_df)
y_valid_tensor = _label_tensor(valid_df)

In [None]:
# Number of distinct encoded sentiment classes = size of the model's output layer.
output_dim = data['Sentiment_encoded'].nunique(dropna=False)
print(output_dim)

## Define the model with dropout

In [None]:
class MultiLabelModelWithDropout(nn.Module):
    """Feed-forward classifier: Linear -> ReLU -> Dropout -> Linear.

    Args:
        input_dim: Size of each input feature vector.
        output_dim: Number of output scores produced per sample.
        dropout_rate: Dropout probability applied to the hidden activations.
    """

    def __init__(self, input_dim, output_dim, dropout_rate=0.5):
        super().__init__()
        hidden_units = 256
        self.fc1 = nn.Linear(input_dim, hidden_units)
        self.dropout = nn.Dropout(p=dropout_rate)
        self.fc2 = nn.Linear(hidden_units, output_dim)

    def forward(self, x):
        """Return the raw output scores (logits) for the batch `x`."""
        hidden = self.dropout(F.relu(self.fc1(x)))
        # Raw scores are returned as-is: no sigmoid activation here
        return self.fc2(hidden)

### Instantiate the model with dropout

In [None]:
# Seed PyTorch's random number generators so that weight initialization (and
# the dropout masks drawn later during training) are repeatable between runs;
# the data split is already seeded with the same value.
torch.manual_seed(42)

model = MultiLabelModelWithDropout(
    input_dim=X_train_tensor.shape[1],  # one input feature per column of the training matrix
    output_dim=output_dim,
    dropout_rate=0.5,
)
model.to(device)

### Define the loss function and optimizer

In [None]:
# Use CrossEntropyLoss for multi-class classification
criterion = nn.CrossEntropyLoss()

# Adam over all model parameters; experiment with different learning rates
learning_rate = 0.01
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

## Training loop with dropout and validation

In [None]:
num_epochs = 180
l2_reg = 0.001  # L2 penalty coefficient (loop-invariant, so defined once)
train_losses = []
valid_losses = []

for epoch in range(num_epochs):
    # Training: one full-batch gradient step per epoch
    model.train()
    optimizer.zero_grad()
    predictions = model(X_train_tensor)
    data_loss = criterion(predictions, y_train_tensor)

    # Add L2 regularization over all parameters to the objective being optimized
    l2_penalty = sum(param.pow(2).sum() for param in model.parameters())
    loss = data_loss + l2_reg * l2_penalty

    loss.backward()
    optimizer.step()

    # Log the cross-entropy term only: the validation loss below carries no
    # L2 penalty, so logging the penalized loss would make the two curves
    # incomparable in the plot.
    train_losses.append(data_loss.item())

    # Validation (eval mode disables dropout; no gradients are tracked)
    model.eval()
    with torch.no_grad():
        predictions_valid = model(X_valid_tensor)
        loss_valid = criterion(predictions_valid, y_valid_tensor)
        valid_losses.append(loss_valid.item())

    if epoch % 10 == 0:
        # Report progress every 10th epoch
        print(f"Epoch [{epoch + 1} / {num_epochs}], Train Loss: {train_losses[-1]:.4f}, Valid Loss: {valid_losses[-1]:.4f}")
        torch.cuda.empty_cache()


### Plot the training and validation losses

In [None]:

# Draw both per-epoch loss curves on one set of axes.
fig, ax = plt.subplots()
ax.plot(train_losses, label='Training Loss')
ax.plot(valid_losses, label='Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss Over Time')
ax.legend()
plt.show()

### Evaluation loop and metric calculation on the test set

In [None]:

# Evaluate on the held-out test set: eval mode disables dropout and no
# gradients are tracked.
model.eval()
with torch.no_grad():
    # Run the model once and reuse the logits (a second forward pass over the
    # whole test set would only repeat the same computation).
    predictions_test = model(X_test_tensor)
    # Class probabilities (each row sums to 1) for the ROC AUC computation
    probabilities_test = F.softmax(predictions_test, dim=1)
    # Hard label = index of the most probable class
    predicted_labels_test = probabilities_test.argmax(dim=1)


### Ensure tensors are on the same device

In [None]:
# Move both prediction tensors to wherever the test labels live.
target_device = y_test_tensor.device
probabilities_test = probabilities_test.to(target_device)
predicted_labels_test = predicted_labels_test.to(target_device)


## Calculate metrics using torchmetrics

In [None]:
import torchmetrics
from torchmetrics import F1Score as F1


# Macro-averaged multiclass ROC AUC, computed from the class probabilities.
roc_auc = torchmetrics.functional.classification.auroc(
    probabilities_test,
    y_test_tensor,
    task='multiclass',
    num_classes=output_dim,
    average='macro',
)

# Macro-averaged multiclass F1, computed from the hard label predictions.
f1 = F1(task='multiclass', num_classes=output_dim, average='macro').to(device)
f1_metric = f1
f1_test = f1_metric(predicted_labels_test, y_test_tensor)

print(f"Test ROC AUC: {roc_auc:.4f}, Test F1 Score: {f1_test:.4f}")

## Conclusion:

The results are almost the same as those of the classical models.

I tried different numbers of epochs and settled on 180 epochs with lr = 0.01.