In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd

# Set the seed for reproducibility

In [None]:
# Fix PyTorch's global RNG seed so repeated runs start from the same random state.
torch.manual_seed(seed=42)

# Defining functions for processing text

In [1]:
def simple_tokenizer(text):
  """
  Lowercase the text and cut it into whitespace-separated tokens.

  Punctuation stays attached to the neighbouring word, e.g.
  "Hello World!" -> ["hello", "world!"].
  """
  lowered = text.lower()
  return lowered.split()

In [2]:
def build_vocab(texts, min_freq=1):
  """
  Create a token -> integer id lookup from a list/series of texts.

  Ids 0 and 1 are reserved for "<pad>" and "<unk>". Every other token that
  occurs at least `min_freq` times receives the next free id, in order of
  first appearance.

  Example: ["Hello world", "AI is amazing", "AI for the future"] gives
  {"<pad>": 0, "<unk>": 1, "hello": 2, "world": 3, "ai": 4, "is": 5,
   "amazing": 6, "for": 7, "the": 8, "future": 9}
  """
  # First pass: count how often each token occurs across all texts.
  frequencies = {}
  for text in texts:
    for token in simple_tokenizer(text):
      frequencies[token] = frequencies.get(token, 0) + 1

  # Second pass: hand out ids to the tokens that are frequent enough.
  vocab = {"<pad>": 0, "<unk>": 1}
  for token, seen in frequencies.items():
    if seen < min_freq or token in vocab:
      continue
    vocab[token] = len(vocab)
  return vocab

In [3]:
def text_to_sequence(text, vocab):
  """
  Encode a text as a list of vocabulary ids.

  The text is tokenized with simple_tokenizer and every token is looked up in
  `vocab`; tokens that are missing from the vocabulary get the "<unk>" id.

  Example: "Hello Bob World" with
  {"<pad>": 0, "<unk>": 1, "hello": 2, "world": 3} gives [2, 1, 3]
  """
  sequence = []
  for token in simple_tokenizer(text):
    sequence.append(vocab.get(token, vocab["<unk>"]))
  return sequence

In [4]:
def pad_sequence(sequence, max_length, pad_value=0):
  """
  Return a copy of `sequence` that is exactly `max_length` items long.

  Shorter sequences are right-padded with `pad_value`; longer ones are cut
  off after `max_length` items. Models require inputs of a consistent size.
  The input is never modified: a new list is always returned, so the
  caller's sequence keeps its original length.

  Args:
  - sequence (list): Token ids to pad or truncate.
  - max_length (int): Length of the returned list.
  - pad_value: Filler item appended to short sequences (default 0, the <pad> id).

  Returns:
  - list: A new list of length `max_length`.

  Example: pad_sequence([1, 2, 3], 5) returns [1, 2, 3, 0, 0]
  """
  # Slicing copies, so the padding below never touches the caller's list.
  clipped = list(sequence[:max_length])
  clipped.extend([pad_value] * (max_length - len(clipped)))
  return clipped

# Create Dataset Class

In [None]:
class TweetDataset(Dataset):
  """
  PyTorch Dataset wrapping the tweet texts and their labels.

  Each item is a pair of long tensors: the tweet encoded with `vocab` and
  padded/truncated to `max_len` token ids, and its label.
  `texts` and `labels` may be pandas Series or plain indexable sequences.
  """
  def __init__(self, texts, labels, vocab, max_len):
    self.texts, self.labels = texts, labels
    self.vocab, self.max_len = vocab, max_len

  def __len__(self):
    return len(self.texts)

  @staticmethod
  def _item_at(container, idx):
    # pandas Series are indexed by position via .iloc; anything else with [].
    if isinstance(container, pd.Series):
      return container.iloc[idx]
    return container[idx]

  def __getitem__(self, idx):
    text_item = self._item_at(self.texts, idx)
    label_item = self._item_at(self.labels, idx)
    token_ids = pad_sequence(text_to_sequence(text_item, self.vocab), self.max_len)
    features = torch.tensor(token_ids, dtype=torch.long)
    target = torch.tensor(label_item, dtype=torch.long)
    return features, target

# Design the Model

In [None]:
class Binary_Classifier(nn.Module):
  """
  Bidirectional LSTM classifier over padded sequences of token ids.

  Architecture:
  - Embedding layer: converts each token id into an `embed_dim` vector; the
    padding id is mapped to a fixed all-zero vector.
  - LSTM layer: reads each sequence in both directions. Sequences are packed
    by their true length so the padding is never fed through the LSTM.
  - Dropout layer (p=0.6): randomly drops features of the sequence
    representation during training to reduce overfitting.
  - Linear layer: maps the concatenated final hidden states of both
    directions to `num_classes` logits.

  Args:
  - vocab_size (int): Number of entries in the vocabulary.
  - embed_dim (int): Size of each token embedding.
  - hidden_dim (int): Hidden size of the LSTM, per direction.
  - num_classes (int): Number of output logits (default 2).
  - pad_idx (int): Token id used for padding (default 0).
  """
  def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes=2, pad_idx=0):
    super().__init__()
    self.pad_idx = pad_idx
    self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
    self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
    self.fc = nn.Linear(hidden_dim * 2, num_classes)
    self.dropout = nn.Dropout(p=0.6)

  def forward(self, x):
    """
    Compute class logits for a batch of padded token-id sequences.

    Args:
    - x (LongTensor): Token ids of shape [batch_size, seq_len].
      Assumes padding appears only at the end of each row.

    Returns:
    - Tensor: Logits of shape [batch_size, num_classes].
    """
    # True length of each row = number of non-pad ids. Rows that are all
    # padding are treated as length 1 because packing rejects zero lengths.
    # pack_padded_sequence expects the lengths on the CPU.
    lengths = (x != self.pad_idx).sum(dim=1).clamp(min=1).cpu()
    embedded = self.embedding(x)
    packed = nn.utils.rnn.pack_padded_sequence(
        embedded, lengths, batch_first=True, enforce_sorted=False
    )
    # h_n holds the last state of each direction: h_n[-2] is the forward pass
    # after the last real token, h_n[-1] the backward pass after the first.
    # Indexing the padded output at position -1 would instead read a backward
    # state that has seen a single (usually padding) token.
    _, (h_n, _) = self.lstm(packed)
    features = torch.cat((h_n[-2], h_n[-1]), dim=1)
    logits = self.fc(self.dropout(features))
    return logits

# Design the training function

In [None]:
def train_model(model, train_loader, val_loader, epochs, criterion, optimizer):
  """
  Run `epochs` passes over the training data and score the validation set
  after every pass.

  Args:
  - model (nn.Module): The LSTM classifier.
  - train_loader (DataLoader): Batches of (input_ids, labels) for training.
  - val_loader (DataLoader): Batches of (input_ids, labels) for validation.
  - epochs (int): Number of training epochs.
  - criterion: Loss function applied to (model outputs, labels).
  - optimizer: Optimization algorithm updating the model parameters.

  Returns:
  - (train_losses, val_losses): Two lists holding, per epoch, the loss
    averaged over the batches of the respective loader.
  """
  train_losses, val_losses = [], []

  for epoch_idx in range(epochs):
    epoch_no = epoch_idx + 1

    # Optimisation pass over the training batches.
    model.train()
    running = 0.0
    for batch_inputs, batch_labels in train_loader:
      optimizer.zero_grad()
      batch_loss = criterion(model(batch_inputs), batch_labels)
      batch_loss.backward()
      optimizer.step()
      running += batch_loss.item()
    train_losses.append(running / len(train_loader))
    print(f'Epoch {epoch_no}/{epochs} - Training Loss: {train_losses[-1]:.4f}')

    # Validation pass: evaluation mode, no gradient tracking.
    model.eval()
    running = 0.0
    with torch.no_grad():
      for batch_inputs, batch_labels in val_loader:
        running += criterion(model(batch_inputs), batch_labels).item()
    val_losses.append(running / len(val_loader))
    print(f'Epoch {epoch_no}/{epochs} - Validation Loss: {val_losses[-1]:.4f}')

  # Hand the per-epoch loss curves back to the caller.
  return train_losses, val_losses

# Import the Data

In [None]:
# All four TSV files are read with the same tab separator and explicit column names.
# Because `names` is given without `header`, pandas treats the first line of each file as data.
read_options = {"sep": '\t', "names": ["Id", "Text", "Label"]}
valid_df = pd.read_csv("valid.tsv", **read_options)
train_df = pd.read_csv("train.tsv", **read_options)
test_df = pd.read_csv("test.tsv", **read_options)
noisy_df = pd.read_csv("unlabeled_test_with_noise.tsv", **read_options)

In [None]:
# Some cleaning of train data so model trains properly:
# discard the first row of the training frame and renumber the index from 0.
# NOTE(review): presumably that row is the header line of train.tsv, kept as
# data because read_csv was given explicit column names - confirm.
# A new frame is built instead of dropping in place. Re-running this cell
# still removes one more row, so re-run the loading cell first.
train_df = train_df.iloc[1:].reset_index(drop=True)

# Initial Data Processing

In [None]:
# String label -> binary class id (0 = UNINFORMATIVE, 1 = INFORMATIVE)
label_mapping = dict(UNINFORMATIVE=0, INFORMATIVE=1)

In [None]:
# Replace the string labels with their binary ids in every dataset.
# Values that are not keys of label_mapping become NaN.
for _frame in (valid_df, noisy_df, train_df, test_df):
  _frame["Label"] = _frame["Label"].map(label_mapping)

In [None]:
# Vocabulary from the training texts only; tokens seen fewer than 2 times are left out
vocab = build_vocab(texts=train_df["Text"], min_freq=2)

In [None]:
# Define maximum token sequence length (adjust as needed).
# Passed to TweetDataset, where every tweet is truncated or zero-padded to
# exactly this many token ids.
max_len = 45

In [None]:
# Pull the text and label columns out of the training and validation frames
train_texts = train_df["Text"]
train_labels = train_df["Label"]
valid_texts = valid_df["Text"]
valid_labels = valid_df["Label"]

# Create Datasets and Dataloaders

In [None]:
# Wrap the training and validation splits in Datasets ...
train_dataset = TweetDataset(texts=train_texts, labels=train_labels, vocab=vocab, max_len=max_len)
valid_dataset = TweetDataset(texts=valid_texts, labels=valid_labels, vocab=vocab, max_len=max_len)

# ... and batch them; only the training batches are shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=32)

# Initialize and Train the Model

In [None]:
# Model hyperparameters
vocab_size = len(vocab)
embed_dim = 75
hidden_dim = 96

# Model, loss function and optimizer
model = Binary_Classifier(vocab_size=vocab_size, embed_dim=embed_dim, hidden_dim=hidden_dim)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3, weight_decay=1e-4)

In [None]:
# Train for a fixed number of epochs and keep the per-epoch loss curves
epochs = 10
train_losses, val_losses = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=valid_loader,
    epochs=epochs,
    criterion=criterion,
    optimizer=optimizer,
)

# Plot the losses

In [None]:
import matplotlib.pyplot as plt

# Training and validation loss per epoch on one set of axes
epoch_axis = range(1, epochs + 1)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(epoch_axis, train_losses, label='Training Loss')
ax.plot(epoch_axis, val_losses, label='Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss Over Epochs')
ax.legend()
ax.grid(True)
plt.show()

# Test the Model

In [None]:
# Build the test dataset using our dataset class
# Work on a copy of the test dataframe so test_df itself is left untouched
# when prediction columns are added to the result frame.
result_test_df = test_df.copy()
test_texts, test_labels = result_test_df["Text"], result_test_df["Label"]

# Create the dataset and loader.
# max_len is deliberately reused from the training setup rather than declared
# again: the test sequences must have the same length as the training ones.
# shuffle=False keeps the predictions in the row order of result_test_df.
test_dataset = TweetDataset(test_texts, test_labels, vocab, max_len)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# Predictions
import torch
import numpy as np

# Evaluation mode for inference
model.eval()

# Predicted class ids, collected batch by batch in loader order
all_preds = []

with torch.no_grad():
  for batch_ids, _ in test_loader:
    batch_logits = model(batch_ids)  # shape [batch_size, num_classes]
    # The predicted label is the index of the largest logit in each row
    batch_preds = batch_logits.argmax(dim=1)
    all_preds.extend(batch_preds.cpu().numpy())

# Turn the collected predictions into a single numpy array
all_preds = np.array(all_preds)

In [None]:
# Create and output the dataframe

# Class id -> string label, used to make both label columns human-readable
reverse_label_mapping = {0: "UNINFORMATIVE", 1: "INFORMATIVE"}

# Attach the predicted class ids as a new column
result_test_df["predicted Label"] = all_preds

# Convert the predicted and the true labels back to their string form
for _column in ("predicted Label", "Label"):
  result_test_df[_column] = result_test_df[_column].map(reverse_label_mapping)

In [None]:
# Show the test frame with the true and the predicted labels
# (the bare last expression of a cell is rendered as its output).
result_test_df

# Accuracy check on test

In [None]:
# Rows where the true and the predicted label disagree
mismatches = result_test_df["Label"] != result_test_df["predicted Label"]
different_rows = len(result_test_df[mismatches])
total_rows = len(result_test_df)
print(f"Number of different rows: {different_rows}")
print(f"Total number of rows: {total_rows}")

# Accuracy = share of rows without a mismatch, as a percentage
accuracy_pct = 100 * (1 - different_rows / total_rows)
print(f"Model Accuracy: {accuracy_pct} %")