In [1]:
from google.colab import files
uploaded = files.upload()

Saving sample_submission.csv to sample_submission (2).csv
Saving test.csv to test (2).csv
Saving train.csv to train (2).csv


In [3]:
import torch
import numpy as np
import pandas as pd
from torch import nn
from transformers import Trainer, TrainingArguments, AutoTokenizer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from datasets import Dataset

In [2]:
pip install datasets emoji



In [4]:
# Step 1: Load and clean the datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [5]:
# Define a function to clean the tweets
import re
import emoji

def clean_tweet(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet)
    tweet = re.sub(r'@\w+', '', tweet)
    tweet = re.sub(r'#', '', tweet)
    tweet = re.sub(r'\d+', '', tweet)
    tweet = re.sub(r'[^\w\s]', '', tweet)
    tweet = emoji.demojize(tweet, delimiters=(" ", " "))
    return tweet

train_df['Cleaned_Tweet'] = train_df['Tweet'].apply(clean_tweet)
test_df['Cleaned_Tweet'] = test_df['Tweet'].apply(clean_tweet)

In [6]:
# Step 2: Split the train dataset into training and validation sets
label_columns = [col for col in train_df.columns if col not in ['ID', 'Tweet', 'Cleaned_Tweet']]

X_train, X_valid, y_train, y_valid = train_test_split(
    train_df['Cleaned_Tweet'], train_df[label_columns], test_size=0.2, random_state=42
)

# Convert y_train and y_valid to numpy arrays
y_train_array = np.array(y_train)
y_valid_array = np.array(y_valid)

# Placeholder labels for the testset (since test labels are unknown)
num_labels = len(label_columns)
test_labels_placeholder = np.zeros((test_df.shape[0], num_labels))


In [7]:
# Step 3: Initialize a tokenizer from HuggingFace (BERT tokenizer)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)

# Step 4: Tokenize X_train, X_valid, and X_test using the tokenizer (without using embeddings)
def tokenize_texts(texts):
    return tokenizer(texts, padding='max_length', truncation=True, max_length=128)

# Tokenize the training, validation, and test sets
X_train_tokenized = tokenize_texts(X_train.tolist())
X_valid_tokenized = tokenize_texts(X_valid.tolist())
X_test_tokenized = tokenize_texts(test_df['Cleaned_Tweet'].tolist())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
# Step 5: Create HuggingFace Datasets using Dataset.from_dict
trainset = Dataset.from_dict({
    'input_ids': X_train_tokenized['input_ids'],
    'attention_mask': X_train_tokenized['attention_mask'],
    'labels': y_train_array.tolist()
})

validset = Dataset.from_dict({
    'input_ids': X_valid_tokenized['input_ids'],
    'attention_mask': X_valid_tokenized['attention_mask'],
    'labels': y_valid_array.tolist()
})

testset = Dataset.from_dict({
    'input_ids': X_test_tokenized['input_ids'],
    'attention_mask': X_test_tokenized['attention_mask'],
    'labels': test_labels_placeholder.tolist()  # Placeholder labels for test set
})


In [9]:
# Set format for PyTorch
trainset.set_format('torch')
validset.set_format('torch')
testset.set_format('torch')

In [20]:
# Step 6: Calculate class weights based on label imbalance
# For each label, compute the weight as total_samples / (num_classes * number of samples with that label)
class_counts = np.sum(y_train_array, axis=0)
total_samples = len(y_train_array)
pos_weights = (total_samples - class_counts) / class_counts  # Calculating pos_weight
pos_weight_tensor = torch.tensor(pos_weights, dtype=torch.float32).to(device)  # Move to device

In [21]:

# Step 7: Define an improved feed-forward neural network with dropout and batch normalization
class ImprovedFeedForwardNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_labels):
        super(ImprovedFeedForwardNN, self).__init__()
        # Embedding layer (Randomly initialized embeddings, NOT pre-trained)
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc1 = nn.Linear(embed_dim * 128, 512)  # 128 max tokens, and embed_dim for each token
        self.batchnorm1 = nn.BatchNorm1d(512)  # Batch normalization after first FC layer
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)  # Add dropout to prevent overfitting
        self.fc2 = nn.Linear(512, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
        embedded = self.embedding(input_ids)  # Convert input_ids to embeddings
        embedded_flattened = embedded.view(embedded.size(0), -1)  # Flatten the embedding layer
        x = self.fc1(embedded_flattened)
        x = self.batchnorm1(x)  # Apply batch normalization
        x = self.relu(x)
        x = self.dropout(x)  # Apply dropout
        logits = self.fc2(x)

        loss = None
        if labels is not None:
            labels = labels.float()
            # Apply pos_weight in BCEWithLogitsLoss for handling imbalance
            loss_fct = nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor)
            loss = loss_fct(logits, labels)
        return loss, logits

In [22]:
# Step 8: Define a custom compute_metrics function to evaluate the F1 score
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = (logits > 0.5).astype(int)  # Binary predictions
    f1 = f1_score(labels, predictions, average='micro')  # Micro F1 score for multilabel classification
    return {'f1': f1}

In [23]:
# Step 9: Define training arguments for HuggingFace Trainer
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,  # Increase the number of epochs for better learning
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    learning_rate=5e-5,  # Tune the learning rate
)



In [24]:
# Step 10: Initialize the custom model
vocab_size = tokenizer.vocab_size
embed_dim = 128  # Can be adjusted
model = ImprovedFeedForwardNN(vocab_size=vocab_size, embed_dim=embed_dim, num_labels=num_labels)


In [25]:
# Move the model to the correct device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Step 11: Initialize the HuggingFace Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=trainset,
    eval_dataset=validset,
    compute_metrics=compute_metrics
)

In [26]:
# Step 12: Train the model
trainer.train()


Epoch,Training Loss,Validation Loss,F1
1,1.1025,1.101011,0.102421
2,0.9813,1.125771,0.196844
3,0.8161,1.141234,0.23723
4,0.8046,1.147555,0.24958


Epoch,Training Loss,Validation Loss,F1
1,1.1025,1.101011,0.102421
2,0.9813,1.125771,0.196844
3,0.8161,1.141234,0.23723
4,0.8046,1.147555,0.24958
5,0.7077,1.166396,0.25891


TrainOutput(global_step=3865, training_loss=0.8973992902896383, metrics={'train_runtime': 864.7254, 'train_samples_per_second': 35.728, 'train_steps_per_second': 4.47, 'total_flos': 0.0, 'train_loss': 0.8973992902896383, 'epoch': 5.0})

In [28]:
# Step 13: Generate predictions on the test data
predictions = trainer.predict(testset)

# Convert predictions to binary format
predicted_labels = np.where(predictions.predictions > 0.5, 1, 0)

In [31]:
# Step 14: Prepare the submission file
submission_df = pd.read_csv('sample_submission.csv')
submission_df.iloc[:, 1:] = predicted_labels  # Fill predictions in the correct columns
submission_df.to_csv('submission.csv', index=False)


In [32]:
# Download the saved submission file
files.download("submission.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>