In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel,AutoModelForMaskedLM
from tqdm import tqdm
import random
from sklearn.model_selection import train_test_split
import numpy as np
from unidecode import unidecode
import os
import xgboost as xgb
import json
import re
from unidecode import unidecode
from bs4 import BeautifulSoup

In [2]:
def set_seed(seed):
    """Seed every random number generator used here and request deterministic torch behaviour.

    Args:
        seed: Integer seed handed to Python's ``random``, NumPy and torch
            (CPU generator plus the CUDA seeding entry points).

    Side effects:
        Changes process-wide torch settings (cuDNN flags, deterministic
        algorithms, float32 matmul precision, thread count) and sets the
        ``CUBLAS_WORKSPACE_CONFIG`` environment variable.
    """
    # Same seed for every generator, applied in a fixed order.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # cuDNN: use deterministic kernels and turn off the auto-tuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # Ask torch for deterministic implementations of its operators.
    torch.use_deterministic_algorithms(True)
    os.environ.update({"CUBLAS_WORKSPACE_CONFIG": ":16:8"})

    torch.set_float32_matmul_precision('high')
    torch.set_num_threads(1)


# Single seed shared by the whole notebook.
RANDOM_SEED = 42
set_seed(RANDOM_SEED)

In [3]:
def pre_process(text):
    """Clean one raw review string for tokenization.

    Steps, in order: strip HTML markup, transliterate to ASCII, lowercase,
    drop every character that is not a letter, digit, whitespace or one of
    ``. , ! ? '``, then collapse runs of whitespace and trim the ends.

    Args:
        text: Raw review text. Missing values (``None``, float NaN,
            ``pd.NA``) are accepted and produce an empty string; any other
            non-string value is converted with ``str`` first.

    Returns:
        The cleaned, lower-cased string (possibly empty).
    """
    if not isinstance(text, str):
        # pandas represents empty CSV cells as NaN (a float); handing that to
        # BeautifulSoup would raise, so treat missing values as empty text.
        # ``text != text`` is the dependency-free NaN check.
        if text is None or text is pd.NA or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    # Remove HTML tags/entities, keeping only the visible text.
    text = BeautifulSoup(text, "html.parser").get_text()

    # Transliterate accented / non-Latin characters to plain ASCII.
    text = unidecode(text)

    text = text.lower()

    # Keep letters, digits, whitespace and basic sentence punctuation only.
    text = re.sub(r"[^a-zA-Z0-9.,!?'\s]", "", text)

    # Collapse consecutive whitespace into single spaces.
    text = re.sub(r"\s+", " ", text).strip()

    return text

# Load in the data

In [4]:
# Directory prefix for every input file; it is joined to file names by plain
# string concatenation, so the trailing slash is required.
root = 'reviews/'

In [5]:
def _read_train_shards(shard_numbers):
    """Read ``train-<n>.csv`` for each shard number and stack them into one frame."""
    shard_frames = [pd.read_csv(f"{root}train-{number}.csv") for number in shard_numbers]
    return pd.concat(shard_frames, ignore_index=True)


# Shards 1-6 are used for fitting; shards 7-8 are held out as the local test split.
train_data = _read_train_shards(range(1, 7))
test_data = _read_train_shards((7, 8))

In [6]:
# Apply the shared text cleaning to the review body of both labelled splits.
for labelled_frame in (train_data, test_data):
    labelled_frame['review_body'] = labelled_frame['review_body'].apply(pre_process)

  text = BeautifulSoup(text, "html.parser").get_text()
  text = BeautifulSoup(text, "html.parser").get_text()


In [7]:
# Unlabelled sets for which submission files are generated later.
validation_hidden_df = pd.read_csv(f"{root}validation_hidden.csv")
test_hidden_df = pd.read_csv(f"{root}test_hidden.csv")

# Clean them exactly like the labelled data so the model sees the same text format.
for hidden_frame in (validation_hidden_df, test_hidden_df):
    hidden_frame['review_body'] = hidden_frame['review_body'].apply(pre_process)

  text = BeautifulSoup(text, "html.parser").get_text()


In [8]:
#train_data, test_data = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)

In [9]:
class TextDataset(Dataset):
    """Tokenized review dataset for the classifier.

    Each example's text is ``"<category> <headline> <category> <body>"``,
    where the category name is looked up from a JSON file of
    ``{"id": ..., "name": ...}`` records (underscores replaced by spaces,
    lower-cased). All texts are tokenized once, up front, in ``__init__``.

    Args:
        df: DataFrame with ``product_category_id``, ``review_headline`` and
            ``review_body`` columns, and optionally ``label``. A
            ``category_name`` column is added to it in place.
        tokenizer: Callable tokenizer invoked as
            ``tokenizer(texts, padding=True, truncation=True,
            max_length=..., return_tensors="pt")``; its result must expose
            ``"input_ids"`` and ``"attention_mask"``.
        max_length: Truncation length forwarded to the tokenizer.
        category_path: Path to the category JSON file. Defaults to
            ``root + "category.json"`` (``root`` being the notebook-level
            data directory).

    Raises:
        KeyError: If a ``product_category_id`` is absent from the JSON file.
    """

    def __init__(self, df, tokenizer, max_length=512, category_path=None):
        if category_path is None:
            category_path = root + "category.json"

        with open(category_path, "r", encoding="utf-8") as file:
            categories = json.load(file)

        # Build the id -> display-name table once instead of once per row.
        id_to_name = {
            entry['id']: entry['name'].replace("_", ' ').lower()
            for entry in categories
        }
        # Kept as an in-place column write so code that reads
        # df['category_name'] after constructing the dataset still works.
        # Indexing (rather than .map(dict)) keeps unknown ids a KeyError.
        df['category_name'] = df['product_category_id'].apply(lambda cid: id_to_name[cid])

        # Materialise each column once; calling .tolist() per row is quadratic.
        category_names = df['category_name'].tolist()
        headlines = df['review_headline'].tolist()
        bodies = df['review_body'].tolist()
        texts = [
            f"{category} {headline} {category} {body}"
            for category, headline, body in zip(category_names, headlines, bodies)
        ]

        self.encodings = tokenizer(
            texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt"
        )

        # Labels are optional so the same class serves unlabelled (hidden) sets.
        if 'label' in df.columns:
            self.labels = torch.tensor(df['label'].tolist(), dtype=torch.float)
        else:
            self.labels = None

    def __len__(self):
        """Number of tokenized examples."""
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        """Return the ``input_ids``/``attention_mask`` (and ``label`` if present) for one example."""
        item = {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx]
        }
        if self.labels is not None:
            item["label"] = self.labels[idx]
        return item


In [10]:
class TransformerForBinaryClassification(nn.Module):
    """Pretrained transformer encoder followed by a single-logit linear head.

    Args:
        pretrained_model_name: Model identifier passed to
            ``AutoModel.from_pretrained``.
    """

    def __init__(self, pretrained_model_name):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(pretrained_model_name)
        self.hidden_size = self.transformer.config.hidden_size
        self.text_classifier = nn.Linear(self.hidden_size, 1)

    def forward(self, input_ids, attention_mask=None):
        """Return one raw logit per example (apply a sigmoid for a probability).

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Optional mask forwarded to the encoder; when the
                encoder has no pooled output it also selects which token
                states are averaged.

        Returns:
            The output of the linear head applied to the pooled representation.
        """
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)

        pooler_output = getattr(outputs, "pooler_output", None)
        if pooler_output is not None:
            pooled_output = pooler_output
        else:
            # The encoder output carries no pooled vector: average the token states.
            hidden_states = outputs.last_hidden_state
            if attention_mask is None:
                # The signature allows a missing mask; treat every position as real.
                pooled_output = hidden_states.mean(dim=1)
            else:
                # Masked mean over the sequence axis. The count is clamped so a
                # fully masked row gives zeros instead of a 0/0 NaN.
                token_counts = attention_mask.sum(1, keepdim=True).clamp(min=1)
                pooled_output = (hidden_states * attention_mask.unsqueeze(-1)).sum(1) / token_counts

        logits = self.text_classifier(pooled_output)

        return logits


In [11]:
def evaluate(model, test_loader, criterion, device):
    """Compute the mean batch loss and accuracy of ``model`` over ``test_loader``.

    The model is switched to eval mode and left there; gradients are disabled
    during the pass.

    Args:
        model: Callable as ``model(input_ids, attention_mask)`` returning logits
            that can be flattened to one value per example.
        test_loader: Iterable of batches, each a mapping with ``"input_ids"``,
            ``"attention_mask"`` and ``"label"`` tensors.
        criterion: Loss taking ``(logits, labels)`` and returning a scalar tensor.
        device: Device the batch tensors are moved to.

    Returns:
        ``(avg_loss, accuracy)``: the loss averaged over batches and the
        fraction of examples whose thresholded (0.5) sigmoid output equals the
        label. Either value is ``nan`` when there were no batches / no examples.
    """
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0  # counted manually so loaders without __len__ also work

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device).float()

            # Flatten the logits so they line up with the label vector.
            logits = model(input_ids, attention_mask).view(-1)
            loss = criterion(logits, labels)

            total_loss += loss.item()
            num_batches += 1

            predictions = torch.sigmoid(logits)
            predicted_labels = (predictions > 0.5).float()
            correct += (predicted_labels == labels).sum().item()
            total += labels.size(0)

    # An empty loader previously raised ZeroDivisionError; report NaN instead.
    avg_loss = total_loss / num_batches if num_batches else float("nan")
    accuracy = correct / total if total else float("nan")
    return avg_loss, accuracy


# Finetune the pre-trained model

In [12]:
# Candidate backbones; exactly one assignment should be active.
#model_name = 'google-bert/bert-base-multilingual-cased'
#model_name = 'FacebookAI/xlm-roberta-base'
model_name = "distilbert/distilbert-base-uncased"
#model_name = "cardiffnlp/twitter-xlm-roberta-base-sentiment" 
#model_name = "microsoft/deberta-v3-small"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TransformerForBinaryClassification(model_name)


train_dataset = TextDataset(train_data, tokenizer)
# Shuffle the training examples every epoch: the frame is a concatenation of
# several files, so unshuffled batches follow file order. A seeded generator
# keeps the shuffling reproducible.
train_loader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
    generator=torch.Generator().manual_seed(RANDOM_SEED),
)

# Evaluation order does not matter, so the held-out loader is not shuffled.
test_dataset = TextDataset(test_data, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=8)


criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 5
best_test_accuracy = 0.0 
best_model_path = "models/best_model.pth"  

# Create the checkpoint directory up front so the first torch.save cannot fail
# after a whole epoch of training has already been spent.
os.makedirs(os.path.dirname(best_model_path), exist_ok=True)

for epoch in range(epochs):
    model.train()  # evaluate() leaves the model in eval mode, so reset each epoch
    total_loss = 0
    correct = 0
    total = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        
        loss = criterion(logits.view(-1), labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Running training accuracy; detached because it is a metric only.
        predictions = torch.sigmoid(logits.detach()).view(-1)
        predicted_labels = (predictions > 0.5).float()
        correct += (predicted_labels == labels).sum().item()
        total += labels.size(0)

    train_loss = total_loss / len(train_loader)
    train_accuracy = correct / total
    print(f"Epoch {epoch+1}: Train Loss = {round(train_loss,3)}, Train Accuracy = {round(train_accuracy,3)}")

    test_loss, test_accuracy = evaluate(model, test_loader, criterion, device)
    print(f"Epoch {epoch+1}: Test Loss = {round(test_loss,3)}, Test Accuracy = {round(test_accuracy,3)}")

    # Keep a checkpoint for every epoch ...
    torch.save(model.state_dict(), f"models/bert_epoch_{epoch+1}.pth")

    # ... and a separate copy of the best one by held-out accuracy.
    if test_accuracy > best_test_accuracy:
        best_test_accuracy = test_accuracy
        torch.save(model.state_dict(), best_model_path)


Epoch 1/5:   7%|██████████████████▏                                                                                                                                                                                                                                             | 65/914 [00:46<10:07,  1.40it/s]

KeyboardInterrupt



# Generate the files for online submission

In [None]:
# Restore the best checkpoint written during training. map_location lets a
# checkpoint saved from a GPU run load on whatever device is in use now
# (for example a CPU-only machine).
model.load_state_dict(torch.load(best_model_path, map_location=device))
model.to(device)
model.eval()

def generate_predictions_csv(df, filename, model, tokenizer, device):
    """Predict a label for every row of ``df`` and write them to ``filename``.

    Args:
        df: Frame accepted by ``TextDataset`` (no ``label`` column needed).
            Note that ``TextDataset`` adds a ``category_name`` column to it.
        filename: Output CSV path; one prediction per line, no header or index.
        model: Classifier called as ``model(input_ids, attention_mask)``
            returning logits. It is switched to eval mode here.
        tokenizer: Tokenizer passed to ``TextDataset``.
        device: Device the batches are moved to.

    Returns:
        None. The predictions are only written to disk.
    """
    # Do not rely on the caller having called model.eval(): with dropout
    # active, predictions would be noisy and not reproducible.
    model.eval()

    dataset = TextDataset(df, tokenizer)
    # shuffle=False keeps the output rows aligned with the rows of df.
    dataloader = DataLoader(dataset, batch_size=8, shuffle=False)

    predictions = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc=f"Processing {filename}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            logits = model(input_ids, attention_mask)
            probs = torch.sigmoid(logits).view(-1)
            # NOTE(review): these are Python booleans, so the CSV contains
            # True/False rather than 1/0 - confirm the expected submission format.
            preds = (probs > 0.5).tolist()

            predictions.extend(preds)

    df_predictions = pd.DataFrame(predictions) 
    df_predictions.to_csv(filename, index=False, header=False)

# One submission file per hidden split, validation first.
submission_jobs = (
    (validation_hidden_df, "validation_hidden.csv"),
    (test_hidden_df, "test_hidden.csv"),
)
for hidden_split, output_name in submission_jobs:
    generate_predictions_csv(hidden_split, output_name, model, tokenizer, device)

