In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel,AutoModelForMaskedLM
from tqdm import tqdm
import random
from sklearn.model_selection import train_test_split
import numpy as np
from unidecode import unidecode
import os
import xgboost as xgb
import json
import re
from unidecode import unidecode
from bs4 import BeautifulSoup
from langdetect import detect


In [2]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.use_deterministic_algorithms(True)
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
    torch.set_float32_matmul_precision('high')
    torch.set_num_threads(1)

    
RANDOM_SEED = 42
set_seed(RANDOM_SEED)

In [3]:
def pre_process(text):
    text = BeautifulSoup(text, "html.parser").get_text()

    text = unidecode(text)

    text = text.lower()

    text = re.sub(r"[^a-zA-Z0-9.,!?'\s]", "", text)

    text = re.sub(r"\s+", " ", text).strip()

    return text

In [4]:
def detect_language(text):
    try:
        lang = detect(text)
        if lang in ["fr", "de", "en"]:
            return lang.upper()
        else:
            return "UNK"
    except:
        return "UNK"


# Load in the data

In [5]:
root = 'reviews/'

In [6]:
train_data = pd.concat([pd.read_csv(root + f"train-{i+1}.csv") for i in range(5)], ignore_index=True)
test_data = pd.concat([pd.read_csv(root + f"train-{i+1}.csv") for i in [5,6,7]], ignore_index=True)

In [7]:
#train_data['language'] = train_data["marketplace_id"].apply(lambda x: {0:"UNK",1:"EN",2:"FR",3:"DE"}[x])
#test_data['language'] = test_data["marketplace_id"].apply(lambda x: {0:"UNK",1:"EN",2:"FR",3:"DE"}[x])

In [8]:
train_data['language'] = train_data["review_body"].apply(lambda x: detect_language(x))
test_data['language'] = test_data["review_body"].apply(lambda x: detect_language(x))

In [9]:
train_data['review_body'] = train_data['review_body'].apply(pre_process)
test_data['review_body'] = test_data['review_body'].apply(pre_process)

  text = BeautifulSoup(text, "html.parser").get_text()
  text = BeautifulSoup(text, "html.parser").get_text()


In [10]:
train_data_dict = {
    lang: train_data[train_data["language"] == lang] for lang in train_data["language"].unique()
}

test_data_dict = {
    lang: test_data[test_data["language"] == lang] for lang in test_data["language"].unique()
}


In [11]:
validation_hidden_df = pd.read_csv(root + 'validation_hidden.csv')
test_hidden_df = pd.read_csv(root + 'test_hidden.csv')

validation_hidden_df['review_body'] = validation_hidden_df['review_body'].apply(pre_process)
test_hidden_df['review_body'] = test_hidden_df['review_body'].apply(pre_process)

  text = BeautifulSoup(text, "html.parser").get_text()


In [12]:
validation_hidden_df['language'] = validation_hidden_df["review_body"].apply(lambda x: detect_language(x))
test_hidden_df['language'] = test_hidden_df["review_body"].apply(lambda x: detect_language(x))

In [13]:
#validation_hidden_df['language'] = validation_hidden_df["marketplace_id"].apply(lambda x: {0:"UNK",1:"EN",2:"FR",3:"DE"}[x])
#test_hidden_df['language'] = test_hidden_df["marketplace_id"].apply(lambda x: {0:"UNK",1:"EN",2:"FR",3:"DE"}[x])

In [14]:
validation_hidden_data_dict = {
    lang: validation_hidden_df[validation_hidden_df["language"] == lang] for lang in validation_hidden_df["language"].unique()
}

test_hidden_data_dict = {
    lang: test_hidden_df[test_hidden_df["language"] == lang] for lang in test_hidden_df["language"].unique()
}

In [15]:
#train_data, test_data = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)

In [16]:
class TextDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=512):

        with open(root + "category.json", "r", encoding="utf-8") as file:
            data = json.load(file)
        df['category_name'] = df['product_category_id'].apply(lambda x: {d['id']:d['name'].replace("_",' ').lower() for d in data}[x])

        
        texts = [str(df.category_name.tolist()[i]) + ' '+ str(df.review_headline.tolist()[i]) + ' ' + str(df.category_name.tolist()[i]) + ' ' + str(df.review_body.tolist()[i])
            for i in range(len(df))
        ]
        self.encodings = tokenizer(
            texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt"
        )

        if 'label' in df.columns.tolist():
            self.labels = torch.tensor(df.label.tolist(), dtype=torch.float)
        else:
            self.labels = None

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        item = {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx]
        }
        if self.labels is not None:
            item["label"] = self.labels[idx]
        return item


In [17]:
class TransformerForBinaryClassification(nn.Module):
    def __init__(self, pretrained_model_name):
        super(TransformerForBinaryClassification, self).__init__()
        self.transformer = AutoModel.from_pretrained(pretrained_model_name)
        self.hidden_size = self.transformer.config.hidden_size
        self.text_classifier = nn.Linear(self.hidden_size, 1)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)

        if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
            pooled_output = outputs.pooler_output
        else:
            hidden_states = outputs.last_hidden_state
            pooled_output = (hidden_states * attention_mask.unsqueeze(-1)).sum(1) / attention_mask.sum(1, keepdim=True)

        logits = self.text_classifier(pooled_output)

        return logits


In [18]:
def evaluate(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device).float() 

            logits = model(input_ids, attention_mask).view(-1) 
            loss = criterion(logits, labels)

            total_loss += loss.item()

            predictions = torch.sigmoid(logits)
            predicted_labels = (predictions > 0.5).float()
            correct += (predicted_labels == labels).sum().item()
            total += labels.size(0)

    avg_loss = total_loss / len(test_loader)
    accuracy = correct / total
    return avg_loss, accuracy


# Finetune the pre-trained model

In [19]:
model_names = {
    'UNK': "distilbert/distilbert-base-uncased",
    'EN': "distilbert/distilbert-base-uncased",
    'FR': "almanach/camembert-base",
    'DE': "TUM/GottBERT_base_best"
}

train_language_dict = {lang: train_data[train_data["language"] == lang] for lang in train_data["language"].unique()}
test_language_dict = {lang: test_data[test_data["language"] == lang] for lang in test_data["language"].unique()}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for lang, train_df in train_language_dict.items():
    print(f"Training model for language: {lang}")
    model_name = model_names[lang]
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = TransformerForBinaryClassification(model_name).to(device)
    
    train_dataset = TextDataset(train_df, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=8)
    test_dataset = TextDataset(test_language_dict[lang], tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=8)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.AdamW(model.parameters(), lr=1e-5)

    epochs = 3
    best_test_accuracy = 0.0 
    best_model_path = f"models/best_model_{lang}.pth"  

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        correct = 0
        total = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} - {lang}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            optimizer.zero_grad()
            logits = model(input_ids, attention_mask)
            
            loss = criterion(logits.view(-1), labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            predictions = torch.sigmoid(logits).view(-1)
            predicted_labels = (predictions > 0.5).float()
            correct += (predicted_labels == labels).sum().item()
            total += labels.size(0)

        train_loss = total_loss / len(train_loader)
        train_accuracy = correct / total
        print(f"Epoch {epoch+1}: Train Loss = {round(train_loss,3)}, Train Accuracy = {round(train_accuracy,3)}")

        test_loss, test_accuracy = evaluate(model, test_loader, criterion, device)
        print(f"Epoch {epoch+1}: Test Loss = {round(test_loss,3)}, Test Accuracy = {round(test_accuracy,3)}")

        torch.save(model.state_dict(), f"models/{lang}_bert_epoch_{epoch+1}.pth")

        if test_accuracy > best_test_accuracy:
            best_test_accuracy = test_accuracy
            torch.save(model.state_dict(), best_model_path)


Training model for language: EN


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['category_name'] = df['product_category_id'].apply(lambda x: {d['id']:d['name'].replace("_",' ').lower() for d in data}[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['category_name'] = df['product_category_id'].apply(lambda x: {d['id']:d['name'].replace("_",' ').lower() for d in data}[x])
Epoch 1/3 - EN: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

Epoch 1: Train Loss = 0.549, Train Accuracy = 0.732
Epoch 1: Test Loss = 0.505, Test Accuracy = 0.761


Epoch 2/3 - EN: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 259/259 [00:29<00:00,  8.64it/s]


Epoch 2: Train Loss = 0.489, Train Accuracy = 0.765
Epoch 2: Test Loss = 0.497, Test Accuracy = 0.765


Epoch 3/3 - EN: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 259/259 [00:29<00:00,  8.65it/s]


Epoch 3: Train Loss = 0.43, Train Accuracy = 0.798
Epoch 3: Test Loss = 0.506, Test Accuracy = 0.772
Training model for language: FR


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['category_name'] = df['product_category_id'].apply(lambda x: {d['id']:d['name'].replace("_",' ').lower() for d in data}[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['category_name'] = df['product_category_id'].apply(lambda x: {d['id']:d['name'].replace("_",' ').lower() for d in data}[x])
Epoch 1/3 - FR: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

Epoch 1: Train Loss = 0.64, Train Accuracy = 0.626
Epoch 1: Test Loss = 0.609, Test Accuracy = 0.669


Epoch 2/3 - FR: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 248/248 [00:56<00:00,  4.42it/s]


Epoch 2: Train Loss = 0.582, Train Accuracy = 0.699
Epoch 2: Test Loss = 0.602, Test Accuracy = 0.685


Epoch 3/3 - FR: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 248/248 [00:56<00:00,  4.42it/s]


Epoch 3: Train Loss = 0.547, Train Accuracy = 0.738
Epoch 3: Test Loss = 0.613, Test Accuracy = 0.689
Training model for language: UNK


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['category_name'] = df['product_category_id'].apply(lambda x: {d['id']:d['name'].replace("_",' ').lower() for d in data}[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['category_name'] = df['product_category_id'].apply(lambda x: {d['id']:d['name'].replace("_",' ').lower() for d in data}[x])
Epoch 1/3 - UNK: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

Epoch 1: Train Loss = 0.535, Train Accuracy = 0.715
Epoch 1: Test Loss = 0.498, Test Accuracy = 0.735


Epoch 2/3 - UNK: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 75/75 [00:08<00:00,  8.63it/s]


Epoch 2: Train Loss = 0.49, Train Accuracy = 0.746
Epoch 2: Test Loss = 0.492, Test Accuracy = 0.732


Epoch 3/3 - UNK: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 75/75 [00:08<00:00,  8.73it/s]


Epoch 3: Train Loss = 0.467, Train Accuracy = 0.756
Epoch 3: Test Loss = 0.493, Test Accuracy = 0.735
Training model for language: DE


Some weights of RobertaModel were not initialized from the model checkpoint at TUM/GottBERT_base_best and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['category_name'] = df['product_category_id'].apply(lambda x: {d['id']:d['name'].replace("_",' ').lower() for d in data}[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['category_name'] = df['product_category

Epoch 1: Train Loss = 0.588, Train Accuracy = 0.691
Epoch 1: Test Loss = 0.591, Test Accuracy = 0.688


Epoch 2/3 - DE: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 179/179 [00:40<00:00,  4.39it/s]


Epoch 2: Train Loss = 0.548, Train Accuracy = 0.726
Epoch 2: Test Loss = 0.588, Test Accuracy = 0.71


Epoch 3/3 - DE: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 179/179 [00:40<00:00,  4.38it/s]


Epoch 3: Train Loss = 0.513, Train Accuracy = 0.757
Epoch 3: Test Loss = 0.613, Test Accuracy = 0.693


# Generate the files for online submission

In [20]:
validation_hidden_df["original_index"] = validation_hidden_df.index
test_hidden_df["original_index"] = test_hidden_df.index

validation_hidden_data_dict = {
    lang: df.copy() for lang, df in validation_hidden_df.groupby("language")
}
test_hidden_data_dict = {
    lang: df.copy() for lang, df in test_hidden_df.groupby("language")
}

In [21]:
def generate_predictions_csv(df_dict, filename, model_names, device):
    all_predictions = []
    all_languages = []
    all_indices = []
    
    for lang, df in df_dict.items():
        print(f"Generating predictions for language: {lang}")
        model_name = model_names[lang]
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = TransformerForBinaryClassification(model_name).to(device)
        
        best_model_path = f"models/best_model_{lang}.pth"
        model.load_state_dict(torch.load(best_model_path, map_location=device))
        model.eval()
        
        dataset = TextDataset(df, tokenizer)
        dataloader = DataLoader(dataset, batch_size=8, shuffle=False)
        
        predictions = []
        with torch.no_grad():
            for batch in tqdm(dataloader, desc=f"Processing {lang}"):
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                
                logits = model(input_ids, attention_mask)
                probs = torch.sigmoid(logits).view(-1)
                preds = (probs > 0.5).tolist()
                
                predictions.extend(preds)
        
        all_predictions.extend(predictions)
        all_languages.extend([lang] * len(predictions))
        all_indices.extend(df["original_index"].tolist()) 
    
    df_predictions = pd.DataFrame({
        "original_index": all_indices,
        "language": all_languages,
        "prediction": all_predictions
    })
    
    df_predictions = df_predictions.sort_values(by="original_index").drop(columns=["original_index"])
    
    df_predictions["prediction"].to_csv(filename, index=False, header=False)
    print(f"Saved predictions to {filename}")

generate_predictions_csv(validation_hidden_data_dict, "validation_hidden.csv", model_names, device)
generate_predictions_csv(test_hidden_data_dict, "test_hidden.csv", model_names, device)

Generating predictions for language: DE


Some weights of RobertaModel were not initialized from the model checkpoint at TUM/GottBERT_base_best and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(best_model_path, map_location=device))
Processing DE: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:03<00:00, 15.49it/s]


Generating predictions for language: EN


  model.load_state_dict(torch.load(best_model_path, map_location=device))
Processing EN: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 52/52 [00:01<00:00, 32.64it/s]


Generating predictions for language: FR


  model.load_state_dict(torch.load(best_model_path, map_location=device))
Processing FR: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 51/51 [00:03<00:00, 16.62it/s]


Generating predictions for language: UNK


  model.load_state_dict(torch.load(best_model_path, map_location=device))
Processing UNK: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 208.28it/s]


Saved predictions to validation_hidden.csv
Generating predictions for language: DE


Some weights of RobertaModel were not initialized from the model checkpoint at TUM/GottBERT_base_best and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(best_model_path, map_location=device))
Processing DE: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 47/47 [00:02<00:00, 15.93it/s]


Generating predictions for language: EN


  model.load_state_dict(torch.load(best_model_path, map_location=device))
Processing EN: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:01<00:00, 32.45it/s]


Generating predictions for language: FR


  model.load_state_dict(torch.load(best_model_path, map_location=device))
Processing FR: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:02<00:00, 16.40it/s]


Generating predictions for language: UNK


  model.load_state_dict(torch.load(best_model_path, map_location=device))
Processing UNK: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 210.49it/s]

Saved predictions to test_hidden.csv



