In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel,AutoModelForMaskedLM
from tqdm import tqdm
import random
from sklearn.model_selection import train_test_split
import numpy as np
from unidecode import unidecode
import os
import xgboost as xgb
import json
import re
from unidecode import unidecode
from bs4 import BeautifulSoup

In [2]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.use_deterministic_algorithms(True)
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
    torch.set_float32_matmul_precision('high')
    torch.set_num_threads(1)

    
RANDOM_SEED = 42
set_seed(RANDOM_SEED)

In [3]:
def pre_process(text):
    text = BeautifulSoup(text, "html.parser").get_text()

    text = unidecode(text)

    text = text.lower()

    text = re.sub(r"[^a-zA-Z0-9.,!?'\s]", "", text)

    text = re.sub(r"\s+", " ", text).strip()

    return text

# Load in the data

In [4]:
root = 'reviews/'

In [5]:
train_data = pd.concat([pd.read_csv(root + f"train-{i+1}.csv") for i in range(5)], ignore_index=True)
test_data = pd.concat([pd.read_csv(root + f"train-{i+1}.csv") for i in [6,7]], ignore_index=True)

In [6]:
train_data['review_body'] = train_data['review_body'].apply(pre_process)
test_data['review_body'] = test_data['review_body'].apply(pre_process)

  text = BeautifulSoup(text, "html.parser").get_text()
  text = BeautifulSoup(text, "html.parser").get_text()


In [7]:
validation_hidden_df = pd.read_csv(root + 'validation_hidden.csv')
test_hidden_df = pd.read_csv(root + 'test_hidden.csv')

validation_hidden_df['review_body'] = validation_hidden_df['review_body'].apply(pre_process)
test_hidden_df['review_body'] = test_hidden_df['review_body'].apply(pre_process)

  text = BeautifulSoup(text, "html.parser").get_text()


In [8]:
#train_data, test_data = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)

In [9]:
train_data

Unnamed: 0.1,Unnamed: 0,product_id,product_parent,product_title,vine,verified_purchase,review_headline,review_body,review_date,marketplace_id,product_category_id,label
0,9,B001N2MZT8,903886718,Green Zone [DVD],N,Y,green zone,i found at first it was a little difficult to ...,2010-11-15,1,3,False
1,11,B00GCBVE0Q,282740618,Le secret de Green Knowe,N,Y,,j'ai aime cette histoire. les acteurs et surto...,2014-11-23,2,3,False
2,19,1423165691,883799517,A Disney Sketchbook.,N,N,okay mais...,estce une coincidence que la plupart des princ...,2012-12-22,0,0,False
3,33,0061091480,623343977,Your Erroneous Zones,N,N,Arrogant,wayne dyer is a popular american personal grow...,2009-07-21,0,0,True
4,34,B00HZ4CYOY,647510225,König der Mathematik Junior,N,Y,Tớllé Máthé Ápp...,.....unsere kids mogen diese art des lernens. ...,2015-06-01,0,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...
6077,11976,B009R9WAZ8,780895978,Unapologetic,N,Y,fan !!,"tout d'abord, je precise que mon commentaire n...",2013-07-17,2,6,True
6078,11988,B000FO8968,635536816,Star Wars: Episode IV - Eine neue Hoffnung (Or...,N,N,Fast Perfekt,also ich muss sagen ich war anfangs sehr skept...,2007-05-24,0,3,True
6079,11989,B0006V4F1I,725126142,Kill Bill 1 and 2 (Box Set) [DVD],N,N,Nice but simply Kill Bill 1+2. No extras,if you are looking for a collector's box with ...,2011-02-08,1,3,True
6080,11992,B0031R5K72,47504452,The Book Thief,N,Y,Five Stars,good film good book,2014-12-07,1,10,False


In [10]:
class TextDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=512):

        with open(root + "category.json", "r", encoding="utf-8") as file:
            data = json.load(file)
        df['category_name'] = df['product_category_id'].apply(lambda x: {d['id']:d['name'].replace("_",' ').lower() for d in data}[x])

        
        texts = [
        str(df.category_name.tolist()[i]) + ' ' + 
        str(df.review_headline.tolist()[i]) + ' ' + 
        str(df.category_name.tolist()[i]) + ' ' + 
        ("vine " if df.vine.tolist()[i] == 'Y' else '') + 
        ("verified " if df.verified_purchase.tolist()[i] == 'Y' else '') +
        str(df.review_body.tolist()[i])
        for i in range(len(df))
        ]

        
        self.encodings = tokenizer(
            texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt"
        )

        if 'label' in df.columns.tolist():
            self.labels = torch.tensor(df.label.tolist(), dtype=torch.float)
        else:
            self.labels = None

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        item = {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx]
        }
        if self.labels is not None:
            item["label"] = self.labels[idx]
        return item


In [11]:
class TransformerForBinaryClassification(nn.Module):
    def __init__(self, pretrained_model_name):
        super(TransformerForBinaryClassification, self).__init__()
        self.transformer = AutoModel.from_pretrained(pretrained_model_name)
        self.hidden_size = self.transformer.config.hidden_size
        self.text_classifier = nn.Linear(self.hidden_size, 1)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)

        if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
            pooled_output = outputs.pooler_output
        else:
            hidden_states = outputs.last_hidden_state
            pooled_output = (hidden_states * attention_mask.unsqueeze(-1)).sum(1) / attention_mask.sum(1, keepdim=True)

        logits = self.text_classifier(pooled_output)

        return logits


In [12]:
def evaluate(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device).float() 

            logits = model(input_ids, attention_mask).view(-1) 
            loss = criterion(logits, labels)

            total_loss += loss.item()

            predictions = torch.sigmoid(logits)
            predicted_labels = (predictions > 0.5).float()
            correct += (predicted_labels == labels).sum().item()
            total += labels.size(0)

    avg_loss = total_loss / len(test_loader)
    accuracy = correct / total
    return avg_loss, accuracy


# Finetune the pre-trained model

In [13]:
from transformers import AutoTokenizer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

model_name = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TransformerForBinaryClassification(model_name)

train_dataset = TextDataset(train_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

test_dataset = TextDataset(test_data, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=8)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-5)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

max_steps = 10_000  
best_test_accuracy = 0.0
best_model_path = "models/best_model.pth"

step = 0
total_loss = 0
correct = 0
total = 0

model.train()
train_iterator = iter(train_loader)  

while step < max_steps:
    try:
        batch = next(train_iterator)  
    except StopIteration:
        train_iterator = iter(train_loader)  
        batch = next(train_iterator)

    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["label"].to(device)

    optimizer.zero_grad()
    logits = model(input_ids, attention_mask)

    loss = criterion(logits.view(-1), labels)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

    predictions = torch.sigmoid(logits).view(-1)
    predicted_labels = (predictions > 0.5).float()
    correct += (predicted_labels == labels).sum().item()
    total += labels.size(0)

    step += 1
    
    if step % 100 == 0:
        train_loss = total_loss / 100
        train_accuracy = correct / total
        print(f"Step {step}: Train Loss = {round(train_loss, 3)}, Train Accuracy = {round(train_accuracy, 3)}")
        total_loss = 0
        correct = 0
        total = 0

        test_loss, test_accuracy = evaluate(model, test_loader, criterion, device)
        print(f"Step {step}: Test Loss = {round(test_loss,3)}, Test Accuracy = {round(test_accuracy,3)}")

        torch.save(model.state_dict(), f"models/bert_step_{step}.pth")

        if test_accuracy > best_test_accuracy:
            best_test_accuracy = test_accuracy
            torch.save(model.state_dict(), best_model_path)

print("Training complete.")


Step 50: Train Loss = 0.622, Train Accuracy = 0.657
Step 50: Test Loss = 0.589, Test Accuracy = 0.689
Step 100: Train Loss = 0.587, Train Accuracy = 0.675
Step 100: Test Loss = 0.582, Test Accuracy = 0.694
Step 150: Train Loss = 0.534, Train Accuracy = 0.743
Step 150: Test Loss = 0.594, Test Accuracy = 0.708
Step 200: Train Loss = 0.587, Train Accuracy = 0.695
Step 200: Test Loss = 0.583, Test Accuracy = 0.68
Step 250: Train Loss = 0.539, Train Accuracy = 0.73
Step 250: Test Loss = 0.593, Test Accuracy = 0.691
Step 300: Train Loss = 0.556, Train Accuracy = 0.715
Step 300: Test Loss = 0.566, Test Accuracy = 0.71
Step 350: Train Loss = 0.536, Train Accuracy = 0.743
Step 350: Test Loss = 0.567, Test Accuracy = 0.707
Step 400: Train Loss = 0.563, Train Accuracy = 0.743
Step 400: Test Loss = 0.573, Test Accuracy = 0.706
Step 450: Train Loss = 0.572, Train Accuracy = 0.69
Step 450: Test Loss = 0.57, Test Accuracy = 0.712
Step 500: Train Loss = 0.521, Train Accuracy = 0.745
Step 500: Test Los


KeyboardInterrupt



# Generate the files for online submission

In [14]:
model.load_state_dict(torch.load(best_model_path))
model.to(device)
model.eval()

def generate_predictions_csv(df, filename, model, tokenizer, device):
    dataset = TextDataset(df, tokenizer)
    dataloader = DataLoader(dataset, batch_size=8, shuffle=False)

    predictions = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc=f"Processing {filename}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            logits = model(input_ids, attention_mask)
            probs = torch.sigmoid(logits).view(-1)
            preds = (probs > 0.5).tolist()

            predictions.extend(preds)

    df_predictions = pd.DataFrame(predictions) 
    df_predictions.to_csv(filename, index=False, header=False)

generate_predictions_csv(validation_hidden_df, "validation_hidden.csv", model, tokenizer, device)
generate_predictions_csv(test_hidden_df, "test_hidden.csv", model, tokenizer, device)



  model.load_state_dict(torch.load(best_model_path))
Processing validation_hidden.csv: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 157/157 [00:05<00:00, 27.79it/s]
Processing test_hidden.csv: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 143/143 [00:05<00:00, 28.21it/s]
