In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel,AutoModelForMaskedLM
from tqdm import tqdm
import random
from sklearn.model_selection import train_test_split
import numpy as np
from unidecode import unidecode
import os
import xgboost as xgb
import json
import re
from unidecode import unidecode
from bs4 import BeautifulSoup

In [2]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.use_deterministic_algorithms(True)
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
    torch.set_float32_matmul_precision('high')
    torch.set_num_threads(1)

    
RANDOM_SEED = 42
set_seed(RANDOM_SEED)

In [3]:
def pre_process(text):
    text = BeautifulSoup(text, "html.parser").get_text()

    text = unidecode(text)

    text = text.lower()

    text = re.sub(r"[^a-zA-Z0-9.,!?'\s]", "", text)

    text = re.sub(r"\s+", " ", text).strip()

    return text

# Load in the data

In [4]:
root = 'reviews/'

In [5]:
data = pd.concat([pd.read_csv(root + f"train-{i+1}.csv") for i in range(7)], ignore_index=True)
#test_data = pd.concat([pd.read_csv(root + f"train-{i+1}.csv") for i in [6,7]], ignore_index=True)
train_data, test_data = train_test_split(data, test_size=0.2, random_state=RANDOM_SEED)

In [6]:
train_data['review_body'] = train_data['review_body'].apply(pre_process)
test_data['review_body'] = test_data['review_body'].apply(pre_process)

  text = BeautifulSoup(text, "html.parser").get_text()
  text = BeautifulSoup(text, "html.parser").get_text()


In [7]:
validation_hidden_df = pd.read_csv(root + 'validation_hidden.csv')
test_hidden_df = pd.read_csv(root + 'test_hidden.csv')

validation_hidden_df['review_body'] = validation_hidden_df['review_body'].apply(pre_process)
test_hidden_df['review_body'] = test_hidden_df['review_body'].apply(pre_process)

  text = BeautifulSoup(text, "html.parser").get_text()


In [8]:
train_data

Unnamed: 0.1,Unnamed: 0,product_id,product_parent,product_title,vine,verified_purchase,review_headline,review_body,review_date,marketplace_id,product_category_id,label
6641,5355,B0006GVK2A,460269617,Before Sunrise / Before Sunset [DVD],N,Y,Five Stars,excellent,2015-01-09,1,3,False
3689,729,B000CJD3DU,325991288,Mr. & Mrs. Smith,N,Y,Áctiớn-Klámáúk,ich habe den film im kino gesehen und wenn man...,2006-01-11,0,3,True
3978,3525,B003UOVUR0,543062387,Eclipse - Bis(s) zum Abendrot (Fan Edition) [2...,N,N,Super Film !,der artikel ist quasi wie neu und funktioniert...,2011-11-08,0,3,False
1583,4044,B002QY9RMA,913023308,I am...Sasha Fierce,N,Y,Ich liebe es!,"ich mag dieses album sehr! wer beyonce mag, de...",2014-09-28,0,6,False
1412,2245,B00D3NSDVO,307625827,I Am Pilgrim,N,Y,Four Stars,best thriller i have read in years!,2015-04-01,1,4,False
...,...,...,...,...,...,...,...,...,...,...,...,...
5734,8542,B00HUMI5SK,276394270,The Wolf of Wall Street,N,Y,Kein guter Film,ich konnte mich mit dem film nicht wirklich an...,2014-10-13,0,3,True
5191,3116,B005V5WBFG,796588140,McAfee Mobile Security,N,Y,Five Stars,c,2015-08-05,1,1,False
5390,5015,B00B2FLDRQ,274604361,The Rosie Project: Don Tillman 1 (Don Tillman ...,N,Y,Funny and heartwarming,it's a little cheesy but really enjoyable. don...,2014-03-30,1,10,False
860,8870,B001E08UNE,825346055,Matrix [Blu-ray],N,Y,Un film que l'on se plait à revoir,il n'est plus besoin de presenter ce film pour...,2009-03-22,2,3,True


In [9]:
class TextDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=512):

        with open(root + "category.json", "r", encoding="utf-8") as file:
            data = json.load(file)
        df['category_name'] = df['product_category_id'].apply(lambda x: {d['id']:d['name'].replace("_",' ').lower() for d in data}[x])

        
        texts = [
        str(df.category_name.tolist()[i]) + ' ' + 
        str(df.review_headline.tolist()[i]) + ' ' + 
        str(df.category_name.tolist()[i]) + ' ' + 
        ("vine " if df.vine.tolist()[i] == 'Y' else '') + 
        ("verified " if df.verified_purchase.tolist()[i] == 'Y' else '') +
        str(df.review_body.tolist()[i])
        for i in range(len(df))
        ]

        
        self.encodings = tokenizer(
            texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt"
        )

        if 'label' in df.columns.tolist():
            self.labels = torch.tensor(df.label.tolist(), dtype=torch.float)
        else:
            self.labels = None

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        item = {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx]
        }
        if self.labels is not None:
            item["label"] = self.labels[idx]
        return item


In [10]:
class TransformerForBinaryClassification(nn.Module):
    def __init__(self, pretrained_model_name):
        super(TransformerForBinaryClassification, self).__init__()
        self.transformer = AutoModel.from_pretrained(pretrained_model_name)
        self.hidden_size = self.transformer.config.hidden_size
        self.text_classifier = nn.Linear(self.hidden_size, 1)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)

        if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
            pooled_output = outputs.pooler_output
        else:
            hidden_states = outputs.last_hidden_state
            pooled_output = (hidden_states * attention_mask.unsqueeze(-1)).sum(1) / attention_mask.sum(1, keepdim=True)

        logits = self.text_classifier(pooled_output)

        return logits


In [11]:
def evaluate(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device).float() 

            logits = model(input_ids, attention_mask).view(-1) 
            loss = criterion(logits, labels)

            total_loss += loss.item()

            predictions = torch.sigmoid(logits)
            predicted_labels = (predictions > 0.5).float()
            correct += (predicted_labels == labels).sum().item()
            total += labels.size(0)

    avg_loss = total_loss / len(test_loader)
    accuracy = correct / total
    return avg_loss, accuracy


# Finetune the pre-trained model

In [12]:
from transformers import AutoTokenizer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

model_name = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TransformerForBinaryClassification(model_name)

train_dataset = TextDataset(train_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

test_dataset = TextDataset(test_data, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=8)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-5)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

max_steps = 10_000  
best_test_accuracy = 0.0
best_model_path = "models/best_model.pth"

step = 0
total_loss = 0
correct = 0
total = 0

model.train()
train_iterator = iter(train_loader)  

while step < max_steps:
    try:
        batch = next(train_iterator)  
    except StopIteration:
        train_iterator = iter(train_loader)  
        batch = next(train_iterator)

    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["label"].to(device)

    optimizer.zero_grad()
    logits = model(input_ids, attention_mask)

    loss = criterion(logits.view(-1), labels)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

    predictions = torch.sigmoid(logits).view(-1)
    predicted_labels = (predictions > 0.5).float()
    correct += (predicted_labels == labels).sum().item()
    total += labels.size(0)

    step += 1
    
    if step % 100 == 0:
        train_loss = total_loss / 100
        train_accuracy = correct / total
        print(f"Step {step}: Train Loss = {round(train_loss, 3)}, Train Accuracy = {round(train_accuracy, 3)}")
        total_loss = 0
        correct = 0
        total = 0

        test_loss, test_accuracy = evaluate(model, test_loader, criterion, device)
        print(f"Step {step}: Test Loss = {round(test_loss,3)}, Test Accuracy = {round(test_accuracy,3)}")

        torch.save(model.state_dict(), f"models/bert_step_{step}.pth")

        if test_accuracy > best_test_accuracy:
            best_test_accuracy = test_accuracy
            torch.save(model.state_dict(), best_model_path)

print("Training complete.")


Step 100: Train Loss = 0.551, Train Accuracy = 0.703
Step 100: Test Loss = 0.567, Test Accuracy = 0.709
Step 200: Train Loss = 0.573, Train Accuracy = 0.682
Step 200: Test Loss = 0.552, Test Accuracy = 0.709
Step 300: Train Loss = 0.577, Train Accuracy = 0.696
Step 300: Test Loss = 0.545, Test Accuracy = 0.729
Step 400: Train Loss = 0.58, Train Accuracy = 0.691
Step 400: Test Loss = 0.562, Test Accuracy = 0.718
Step 500: Train Loss = 0.549, Train Accuracy = 0.718
Step 500: Test Loss = 0.542, Test Accuracy = 0.725
Step 600: Train Loss = 0.555, Train Accuracy = 0.715
Step 600: Test Loss = 0.553, Test Accuracy = 0.722
Step 700: Train Loss = 0.554, Train Accuracy = 0.71
Step 700: Test Loss = 0.538, Test Accuracy = 0.729
Step 800: Train Loss = 0.552, Train Accuracy = 0.718
Step 800: Test Loss = 0.554, Test Accuracy = 0.708
Step 900: Train Loss = 0.549, Train Accuracy = 0.721
Step 900: Test Loss = 0.54, Test Accuracy = 0.723
Step 1000: Train Loss = 0.523, Train Accuracy = 0.743
Step 1000: Te


KeyboardInterrupt



# Generate the files for online submission

In [13]:
model.load_state_dict(torch.load(best_model_path))
model.to(device)
model.eval()

def generate_predictions_csv(df, filename, model, tokenizer, device):
    dataset = TextDataset(df, tokenizer)
    dataloader = DataLoader(dataset, batch_size=8, shuffle=False)

    predictions = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc=f"Processing {filename}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            logits = model(input_ids, attention_mask)
            probs = torch.sigmoid(logits).view(-1)
            preds = (probs > 0.5).tolist()

            predictions.extend(preds)

    df_predictions = pd.DataFrame(predictions) 
    df_predictions.to_csv(filename, index=False, header=False)

generate_predictions_csv(validation_hidden_df, "validation_hidden.csv", model, tokenizer, device)
generate_predictions_csv(test_hidden_df, "test_hidden.csv", model, tokenizer, device)



  model.load_state_dict(torch.load(best_model_path))
Processing validation_hidden.csv: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 157/157 [00:05<00:00, 28.08it/s]
Processing test_hidden.csv: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 143/143 [00:05<00:00, 27.86it/s]
