In [None]:
import pandas as pd

# Read the Trustpilot company descriptions from disk and preview the first rows.
df = pd.read_csv(filepath_or_buffer="data/trustpilot_company_descriptions.csv")
df.head()

In [None]:
# Number of rows in the loaded frame.
df.shape[0]

In [None]:
# Share of rows per category (relative frequencies rather than raw counts).
(
    df.loc[:, "category"]
    .value_counts(normalize=True)
)

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Encode the string categories as integer class ids in a new `labels` column.
label_encoder = LabelEncoder()
df['labels'] = label_encoder.fit_transform(df['category'])

# Stratified three-way split: 20% of the rows are held out as the test set, then
# 20% of the remaining 80% become the validation set (64% / 16% / 20% overall).
df_train, df_test = train_test_split(df, test_size=0.2, stratify=df['labels'], random_state=42)
df_train, df_val = train_test_split(df_train, test_size=0.2, stratify=df_train['labels'], random_state=42)

# Sanity check: every row must land in exactly one of the three splits.
assert len(df_train) + len(df_val) + len(df_test) == len(df)

# NOTE: this rebinds `Dataset` to the Hugging Face class, shadowing the
# torch.utils.data.Dataset imported at the top of this cell.
from datasets import Dataset

# preserve_index=False drops the shuffled pandas index instead of storing it as
# an extra `__index_level_0__` column in each dataset.
ds_train = Dataset.from_pandas(df_train, preserve_index=False)
ds_val = Dataset.from_pandas(df_val, preserve_index=False)
ds_test = Dataset.from_pandas(df_test, preserve_index=False)

In [None]:
from tqdm import tqdm

# hyperparameters
epochs = 10  # passed to the trainer as the number of training epochs
num_classes = label_encoder.classes_.shape[0]  # one class per distinct encoded category

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset

# Single source of truth for the checkpoint so the model and its tokenizer
# cannot drift apart.
MODEL_CHECKPOINT = "google/bert_uncased_L-2_H-128_A-2"

# Map class ids <-> category names so the model config (and any checkpoint saved
# from it) reports readable labels instead of generic LABEL_0, LABEL_1, ...
id2label = {idx: str(name) for idx, name in enumerate(label_encoder.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=num_classes,
    id2label=id2label,
    label2id=label2id,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

def tokenize_dataset(dataset):
    """Tokenize the ``description`` column of a batch of examples.

    Intended to be applied with ``Dataset.map(..., batched=True)``, in which
    case ``dataset`` is one batch and ``dataset["description"]`` is the list of
    texts to encode.

    Descriptions are truncated to at most 512 tokens but deliberately NOT
    padded here: the ``DataCollatorWithPadding`` given to the ``Trainer`` pads
    each batch to its own longest sequence, so short descriptions are not run
    through the model at the full 512-token length.

    Returns:
        The tokenizer output for the batch; ``map`` adds its fields as columns.
    """
    return tokenizer(dataset["description"], truncation=True, max_length=512)

# Tokenize each split in batched mode (processed in the order test, train, val).
ds_test, ds_train, ds_val = (
    split.map(tokenize_dataset, batched=True)
    for split in (ds_test, ds_train, ds_val)
)

In [None]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding

def compute_metrics(eval_pred):
    """Return classification accuracy for a ``Trainer`` evaluation pass.

    ``eval_pred`` unpacks into the model logits and the reference label ids;
    the predicted class is the arg-max over the last (class) axis.
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {"accuracy": float((predictions == labels).mean())}


# Collates examples into batches, padding each batch to its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="model_output",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=epochs,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# No evaluation strategy is configured in TrainingArguments, so the Trainer does
# not score eval_dataset during training; evaluate the validation split
# explicitly and display its loss and accuracy as the cell output.
trainer.evaluate()