In [1]:
import pandas as pd

df = pd.read_csv("data/trustpilot_company_descriptions.csv")
df.head()

Unnamed: 0,category,company,description
0,Animals & Pets,ruffandtumbledogcoats.com,At Ruff and Tumble we are proud to be the mark...
1,Animals & Pets,protect-mypet.com,A truly tailored solution to parasite protecti...
2,Animals & Pets,vetscriptions.co.uk,We care about your pets and believe that they ...
3,Animals & Pets,animal-health.co.uk,"With market leading products, numerous awards ..."
4,Animals & Pets,www.travellingpet.vet,I am a veterinary surgeon qualified to complet...


In [2]:
len(df)

1680

In [3]:
df["category"].value_counts(normalize=True)

category
Restaurants & Bars              0.059524
Food, Beverages & Tobacco       0.055357
Business Services               0.052976
Sports                          0.051786
Education & Training            0.051190
Hobbies & Crafts                0.050000
Home Services                   0.049405
Animals & Pets                  0.049405
Public & Local Services         0.047619
Legal Services & Government     0.046429
Events & Entertainment          0.045238
Home & Garden                   0.045238
Health & Medical                0.045238
Beauty & Well-being             0.042857
Money & Insurance               0.041667
Electronics & Technology        0.041071
Utilities                       0.040476
Shopping & Fashion              0.039881
Construction & Manufacturing    0.039881
Vehicles & Transportation       0.035714
Media & Publishing              0.035119
Travel & Vacation               0.033929
Name: proportion, dtype: float64

In [4]:
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Encode categories
label_encoder = LabelEncoder()
df['labels'] = label_encoder.fit_transform(df['category'])

# TODO: make this a 3 split (train, val, test)
df_train, df_test = train_test_split(df, test_size=0.2, stratify=df['labels'], random_state=42)
df_train, df_val = train_test_split(df_train, test_size=0.2, stratify=df_train['labels'], random_state=42)

ds_train = Dataset.from_pandas(df_train)
ds_val = Dataset.from_pandas(df_val)
ds_test = Dataset.from_pandas(df_test)

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# hyperparameters
num_classes = len(label_encoder.classes_)
epochs = 10


model = AutoModelForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=num_classes)
tokenizer = AutoTokenizer.from_pretrained("google/bert_uncased_L-2_H-128_A-2")


def tokenize_dataset(dataset):
    return tokenizer(dataset["description"], padding="max_length", truncation=True, max_length=128)

ds_test = ds_test.map(tokenize_dataset, batched=True)
ds_train = ds_train.map(tokenize_dataset, batched=True)
ds_val = ds_val.map(tokenize_dataset, batched=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/336 [00:00<?, ? examples/s]

Map:   0%|          | 0/1075 [00:00<?, ? examples/s]

Map:   0%|          | 0/269 [00:00<?, ? examples/s]

In [13]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="model_output",
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=epochs,
    push_to_hub=False,
    use_cpu=True,
    dataloader_num_workers=1,
    eval_strategy="steps",
    eval_steps=100,
    logging_first_step=True,
    load_best_model_at_end=True,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,2.0287,2.313041,0.427509,0.395632,0.475868,0.427509
200,2.0287,2.183827,0.475836,0.430103,0.478064,0.475836
300,2.0287,2.092008,0.472119,0.429508,0.484208,0.472119
400,2.0287,2.006464,0.524164,0.485656,0.517542,0.524164
500,1.9407,1.919686,0.542751,0.502907,0.546129,0.542751
600,1.9407,1.866227,0.561338,0.532848,0.597899,0.561338
700,1.9407,1.82565,0.572491,0.54081,0.594204,0.572491
800,1.9407,1.777943,0.576208,0.548036,0.596566,0.576208
900,1.9407,1.750502,0.579926,0.554305,0.628854,0.579926
1000,1.339,1.734135,0.591078,0.564576,0.638187,0.591078


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=1350, training_loss=1.501299044644391, metrics={'train_runtime': 80.6562, 'train_samples_per_second': 133.282, 'train_steps_per_second': 16.738, 'total_flos': 3435734400000.0, 'train_loss': 1.501299044644391, 'epoch': 10.0})

In [11]:
# evaluate on test set
trainer.evaluate(ds_test)

{'eval_loss': 2.627720832824707,
 'eval_accuracy': 0.38095238095238093,
 'eval_f1': 0.38095238095238093,
 'eval_precision': 0.38095238095238093,
 'eval_recall': 0.38095238095238093,
 'eval_runtime': 0.491,
 'eval_samples_per_second': 684.253,
 'eval_steps_per_second': 85.532,
 'epoch': 10.0}