In [77]:
import pandas as pd

df = pd.read_csv("data/trustpilot_company_descriptions.csv")
df.head()

Unnamed: 0,category,company,description
0,Animals & Pets,ruffandtumbledogcoats.com,At Ruff and Tumble we are proud to be the mark...
1,Animals & Pets,protect-mypet.com,A truly tailored solution to parasite protecti...
2,Animals & Pets,vetscriptions.co.uk,We care about your pets and believe that they ...
3,Animals & Pets,animal-health.co.uk,"With market leading products, numerous awards ..."
4,Animals & Pets,www.travellingpet.vet,I am a veterinary surgeon qualified to complet...


In [78]:
len(df)

1680

In [82]:
df["category"].value_counts(normalize=True)

category
Restaurants & Bars              0.059524
Food, Beverages & Tobacco       0.055357
Business Services               0.052976
Sports                          0.051786
Education & Training            0.051190
Hobbies & Crafts                0.050000
Home Services                   0.049405
Animals & Pets                  0.049405
Public & Local Services         0.047619
Legal Services & Government     0.046429
Events & Entertainment          0.045238
Home & Garden                   0.045238
Health & Medical                0.045238
Beauty & Well-being             0.042857
Money & Insurance               0.041667
Electronics & Technology        0.041071
Utilities                       0.040476
Shopping & Fashion              0.039881
Construction & Manufacturing    0.039881
Vehicles & Transportation       0.035714
Media & Publishing              0.035119
Travel & Vacation               0.033929
Name: proportion, dtype: float64

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# https://research.google/blog/a-fast-wordpiece-tokenization-system/
from tokenizers import Tokenizer
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")

import torch.nn as nn

label_encoder = LabelEncoder()

def wordpiece_tokenize(text, max_length=128, truncation=True):
	
	# Tokenize the text using WordPiece
	# https://huggingface.co/docs/tokenizers/quicktour
	all_ids = tokenizer.encode(text).ids

	if len(all_ids) > max_length and truncation:
		all_ids = all_ids[:max_length]
	# Pad the sequence to max_length
	if len(all_ids) < max_length:
		all_ids += [0] * (max_length - len(all_ids))
	# Convert to tensor
	all_ids = torch.tensor(all_ids, dtype=torch.long)

	return {
		'input_ids': all_ids,
	}

# Encode categories
df['category_encoded'] = label_encoder.fit_transform(df['category'])

# Tokenize descriptions
class TextDataset(Dataset):
	def __init__(self, descriptions, labels, tokenizer, max_len=128):
		self.descriptions = descriptions
		self.labels = labels
		self.tokenizer = tokenizer
		self.max_len = max_len

	def __len__(self):
		return len(self.descriptions)

	def __getitem__(self, idx):
		text = self.descriptions[idx]
		label = self.labels[idx]
		encoding = self.tokenizer(
			text,
			max_length=self.max_len,
		)
		return {
			'input_ids': encoding['input_ids'].squeeze(0),
			'label': torch.tensor(label, dtype=torch.long)
		}

# sample initially for dev: df = df.sample(frac=0.1, random_state=42)
descriptions = df['description'].tolist()
labels = df['category_encoded'].tolist()

# TODO: make this a 3 split (train, val, test)
X_train, X_test, y_train, y_test = train_test_split(descriptions, labels, test_size=0.2, stratify=labels, random_state=42)

train_dataset = TextDataset(X_train, y_train, wordpiece_tokenize)
test_dataset = TextDataset(X_test, y_test, wordpiece_tokenize)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [80]:
from tqdm import tqdm

# Define LSTM with Attention
class SimpleLSTM(nn.Module):
	def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
		super(SimpleLSTM, self).__init__()
		self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
		self.dropout = nn.Dropout(0.5)
		self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
		self.fc = nn.Linear(hidden_dim, num_classes)

	# TODO: 
	# - update the forward pass to use the max/mean of all hidden states
	# - stack more LSTM layers, and use a bidirectional LSTM
	def forward(self, input_ids):
		embedded = self.dropout(self.embedding(input_ids))
		lstm_out, _ = self.lstm(embedded)
		output = self.fc(lstm_out[:,-1])
		return output

# Model, Loss, Optimizer
# https://huggingface.co/google-bert/bert-base-uncased#preprocessing
vocab_size = tokenizer.get_vocab_size()
embed_dim = 128
hidden_dim = 128
num_classes = len(label_encoder.classes_)
epochs = 5

model = SimpleLSTM(vocab_size, embed_dim, hidden_dim, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training Loop
# later: use gpu for training by `device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')`
device = torch.device('cpu')
model.to(device)

SimpleLSTM(
  (embedding): Embedding(30522, 128, padding_idx=0)
  (dropout): Dropout(p=0.5, inplace=False)
  (lstm): LSTM(128, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=22, bias=True)
)

In [81]:

for epoch in range(epochs):
	model.train()
	total_loss = 0
	for batch in tqdm(train_loader):
		input_ids = batch['input_ids'].to(device)
		labels = batch['label'].to(device)

		optimizer.zero_grad()
		outputs = model(input_ids)
		loss = criterion(outputs, labels)
		loss.backward()
		optimizer.step()
		total_loss += loss.item()

	print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")
	# TODO: validate the model after each epoch

# Validation
model.eval()
correct = 0
total = 0
with torch.no_grad():
	for batch in test_loader:
		input_ids = batch['input_ids'].to(device)
		labels = batch['label'].to(device)

		outputs = model(input_ids)
		_, predicted = torch.max(outputs, 1)
		total += labels.size(0)
		correct += (predicted == labels).sum().item()

# TODO: the quality of the model is not good at the moment
# - log training and validation loss to understand the training process
# - try out different learning rates, batch sizes, number of epochs, add early stopping
print(f"Test Accuracy: {correct / total:.2f}")

100%|██████████| 42/42 [00:26<00:00,  1.57it/s]


Epoch 1, Loss: 3.09078506061009


100%|██████████| 42/42 [00:16<00:00,  2.55it/s]


Epoch 2, Loss: 3.0617327860423496


100%|██████████| 42/42 [00:20<00:00,  2.01it/s]


Epoch 3, Loss: 3.031206204777672


100%|██████████| 42/42 [00:18<00:00,  2.31it/s]


Epoch 4, Loss: 2.9967587902432395


100%|██████████| 42/42 [00:23<00:00,  1.77it/s]


Epoch 5, Loss: 2.960792002223787
Test Accuracy: 0.09
