In [8]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from torch.optim.lr_scheduler import ReduceLROnPlateau



In [9]:
BATCH_SIZE = 8
LR = 3e-5
NUM_EPOCHS = 10
EARLY_STOPPING_PATIENCE = 5

# BATCH_SIZE = 8
# LR = 3e-5
# NUM_EPOCHS = 10

# second best
# BATCH_SIZE = 16
# LR = 3e-5
# NUM_EPOCHS = 10

# BATCH_SIZE = 16
# LR = 2e-5
# NUM_EPOCHS = 10


# BATCH_SIZE = 16
# LR = 5e-5
# NUM_EPOCHS = 10

#best
# BATCH_SIZE = 8
# LR = 2e-5
# NUM_EPOCHS = 10
# EARLY_STOPPING_PATIENCE = 5

# BATCH_SIZE = 16
# LR = 1e-5
# NUM_EPOCHS = 10

# BATCH_SIZE = 8
# LR = 1e-5
# NUM_EPOCHS = 10

In [10]:
data = pd.read_csv('/Users/vladcalomfirescu/Desktop/MyFiles/DEV/ML/Veridion-Project/data1/all_data.csv')
data['Label'] = (data['Label'] == 'PRODUCT').astype(int)
data['Text'] = data['Text'].str.title()

print(data.head(3))

                                                Text  Label
0         Factory Buys 32Cm Euro Top Mattress - King      1
1  Savannah Grey Bed Frame Fabric Gas Lift Storag...      1
2  Azalea Led Bed Frame Pu Leather Gas Lift Stora...      1


In [11]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenized_data = tokenizer(list(data['Text']), truncation=True, padding=True, return_tensors='pt', max_length=128)

labels = torch.tensor(list(data['Label']))

dataset = torch.utils.data.TensorDataset(tokenized_data['input_ids'], tokenized_data['attention_mask'], labels)

train_dataset, val_dataset = train_test_split(dataset, test_size=0.1, random_state=42)


In [12]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
optimizer = AdamW(model.parameters(), lr=LR)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=True)
loss_fn = torch.nn.BCEWithLogitsLoss()

best_val_loss = float('inf')
early_stopping_counter = 0



In [14]:
for epoch in range(NUM_EPOCHS):
        model.train()
        total_loss = 0

        for batch in train_loader:
            input_ids, attention_mask, labels = batch

            optimizer.zero_grad()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels.unsqueeze(1).float())
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f'Epoch {epoch + 1}/{NUM_EPOCHS}, Loss: {total_loss / len(train_loader)}')

        model.eval()
        val_predictions = []
        val_labels = []

        with torch.no_grad():
            for batch in val_loader:
                input_ids, attention_mask, labels = batch

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                predictions = torch.sigmoid(outputs.logits.squeeze())

                val_predictions.extend(predictions.cpu().numpy())
                val_labels.extend(labels.cpu().numpy())

        val_loss = loss_fn(torch.tensor(val_predictions).squeeze(), torch.tensor(val_labels).float())

        scheduler.step(val_loss)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            early_stopping_counter = 0
        else:
            early_stopping_counter += 1
            if early_stopping_counter >= EARLY_STOPPING_PATIENCE:
                print(f'Early stopping after {epoch + 1} epochs')
                break


Epoch 1/10, Loss: 0.06799278510734438
Epoch 2/10, Loss: 0.02727182968519628
Epoch 3/10, Loss: 0.01643283860757947
Epoch 4/10, Loss: 0.014377604820765555
Epoch 5/10, Loss: 0.013140137866139412
Epoch 00005: reducing learning rate of group 0 to 3.0000e-06.
Epoch 6/10, Loss: 0.011712776636704803
Epoch 7/10, Loss: 0.011168195377103984
Early stopping after 7 epochs


In [15]:
model.save_pretrained('furniture_model1')

In [19]:
loaded_model = BertForSequenceClassification.from_pretrained('furniture_model1')

def predict_text(model, text, threshold=0.5):
    model.eval()
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors='pt', max_length=128)
    outputs = loaded_model(**inputs)
    prediction = torch.sigmoid(outputs.logits.squeeze()).item()

    print(f"Predicted Probability: {prediction}")

    if prediction >= threshold:
        print("Predicted Label: Furniture")
    else:
        print("Predicted Label: Not Furniture")

predict_text(loaded_model, "Azalea Bed Frame", threshold=0.73)

Predicted Probability: 0.739781379699707
Predicted Label: Furniture
