In [97]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
import torch
from tqdm import tqdm


In [100]:
# Read the labelled examples from a local CSV.
data = pd.read_csv("/Users/vladcalomfirescu/Desktop/MyFiles/DEV/ML/Veridion-Project/data/traning_data.csv")  # Replace with your actual file path

# Keep only the rows labelled PRODUCT and attach a constant numeric target.
# .assign() returns a new DataFrame, so the column is never written into a
# filtered slice of `data` (avoids pandas' SettingWithCopyWarning).
# NOTE(review): every retained row gets Target == 1, so the split below holds a
# single class only -- confirm that this is intended.
product_data = data[data['Label'] == 'PRODUCT'].assign(Target=1)

# 80/20 train/validation split with a fixed seed so reruns give the same rows.
train_data, val_data = train_test_split(product_data, test_size=0.2, random_state=42)
print(train_data.head(3))


                                                Text    Label  Target
76                 Guardsman Gold Complete Plus Plan  PRODUCT       1
42  Handmade Artist 's Passion Engineer 's Integrity  PRODUCT       1
49                       COFFEE TABLE | Lulu Tallira  PRODUCT       1


In [35]:
# Tokenizer and classification model are both created from the same checkpoint name.
# NOTE(review): num_labels=1 requests a one-output head; confirm this matches how
# the logits are interpreted at inference time.
checkpoint_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
model = BertForSequenceClassification.from_pretrained(checkpoint_name, num_labels=1)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
# Pull the raw texts and numeric targets out of the train / validation frames.
train_texts = train_data['Text'].tolist()
train_labels = train_data['Target'].tolist()
val_texts = val_data['Text'].tolist()
val_labels = val_data['Target'].tolist()

# Tokenize and encode the training data (truncation and padding on, at most 64 tokens).
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=64, return_tensors='pt')
# Targets are stored as floats, matching the one-output head requested with num_labels=1.
train_labels = torch.tensor(train_labels, dtype=torch.float)

# Tokenize and encode the validation data
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=64, return_tensors='pt')
val_labels = torch.tensor(val_labels, dtype=torch.float)

# DataLoaders: every dataset item is an (input_ids, attention_mask, label) triple.
train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
val_dataset = torch.utils.data.TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], val_labels)

# Only the training batches are shuffled; validation order stays fixed.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=8, shuffle=False)

# Training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated.
# eps and weight_decay are pinned to that class's defaults (1e-6 and 0.0) so the
# optimisation hyperparameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

epochs = 3

In [37]:
# Fine-tune for `epochs` full passes over the training loader.
for epoch in range(epochs):
    model.train()  # switch to training mode at the start of every epoch
    for batch in train_loader:
        # Each batch holds (input_ids, attention_mask, labels); move all three to `device`.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        # Clear gradients left over from the previous step before the new forward pass.
        optimizer.zero_grad()

        # Passing `labels` makes the model return the loss alongside its other outputs.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

In [19]:
# Write the model first, then the tokenizer, into the same 'bert_model' directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("bert_model")
print("Model saved to 'bert_model'")

Model saved to 'bert_model'


In [47]:
# Reload the tokenizer and the classifier from the local 'bert_model' directory,
# replacing whatever `tokenizer` / `model` objects are currently bound in the kernel.
saved_model_dir = 'bert_model'
tokenizer = BertTokenizer.from_pretrained(saved_model_dir)
model = BertForSequenceClassification.from_pretrained(saved_model_dir)

def predict_text(text, threshold=0.9998):
    """Score `text` with the module-level `model` and threshold the PRODUCT probability.

    Args:
        text: Input handed directly to the module-level `tokenizer`. Only the score
            of the first encoded sequence is returned.
        threshold: Minimum probability required for the label 1 to be returned.

    Returns:
        A tuple `(predicted_label, predicted_proba)`. `predicted_label` is 1 when
        `predicted_proba >= threshold` and 0 otherwise; `predicted_proba` is a
        Python float.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=64)

    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(**inputs).logits

    if logits.shape[-1] == 1:
        # One-output head (e.g. a model created with num_labels=1): there is no
        # class-1 column, so indexing [0][1] would raise IndexError. The lone
        # output is used as the score and clamped into [0, 1].
        # NOTE(review): assumes the head was fitted as a regression onto 0/1
        # targets -- if it was trained with a sigmoid/BCE objective, apply
        # torch.sigmoid here instead. TODO confirm.
        predicted_proba = logits[0, 0].clamp(0.0, 1.0).item()
    else:
        predicted_proba = torch.softmax(logits, dim=1)[0][1].item()  # Probability for the 'PRODUCT' class

    predicted_label = 1 if predicted_proba >= threshold else 0

    return predicted_label, predicted_proba

# Run the classifier on one sample string and report the outcome.
text_to_predict = "Azalea LED Bed Frame PU Leather Gas Lift Storage - Black King"
predicted_label, predicted_proba = predict_text(text_to_predict)

# Both messages print the same probability value; only the verdict wording differs.
print(
    f"The model predicts: '{text_to_predict}' is a PRODUCT with probability {predicted_proba:.4f}."
    if predicted_label == 1
    else f"The model predicts: '{text_to_predict}' is NOT a PRODUCT with probability {predicted_proba:.4f}."
)

The model predicts: 'Azalea LED Bed Frame PU Leather Gas Lift Storage - Black King' is a PRODUCT with probability 0.9998.
