<a href="https://colab.research.google.com/github/Paulmeryan/Gittests/blob/master/demo_finetuning_BERT_on_category_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installing dependencies...


In [None]:
!pip install transformers



Import packages.

In [None]:
import random
import re
import string

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

**A few text-cleaning functions (not used in this notebook).**

In [None]:
def remove_punct(text):
    """Return ``text`` with every ASCII punctuation character removed.

    The characters that are dropped are exactly those listed in
    ``string.punctuation``; all other characters are kept as they are.

    Requires the standard-library ``string`` module to be imported in the
    notebook's import cell (it was missing, which made this function raise
    ``NameError`` on first use).
    """
    # Third argument of str.maketrans lists the characters to delete.
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)

# Compiled once at definition time rather than on every call.
# The previous catch-all range U+24C2..U+1F251 also covered CJK ideographs,
# kana, Hangul and many other scripts, so ordinary non-Latin text was deleted
# together with the emoji. It is replaced by the symbol blocks listed below.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"  # dingbats
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled latin capital letter M
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics / ideographs
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Only characters from the symbol blocks listed in ``_EMOJI_PATTERN`` are
    deleted; letters of any script (including CJK) are preserved.
    """
    return _EMOJI_PATTERN.sub(r'', text)


def remove_html(text):
    """Delete every ``<...>`` span from ``text`` and return the result."""
    # Non-greedy `.*?` stops at the first closing bracket, so each tag is
    # removed on its own instead of everything between the first `<` and
    # the last `>`.
    return re.sub(r'<.*?>', r'', text)


def text_clean(x):
    """Apply a light normalisation pass to a piece of text.

    Steps, in order:
      1. lowercase the text;
      2. drop every non-ASCII character;
      3. replace ``http://`` / ``https://`` links with a space;
      4. remove apostrophe suffixes such as ``'s``;
      5. remove every token that contains a digit;
      6. collapse runs of whitespace into a single space;
      7. replace a lone punctuation character surrounded by whitespace
         with a single space;
      8. replace a lone letter surrounded by whitespace with a single space.

    Leading/trailing whitespace is not stripped.

    Args:
        x: the input string.

    Returns:
        The cleaned string.
    """
    ### Light
    x = x.lower()  # lowercase everything
    x = x.encode('ascii', 'ignore').decode()  # remove non-ASCII characters

    # Remove links. The earlier patterns `https*\S+` / `http*\S+` used `*`
    # where `?` was meant and were not tied to a scheme separator, so they
    # also ate the tail of ordinary words containing "htt" (e.g. "lighttpd").
    x = re.sub(r'https?://\S+', ' ', x)

    # cleaning up text
    x = re.sub(r'\'\w+', '', x)        # apostrophe suffixes: it's -> it
    x = re.sub(r'\w*\d+\w*', '', x)    # any token containing a digit
    x = re.sub(r'\s{2,}', ' ', x)      # collapse whitespace runs

    # A stand-alone punctuation mark becomes a single space; substituting the
    # empty string here glued the neighbouring words together
    # ("foo - bar" -> "foobar").
    x = re.sub(r'\s[^\w\s]\s', ' ', x)

    # Stand-alone single letters. Stand-alone digits need no extra pattern:
    # every digit-bearing token has already been removed above.
    x = re.sub(r'\s[a-z]\s', ' ', x)

    return x

**load data.**

In [None]:
# Do not truncate rows when pandas objects are printed.
pd.set_option('display.max_rows', None)

# Define the file path
data_path = "/content/drive/MyDrive/Colab Notebooks/sampledata/INT_GBR_AMZN_2024-10-03_SPOT_training2.csv"

# Only these columns are read from the file.
fields = ['id','CREATIVE_CATEGORY_NAME', 'CREATIVE_NAME','T1','T2']

# Fraction of data rows to keep. 1.0 keeps every row; lower it (e.g. 0.01 for
# roughly 1% of the lines) to work on a random subset for testing purposes.
p = 1.0

# Load the CSV file into a pandas DataFrame.
# The header (row 0) is always kept; each other row is skipped with
# probability 1 - p. Malformed lines produce a warning instead of an error.
data_df = pd.read_csv(
    data_path,
    header=0,
    usecols=fields,
    sep=';',
    on_bad_lines='warn',
    skiprows=lambda i: i > 0 and random.random() > p,
)

print(data_df.head())
print('number of rows: ', data_df.shape[0])
print('number unique categories: ', data_df.CREATIVE_CATEGORY_NAME.nunique())
print(data_df.CREATIVE_CATEGORY_NAME.unique())

**build dataset from dataframe**

In [None]:
# Build the model inputs from the dataframe.
# Rows without a product name or without a category cannot be used: a missing
# name is a float NaN that the tokenizer rejects, and a missing category makes
# the label column a mix of strings and floats that LabelEncoder cannot sort.
labelled_df = data_df.dropna(subset=['CREATIVE_NAME', 'CREATIVE_CATEGORY_NAME'])
n_dropped = len(data_df) - len(labelled_df)
if n_dropped:
    print(f"Dropped {n_dropped} rows with a missing name or category")

# List of product descriptions (forced to str so the tokenizer always gets text)
texts = labelled_df['CREATIVE_NAME'].astype(str).tolist()
# List of corresponding class labels (text)
labels_text = labelled_df['CREATIVE_CATEGORY_NAME'].tolist()

# Encode text labels into integer representations
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels_text)

# Save the label encoder's classes so predicted indices can be mapped back to
# category names at inference time
np.save("/content/drive/MyDrive/Colab Notebooks/sampledata/INT_GBR_AMZN_2024-10-03_SPOT_training2_label_encoder_classes_1.npy", label_encoder.classes_)



# Define your dataset class
class CustomDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: indexable collection of input strings.
        labels: indexable collection of labels, aligned with ``texts``.
        tokenizer: callable accepting the text plus the ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors`` keyword
            arguments and returning a mapping with ``input_ids`` and
            ``attention_mask`` tensors.
        max_length: length every sequence is truncated / padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        # One item per text.
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the flattened token tensors and the label tensor for item ``idx``."""
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer(
            sample_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        # The tokenizer output is flattened to 1-D so the DataLoader can
        # stack items into a batch.
        item = {name: encoded[name].flatten() for name in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(sample_label, dtype=torch.long)
        return item



# Hold out 20% of the samples for validation; the fixed random_state makes the
# split identical on every run.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    random_state=42,
    test_size=0.2,
)

**load model and fine tune..**

In [None]:
# Seed torch so the classifier-head initialisation, dropout and batch
# shuffling are repeatable between runs.
SEED = 42
torch.manual_seed(SEED)

# Initialize BERT tokenizer and model.
# The number of classes is taken from the fitted label encoder rather than
# from the `labels` array, so re-running this cell cannot pick up a stale or
# rebound `labels` variable.
num_classes = len(label_encoder.classes_)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)

# Create dataset and dataloaders
train_dataset = CustomDataset(train_texts, train_labels, tokenizer, max_length=128)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer, max_length=128)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=64)

# Define optimizer.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
# weight_decay are set explicitly to the values the transformers version used
# by default, so the update rule is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
# NOTE: an earlier version built a StepLR(step_size=1, gamma=0.1) scheduler
# here but never called scheduler.step(), so training always ran with a
# constant learning rate. That constant-rate behaviour is kept, now explicitly.

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

num_epochs = 4
for epoch in range(num_epochs):
    # Training
    print('Epoch: ',epoch)
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named `batch_labels` so the notebook-level `labels` array (the
        # encoded targets of the whole dataset) is not overwritten.
        batch_labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Training Loss: {avg_train_loss}")

    # Validation
    model.eval()
    val_preds = []
    val_targets = []
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits
            val_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            # Targets never leave the CPU; they are only needed for scoring.
            val_targets.extend(batch['labels'].numpy())

    val_accuracy = accuracy_score(val_targets, val_preds)
    print(f"Epoch {epoch+1}/{num_epochs}, Validation Accuracy: {val_accuracy}")

# Save the fine-tuned model
model.save_pretrained("/content/drive/MyDrive/Colab Notebooks/sampledata/fine_tuned_bert_model_amazon3b")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:  0
Epoch 1/4, Average Training Loss: 4.3142495104061656
Epoch 1/4, Validation Accuracy: 0.494786411032627
Epoch:  1
Epoch 2/4, Average Training Loss: 2.4606086252838053
Epoch 2/4, Validation Accuracy: 0.5923309788092835
Epoch:  2
Epoch 3/4, Average Training Loss: 1.7580551554759343
Epoch 3/4, Validation Accuracy: 0.6387487386478304
Epoch:  3
Epoch 4/4, Average Training Loss: 1.353524170054864
Epoch 4/4, Validation Accuracy: 0.6646485031954255


**Running inference with the fine-tuned model:**

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np
from sklearn.preprocessing import LabelEncoder
import re
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Restore the fine-tuned classifier from Drive and place it on `device`
# (Module.to returns the module itself)
model = BertForSequenceClassification.from_pretrained(
    "/content/drive/MyDrive/Colab Notebooks/sampledata/fine_tuned_bert_model_amazon3b"
).to(device)

# The tokenizer is the stock bert-base-uncased one
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Rebuild the label encoder from the class names saved during training.
# NOTE(review): allow_pickle=True unpickles the file's contents, which can run
# arbitrary code - only load .npy files that you created yourself.
label_encoder = LabelEncoder()
label_encoder.classes_ = np.load(
    "/content/drive/MyDrive/Colab Notebooks/sampledata/INT_GBR_AMZN_2024-10-03_SPOT_training2_label_encoder_classes_1.npy",
    allow_pickle=True,
)

# Define a function to classify product descriptions
def classify_product(description):
    """Predict the category label for a single product description.

    Uses the notebook-level ``tokenizer``, ``model``, ``device`` and
    ``label_encoder``.

    Args:
        description: the product description text to classify.

    Returns:
        The entry of ``label_encoder.classes_`` at the index of the highest
        logit.
    """
    # Tokenize the description (truncated / padded to 128 tokens)
    tokens = tokenizer(description, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
    # Move the inputs to the same device the model was moved to. A hard-coded
    # 'cuda' here raises "Found no NVIDIA driver" on CPU-only runtimes.
    tokens = {key: value.to(device) for key, value in tokens.items()}

    # Perform inference without tracking gradients
    with torch.no_grad():
        logits = model(**tokens).logits

    # Index of the highest-scoring class for the (single) input row
    predicted_class_index = int(torch.argmax(logits[0]))

    # Map class index to text label
    return label_encoder.classes_[predicted_class_index]

# Example product descriptions to classify with the fine-tuned model.
# Per the original author's note, this first list is synthetic text produced by an LLM.
new_product_descriptions = [
    "Kabcifea 157 Pcs Mini Acrylic Paint Set,22 Sets Acrylic Paint Strips in 12 Colors with 2 Paint Tray,Small Acrylic Paint Set with 20 Pcs Paintbrushes Perfect for Home Birthday Classroom Party Favors",
    "RadiantRelax GlowZen Facial Massager with Vibrations and Warm LED Glow",
    "EcoSweep Sustainable Bamboo Broom for Eco-Friendly Cleanin",
    "TruFit Adjustable Posture Corrector Brace for Pain Relief",
    "PetPamper Luxurious Plush Pet Bed for Cats and Dogs of All Size",
    "TechTune Wireless Bluetooth Earbuds with Superior Sound Quality and Stable Connection",
    "CulinaryCraft Organic Bamboo Cutting Board Set",
    "FitFlex Versatile Resistance Band Set",
    "DreamSense Stylish Aromatherapy Diffuser",
    "GardenGlow Solar String Lights",
    "2 x Dough Cleaning Cloth, Mesh Cloth, Dough Cleaner, Dough Remover, Mesh Dish Cloth, Cleaning Cloth, Double Pack",
    "ERGONOW Cast Iron Chain Scrubber with Fine Ring - 316 Stainless Steel Scouring Pad Pan Cleaner (Standard)"

]
# Per the original author's note: real Amazon examples that the model has not seen.
# NOTE: the classification loop below iterates over `new_product_descriptions`;
# pass this list instead to try these examples.
new_product_descriptionsX = [
    "Garden of Life Vitamin D, Vitamin Code Raw D3, Vitamin D 5,000 IU, Raw Whole Food Vitamin D Supplements with Chlorella, Fruit, Veggies & Probiotics for Bone & Immune Health. 60 Vegetarian Capsules",
    "Aivituvin Cat House Outdoor Feral Cat Enclosure, Weatherproof Warm Kitty Shelter with Large Balcony, Escape Doors",
    "Why Not Natural Liquid Collagen for Women and Men with Biotin - Marine Collagen Elixir Plus Biotin Drops Supplements for Hair Growth, Skin, and Nails",
    "Logitech G502 Lightspeed Wireless Gaming Mouse with Hero 25K Sensor, PowerPlay Compatible, Tunable Weights and Lightsync RGB - Black",
    "Bartender's Secret Sauce Old Fashioned Mix - Makes 64 Cocktails - Handcrafted Old Fashioned Syrup with Bitters, Orange, Cherry, Organic Cane Sugar - More Complex than Bitters and Simple Syrup - 16-Ounce 1 Pack",
    "Lollipop Suckers - 3 Pounds - Classic Flat Lollipops Individually Wrapped - Tiger Pops - Candy Lollipops for Kids, Doctors Office - Split Flavor Suckers",
    "Japanese Beef Wagyu Ribeye Steak - approx. 1 lb / 454 g - A5 Grade 100% Wagyu from Miyazaki Japan",
    "Electric Milk Frother with Stand, Handheld Electric Foam Maker, Waterproof, Stainless Steel Whisk…",
    "MEGNYA Women's Comfortable Walking Sandals with Arch Support, Athletic Hiking Sandals with Handmade Straps, Outdoor Soft Water Sandals for Beach Poolside Travel Camping",
    "RAYAN Men Arabian Perfume - Oud Modern Eau De Parfum - Long Lasting Perfume for Men - Oud & Grapefruit Perfume with Cardamom, Lavender, & Sandalwood - Ideal Gift for All Occasions - 100 mL Perfume",
    "toolant Impact Hex Head Allen Wrench Drill Bit Set 20pcs (Metric&SAE), 1/4 Hex-Shank S2 Steel Hex Bits Set, CNC Machined Tips with Magnetism, 4 Long with Storage Box",
    "Cat toys including a virtual mouse for playing"
]

# Classify each example in turn, then print the description followed by the
# predicted category on the next line
for idx, description in enumerate(new_product_descriptions):
    predicted_label = classify_product(description)
    print(
        f"Product Description {idx+1}: '{description}'",
        f"Predicted Label: {predicted_label}",
        sep="\n",
    )


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [None]:
!pip install transformers-interpret
from transformers_interpret import SequenceClassificationExplainer

In [None]:
# Attribution explainer built on the fine-tuned model and its tokenizer
multiclass_explainer = SequenceClassificationExplainer(model=model, tokenizer=tokenizer)

# Product descriptions to explain; each one gets its own attribution run
texts_to_explain = [
    "Logitech G502 Lightspeed Wireless Gaming Mouse with Hero 25K Sensor, PowerPlay Compatible, Tunable Weights and Lightsync RGB - Black",
    "toolant Impact Hex Head Allen Wrench Drill Bit Set 20pcs (Metric&SAE), 1/4 Hex-Shank S2 Steel Hex Bits Set, CNC Machined Tips with Magnetism, 4 Long with Storage Box",
    "Cat toys including a virtual mouse for playing",
]

for text in texts_to_explain:
    # Calling the explainer computes the word attributions for `text`;
    # visualize() is then called for that same run. After the loop,
    # `word_attributions` and `html` hold the results of the last text.
    word_attributions = multiclass_explainer(text=text)
    html = multiclass_explainer.visualize()


# Load the label encoder
label_encoder = LabelEncoder()
label_encoder.classes_ = np.load("/content/drive/MyDrive/Colab Notebooks/sampledata/INT_GBR_AMZN_2024-10-03_SPOT_training2_label_encoder_classes_1.npy", allow_pickle=True)
# 941 is the class index reported as LABEL_941 for the last example in the
# recorded output; print its category name.
print(label_encoder.classes_[941])

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
207.0,LABEL_207 (0.49),LABEL_207,2.03,"[CLS] log ##ite ##ch g ##50 ##2 lights ##peed wireless gaming mouse with hero 25 ##k sensor , power ##play compatible , tuna ##ble weights and lights ##yn ##c r ##gb - black [SEP]"
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
292.0,LABEL_292 (0.78),LABEL_292,2.48,"[CLS] tool ##ant impact he ##x head allen wren ##ch drill bit set 20 ##pc ##s ( metric & sa ##e ) , 1 / 4 he ##x - shan ##k s ##2 steel he ##x bits set , cn ##c machine ##d tips with magnet ##ism , 4 long with storage box [SEP]"
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
941.0,LABEL_941 (0.81),LABEL_941,0.89,[CLS] cat toys including a virtual mouse for playing [SEP]
,,,,


PET SUPPLIES;CATS
