<a href="https://colab.research.google.com/github/SF001-123456/the404thinkers/blob/main/model_training/DistilBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Stage 1: Binary Classification

### Setup and Prepare Data

In [None]:
# Install library
!pip install transformers[touch] datasets evaluate huggingface_hub
# !pip install --upgrade transformers



In [None]:
import os
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from huggingface_hub import login

# Load data
# Mount Google Drive so the CSV under MyDrive is reachable from the Colab VM.
from google.colab import drive
drive.mount('/content/drive')

DRIVE_PATH = '/content/drive/MyDrive/datathon/comment data/'
BALANCE_FILE = os.path.join(DRIVE_PATH, 'balanced(34290).csv')

df = pd.read_csv(BALANCE_FILE)
# Report the file that was actually read (the message used to name a different CSV).
print(f"Loaded {len(df)} rows from {os.path.basename(BALANCE_FILE)}")

Mounted at /content/drive
Loaded 68580 rows from merged_comments.csv


In [None]:
# Keep only the comment text and the binary target, under the column names the
# later tokenization/training cells read ('text', 'label').
# rename() skips source names that are absent, so re-running this cell after
# `df` was already renamed is a no-op instead of a KeyError, and dropna()
# returns a new frame instead of mutating a column slice in place.
df = (
    df.rename(columns={'isProductRelated': 'label', 'cleaned_text': 'text'})
      [['text', 'label']]
      .dropna(subset=['text'])  # rows without text cannot be tokenized
)

# Split data into training and testing (0.2), stratified on the label so both
# splits keep the same class ratio; the fixed seed makes the split repeatable.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

# Convert pd DF to Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Combine into DatasetDict
ds = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

print(ds)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 53760
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 13441
    })
})


### Preprocessing and Tokenization

In [None]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer here and the model cells below
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def preprocess_function(examples):
  """Tokenize the "text" field of a batch.

  Every sequence is truncated and padded to exactly 128 tokens, so all
  examples come out with the same length.
  """
  encoded = tokenizer(
      examples["text"],
      max_length=128,
      padding="max_length",
      truncation=True,
  )
  return encoded

# Tokenize both splits in batches; the new columns are added next to the existing ones
tokenized_ds = ds.map(preprocess_function, batched=True)

print(tokenized_ds)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/53760 [00:00<?, ? examples/s]

Map:   0%|          | 0/13441 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 53760
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 13441
    })
})


### Define Model and Training Arguments (without LoRA)

In [None]:
import numpy as np
import evaluate
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the four evaluation metrics once, in a fixed order
accuracy, precision, recall, f1 = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

def compute_metrics(eval_pred):
  """Turn a (scores, labels) pair into accuracy / precision / recall / F1.

  The predicted class is the argmax of the scores along axis 1; precision,
  recall and F1 are computed with average="binary".
  """
  scores, references = eval_pred
  predicted = np.argmax(scores, axis=1)
  shared = {"predictions": predicted, "references": references}

  return {
      "accuracy": accuracy.compute(**shared)["accuracy"],
      "precision": precision.compute(average="binary", **shared)["precision"],
      "recall": recall.compute(average="binary", **shared)["recall"],
      "f1": f1.compute(average="binary", **shared)["f1"],
  }

# Define labels
# Human-readable names for the two class indices; both directions are handed
# to from_pretrained below.
id2label = {0: "Not Product Related", 1: "Product Related"}
label2id = {"Not Product Related": 0, "Product Related": 1}

# Load DistilBERT (model_checkpoint) with a 2-label sequence-classification head
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id
)

# Define HF Hub model repo
hub_model_id = "sainoforce/distilbert-base-product-related"

# Define Training Argument
# NOTE(review): the Hub repo id is also used as the local output directory, and
# the LoRA section further down reuses the same id — confirm that sharing one
# repo/directory between the two runs is intended.
# NOTE(review): metric_for_best_model is not set, so load_best_model_at_end
# relies on the Trainer's default criterion (presumably eval loss — confirm).
training_args = TrainingArguments(
    output_dir=hub_model_id,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    eval_strategy="epoch", # Evaluate at the end of each epoch
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",
    push_to_hub=True, # This will automatically push the model to the Hub
)

# Create Trainer
# Trains on the 'train' split and evaluates on the 'test' split of tokenized_ds.
# NOTE(review): the warning emitted at this call in the recorded output may be
# the `tokenizer=` deprecation in recent transformers releases — confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Start Training
trainer.train()

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0155,0.007315,0.99881,0.99927,0.998396,0.998833
2,0.0043,0.006174,0.998586,0.997961,0.999271,0.998616
3,0.0001,0.007619,0.998958,0.998688,0.999271,0.99898


TrainOutput(global_step=10080, training_loss=0.01337463751775171, metrics={'train_runtime': 1894.1859, 'train_samples_per_second': 85.145, 'train_steps_per_second': 5.322, 'total_flos': 5341085513809920.0, 'train_loss': 0.01337463751775171, 'epoch': 3.0})

### Define Model and Training Arguments (with LoRA)

In [None]:
# Install PEFT; the next cell imports get_peft_model, LoraConfig and TaskType from it
!pip install peft



In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType
import numpy as np
import evaluate

# Load Tokenizer and Model
# NOTE(review): this `model` is created without id2label/label2id and the next
# cell assigns `model` again before it is used — this load looks redundant; confirm.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Load the DistilBERT model first
# Relies on id2label / label2id defined in the earlier "without LoRA" cell, so
# that cell must have been run in this session.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

# Print the model structure to see the layer names
# (the names printed here, e.g. q_lin / v_lin, are what LoRA's target_modules refers to)
print(model)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
# Setup LoRA
# Adapters are attached to the q_lin and v_lin linear layers shown in the
# model printout above.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, # Sequence Classification
    r=8,                        # Rank (can experiment with 16)
    lora_alpha=16,              # Scaling factor (often 2x rank)
    target_modules=["q_lin", "v_lin"], # Target the attention layers
    lora_dropout=0.1,
)

# Wrap the model with LoRA
# `model` is rebound to the PEFT-wrapped model; the call below prints how many
# parameters remain trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 739,586 || all params: 67,694,596 || trainable%: 1.0925


In [None]:
# Define Training Arguments
# NOTE(review): same repo id / output directory as the full fine-tune above —
# confirm the LoRA run is meant to share that Hub repo.
hub_model_id = "sainoforce/distilbert-base-product-related"

# Evaluation and checkpointing both happen once per epoch; the checkpoint with
# the best "f1" (as reported by compute_metrics) is reloaded when training ends.
training_args = TrainingArguments(
    output_dir=hub_model_id,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_strategy="epoch",
    eval_strategy="epoch",  # It's good to evaluate to track progress
    load_best_model_at_end=True,  # Important for getting the best version
    metric_for_best_model="f1",   # Use F1 as the deciding metric
    save_total_limit=2,
    report_to="none",
    push_to_hub=True, # Push to Hub at the end
)

# Define Metrics
accuracy_metric, precision_metric, recall_metric, f1_metric = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

def compute_metrics(eval_pred):
    """Score binary predictions: accuracy plus binary-averaged precision/recall/F1.

    `eval_pred` unpacks into (logits, labels); the predicted class is the
    argmax over the last axis of the logits.
    """
    raw_scores, gold = eval_pred
    predicted = np.argmax(raw_scores, axis=-1)

    report = {
        'accuracy': accuracy_metric.compute(predictions=predicted, references=gold)['accuracy'],
    }
    # Same call shape for the three class-sensitive metrics, evaluated in order
    for key, metric in (('precision', precision_metric), ('recall', recall_metric), ('f1', f1_metric)):
        report[key] = metric.compute(predictions=predicted, references=gold, average='binary')[key]
    return report

# Create Trainer
# Reuses tokenized_ds from the Stage 1 tokenization cell; `model` is the
# LoRA-wrapped model produced by get_peft_model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds['train'], # Make sure you're using your tokenized datasets
    eval_dataset=tokenized_ds['test'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer, # Pass the tokenizer to save it with the model
)

# Train model
trainer.train()

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1412,0.09143,0.968976,0.958565,0.981627,0.969959
2,0.0596,0.069879,0.979763,0.970832,0.990085,0.980364


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1412,0.09143,0.968976,0.958565,0.981627,0.969959
2,0.0596,0.069879,0.979763,0.970832,0.990085,0.980364
3,0.0702,0.066222,0.981251,0.973345,0.990376,0.981787


TrainOutput(global_step=10080, training_loss=0.13291056709630147, metrics={'train_runtime': 1380.806, 'train_samples_per_second': 116.801, 'train_steps_per_second': 7.3, 'total_flos': 5432692884111360.0, 'train_loss': 0.13291056709630147, 'epoch': 3.0})

### Evaluate and Push to HF

In [None]:
# Run evaluation on test set
# (evaluates on the eval_dataset given to the Trainer, i.e. tokenized_ds['test'])
final_metrics = trainer.evaluate()
print("Final Evaluation metrics:")
print(final_metrics)

Final Evaluation metrics:
{'eval_loss': 0.06622204929590225, 'eval_accuracy': 0.9812513949854922, 'eval_precision': 0.9733447979363714, 'eval_recall': 0.9903762029746281, 'eval_f1': 0.981786643538595, 'eval_runtime': 51.2716, 'eval_samples_per_second': 262.153, 'eval_steps_per_second': 16.403, 'epoch': 3.0}


In [None]:
# Push final model, tokenizer, and training config to Hub
# Requires an authenticated Hugging Face session with write access to the repo.
trainer.push_to_hub()

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...e-product-related/training_args.bin: 100%|##########| 5.71kB / 5.71kB            

  ...t-related/adapter_model.safetensors: 100%|##########| 2.96MB / 2.96MB            

  ...e-product-related/model.safetensors:   6%|6         | 16.8MB /  268MB            

CommitInfo(commit_url='https://huggingface.co/junmeng-sf/distilbert-base-product-related/commit/ad15d9d388bdb49244d20c5a5f52b4031524694d', commit_message='End of training', commit_description='', oid='ad15d9d388bdb49244d20c5a5f52b4031524694d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/junmeng-sf/distilbert-base-product-related', endpoint='https://huggingface.co', repo_type='model', repo_id='junmeng-sf/distilbert-base-product-related'), pr_revision=None, pr_num=None)

### Inference

In [None]:
from transformers import pipeline

# Build a text-classification pipeline from the Hub repo that was just pushed.
# NOTE(review): the recorded output reports newly initialized classifier
# weights while loading — verify the fine-tuned head is actually restored.
classifier = pipeline("text-classification", model=hub_model_id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


In [None]:
import pandas as pd

# --- Sample comments ---
# Hand-written probes: clearly product-related comments first, then clearly
# unrelated ones, then deliberately ambiguous ones.
comments = [
    "This foundation blends so smoothly, perfect for my skin tone.",
    "The lipstick color is beautiful but it fades too quickly.",
    "I love how this moisturizer keeps my skin hydrated all day.",
    "Is this serum safe for sensitive skin?",
    "The packaging is cute but the mascara dries out too fast.",
    "This sunscreen leaves a white cast, not good for darker skin tones.",
    "Best shampoo I’ve tried, makes my hair feel so soft.",
    "I bought this face wash and it really reduced my acne.",
    "The fragrance of this body lotion is too strong for me.",
    "Does this eye cream actually reduce dark circles?",
    # Non-beauty
    "Great video, thanks for the explanation!",
    "The background music is really calming.",
    "I laughed so hard at the blooper at the end.",
    "Can you make a tutorial about studying tips?",
    "Subscribed, looking forward to your next video!",
    "I tried this new thing and it was okay…" ,
    "The packaging looks nice but I haven’t used it yet.",
    "Not sure if this really works as advertised.",
    "My skin felt weird after using it…",
    "This color is amazing!" ,
    "I usually don’t like these things, but this is different." ,
    "Feels smooth, but not sure if I’ll buy again.",
    "It smells strong, not really my style." ,
    "The effect is subtle, but noticeable." ,
    "Everyone says this is good, but I don’t see the difference."
]

# --- Bulk test with classifier ---
# One pipeline call per comment; each entry of `results` is whatever the
# pipeline returns for that single string.
results = [classifier(c) for c in comments]

# --- Save into DataFrame ---
# One row per comment, with the raw pipeline output kept in "prediction".
df_results = pd.DataFrame({
    "comment": comments,
    "prediction": results
})

print(df_results)

# df_results.to_csv("/content/drive/MyDrive/datathon/results/beauty_classifier_distilbert_results.csv", index=False)

                                              comment  \
0   This foundation blends so smoothly, perfect fo...   
1   The lipstick color is beautiful but it fades t...   
2   I love how this moisturizer keeps my skin hydr...   
3              Is this serum safe for sensitive skin?   
4   The packaging is cute but the mascara dries ou...   
5   This sunscreen leaves a white cast, not good f...   
6   Best shampoo I’ve tried, makes my hair feel so...   
7   I bought this face wash and it really reduced ...   
8   The fragrance of this body lotion is too stron...   
9   Does this eye cream actually reduce dark circles?   
10           Great video, thanks for the explanation!   
11            The background music is really calming.   
12       I laughed so hard at the blooper at the end.   
13       Can you make a tutorial about studying tips?   
14    Subscribed, looking forward to your next video!   
15            I tried this new thing and it was okay…   
16  The packaging looks nice bu

## Stage 2: Multi-Class Classification

### Refined Keyword List

In [None]:
# Keyword table used to weak-label product-related comments for Stage 2:
# category name -> lowercase keywords/phrases that vote for that category.
# Dict order matters: the labelling function keeps the first category that
# reaches the highest score, so earlier categories win ties.
category_keywords = {
    "makeup": [
        "makeup", "cosmetic", "foundation", "concealer", "primer", "blush", "bronzer",
        "highlighter", "powder", "setting spray", "lipstick", "lip gloss", "lip liner", "lip stain",
        "mascara", "eyeliner", "eyeshadow", "brow pencil", "fake lashes", "beauty blender"
    ],
    "skincare": [
        "skincare", "cleanser", "toner", "moisturizer", "lotion", "cream", "serum",
        "essence", "sunscreen", "sunblock", "spf", "sheet mask", "clay mask",
        "exfoliator", "peel", "retinol", "hyaluronic acid", "niacinamide", "vitamin c",
        "collagen", "peptide", "anti-aging", "acne", "blemish"
    ],
    "haircare": [
        "shampoo", "conditioner", "haircare", "hair mask", "hair oil", "hairspray",
        "mousse", "gel", "dry shampoo", "heat protectant", "keratin", "leave-in",
        "scalp treatment", "hair treatment"
    ],
    "haircolor": [
        "hair color", "hair dye", "dyeing", "bleach", "highlights", "roots", "box dye",
        "color-treated", "toner" # Note: toner can be skincare or haircolor, context matters. Keyword matching is imperfect.
    ],
    "fragrance": [
        "fragrance", "perfume", "cologne", "eau de toilette", "scent"
    ]
}

# Brand names for tagging. This list is reassigned (with extra entries) in the
# final inference section, which is where it is actually used.
brand_keywords = ["loreal", "l'oréal", "maybelline", "garnier", "nyx", "lancôme", "kiehl's", "cerave"]  # maybe can use this at final pipeline

### Data Preparation - Data Labelling

In [None]:
import pandas as pd
import os

# Load dataset
# Same balanced CSV as Stage 1, re-read so Stage 2 does not depend on the
# Stage 1 frame (Google Drive must still be mounted at /content/drive).
DRIVE_PATH = '/content/drive/MyDrive/datathon/comment data/'
BALANCE_FILE = os.path.join(DRIVE_PATH, 'balanced(34290).csv')

# Load the full balanced dataset
df_full = pd.read_csv(BALANCE_FILE)

# For Stage 2, only use comments that are product-related
df_product_related = df_full[df_full['isProductRelated'] == 1].copy()

# Only the cleaned text is needed; the category label is derived from it later
df_stage2 = df_product_related[['cleaned_text']].copy()

print(f"Original number of comments: {len(df_full)}")
print(f"Number of product-related comments for Stage 2: {len(df_stage2)}")

Original number of comments: 68580
Number of product-related comments for Stage 2: 34290


In [None]:
# Define labelling function
def assign_category_by_counts(text, keywords_by_category=None):
  """Weak-label a comment with the category whose keywords it mentions most.

  A keyword counts once per comment, and only when it starts at a word
  boundary. "gels" and "scented" still match "gel" / "scent", but "angel",
  "crescent" and "scream" no longer vote for haircare, fragrance or skincare
  the way plain substring matching made them.

  Args:
    text: The comment text. Anything that is not a str yields "unlabeled".
    keywords_by_category: Optional mapping of category name -> keyword list.
      Defaults to the notebook-level `category_keywords` table.

  Returns:
    The category with the highest number of distinct matching keywords, or
    "unlabeled" when nothing matches. On ties the category that comes first
    in the mapping wins.
  """
  import re  # local import keeps this cell self-contained

  if not isinstance(text, str):
    return "unlabeled"

  if keywords_by_category is None:
    keywords_by_category = category_keywords

  text_lower = text.lower()

  max_score = 0
  best_category = "unlabeled" # Default if no keywords are found
  for category, keywords in keywords_by_category.items():
    # (?<!\w) requires the keyword to start a word; re.escape keeps spaces and
    # hyphens inside multi-word keywords literal.
    score = sum(
        1
        for keyword in keywords
        if re.search(r"(?<!\w)" + re.escape(keyword.lower()), text_lower)
    )
    # Strictly greater, so the earliest category keeps a tie
    if score > max_score:
      max_score = score
      best_category = category

  return best_category

# Apply the function to create our new 'category' label column
# (one category name per comment, or "unlabeled" when no keyword matched)
df_stage2['category'] = df_stage2['cleaned_text'].apply(assign_category_by_counts)

In [None]:
# Distribution of new labels
print("Category distribution:")
print(df_stage2['category'].value_counts())

# Filter out comments that failed to label
# Can be caused if comment was marked isProductRelated=1 by a generic term
# Such as beauty
df_stage2_labeled = df_stage2[df_stage2['category'] != 'unlabeled'].copy()

print(f"\nTotal labeled comments for Stage 2 training: {len(df_stage2_labeled)}")
print(f"Dropped {len(df_stage2) - len(df_stage2_labeled)} unlabeled comments.")

# Sanity Check a few samples
# Show the first labeled comment of every category; a category that received
# no comments is reported instead of raising IndexError on .iloc[0].
print("\n--- Sanity Check ---")
for category in category_keywords.keys():
    print(f"\nExample for category: {category}")
    category_comments = df_stage2_labeled.loc[df_stage2_labeled['category'] == category, 'cleaned_text']
    if category_comments.empty:
        print("(no labeled comments for this category)")
        continue
    example_comment = category_comments.iloc[0]
    print(example_comment)

Category distribution:
category
makeup       18294
unlabeled     7449
skincare      4855
haircare      1512
haircolor     1221
fragrance      959
Name: count, dtype: int64

Total labeled comments for Stage 2 training: 26841
Dropped 7449 unlabeled comments.

--- Sanity Check ---

Example for category: makeup
foundation stick shade please

Example for category: skincare
girlies like boohoo bf wont skincare love

Example for category: haircare
girl ok deep condition wig brush conditioner use warm water washing curl youtube tutorial make sure look specifically synthetic hair wig

Example for category: haircolor
girl seen like video different hair color

Example for category: fragrance
thats fragrance clerk ghetto like


### Prepare Data for Model

In [None]:
# Import Library
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import numpy as np
import evaluate

In [None]:
# Create integer labels from the 'category' column
# NOTE(review): the ids follow the order in which categories first appear in
# df_stage2_labeled, so the mapping depends on row order. Any inference code
# must use exactly the mapping printed below, not a hand-written one.
labels = df_stage2_labeled['category'].unique().tolist()
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for i, label in enumerate(labels)}
df_stage2_labeled['label'] = df_stage2_labeled['category'].map(label2id)

print("--- Label Mappings ---")
print(id2label)

--- Label Mappings ---
{0: 'makeup', 1: 'haircare', 2: 'skincare', 3: 'fragrance', 4: 'haircolor'}


In [None]:
# Prepare the final columns needed: 'text' and 'label'
df_for_training = df_stage2_labeled[['cleaned_text', 'label']].rename(columns={'cleaned_text': 'text'})

# Split the data into training and testing sets (80/20 split)
# Stratified on the label so both splits keep the same category proportions;
# the fixed random_state makes the split repeatable.
train_df, test_df = train_test_split(
    df_for_training, test_size=0.2, random_state=42, stratify=df_for_training['label']
)

# Convert to Hugging Face Dataset objects
# (train_df / test_df / train_dataset / test_dataset replace the Stage 1
# objects of the same names)
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
category_ds = DatasetDict({'train': train_dataset, 'test': test_dataset})
print("\n--- Final Dataset for Stage 2 ---")
print(category_ds)


--- Final Dataset for Stage 2 ---
DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 21472
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 5369
    })
})


### Tokenization

In [None]:
# Stage 2 starts again from the plain DistilBERT checkpoint and its tokenizer
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def preprocess_function(examples):
    """Tokenize the "text" field of a batch, truncated and padded to 128 tokens."""
    encoded = tokenizer(
        examples["text"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    return encoded

# Batched tokenization of both splits of the category dataset
tokenized_category_ds = category_ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/21472 [00:00<?, ? examples/s]

Map:   0%|          | 0/5369 [00:00<?, ? examples/s]

### Define Multi-Class Model with LoRA

In [None]:
# One output per category; id2label / label2id are the Stage 2 mappings built
# from the 'category' column above.
num_labels = len(labels) # This will be 5
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

# Same LoRA settings as the Stage 1 adapter: rank 8, alpha 16, dropout 0.1,
# applied to the q_lin and v_lin layers.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["q_lin", "v_lin"],
)
# `model` is rebound to the PEFT-wrapped model
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 741,893 || all params: 67,699,210 || trainable%: 1.0959


### Train Model

In [None]:
# Hub repo for the category classifier (also used as the local output directory)
category_model_id = "sainoforce/distilbert-base-category-classifier"

# Define training_args
# Evaluate and checkpoint once per epoch; the checkpoint with the best "f1"
# reported by compute_metrics is reloaded when training ends.
training_args = TrainingArguments(
    output_dir=category_model_id,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="none",
    push_to_hub=True,
)

# Define Metrics
accuracy_metric, precision_metric, recall_metric, f1_metric = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

def compute_metrics(eval_pred):
    """Score multi-class predictions.

    Accuracy is reported as-is; precision, recall and F1 use
    average='weighted' rather than the 'binary' averaging of Stage 1.
    """
    raw_scores, gold = eval_pred
    predicted = np.argmax(raw_scores, axis=-1)

    report = {
        'accuracy': accuracy_metric.compute(predictions=predicted, references=gold)['accuracy'],
    }
    # *** Multi-class: average='weighted' for every class-sensitive metric ***
    for key, metric in (('precision', precision_metric), ('recall', recall_metric), ('f1', f1_metric)):
        report[key] = metric.compute(predictions=predicted, references=gold, average='weighted')[key]
    return report

# Define Trainer
# Trains the LoRA-wrapped category model on the tokenized Stage 2 splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_category_ds['train'],
    eval_dataset=tokenized_category_ds['test'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5032,0.23365,0.927733,0.926507,0.927733,0.926703
2,0.1954,0.162305,0.948966,0.94867,0.948966,0.948721
3,0.154,0.148169,0.95176,0.951412,0.95176,0.951487


TrainOutput(global_step=4026, training_loss=0.329936290107079, metrics={'train_runtime': 553.0092, 'train_samples_per_second': 116.483, 'train_steps_per_second': 7.28, 'total_flos': 2170071669325824.0, 'train_loss': 0.329936290107079, 'epoch': 3.0})

### Evaluate and Push to Hub

In [None]:
# Run evaluation on test set
# (evaluates on the eval_dataset given to the Trainer, i.e. tokenized_category_ds['test'])
final_metrics = trainer.evaluate()
print("Final Evaluation metrics:")
print(final_metrics)

Final Evaluation metrics:
{'eval_loss': 0.14816902577877045, 'eval_accuracy': 0.9517601043024772, 'eval_precision': 0.9514121408446373, 'eval_recall': 0.9517601043024772, 'eval_f1': 0.9514870982633408, 'eval_runtime': 20.5618, 'eval_samples_per_second': 261.115, 'eval_steps_per_second': 16.341, 'epoch': 3.0}


In [None]:
# Upload the trained category classifier artifacts to the Hub repo named in output_dir
trainer.push_to_hub()

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...tegory-classifier/training_args.bin: 100%|##########| 5.78kB / 5.78kB            

  ...lassifier/adapter_model.safetensors: 100%|##########| 2.97MB / 2.97MB            

CommitInfo(commit_url='https://huggingface.co/junmeng-sf/distilbert-base-category-classifier/commit/183b9796bc465a501065c2003fb650273ffef309', commit_message='End of training', commit_description='', oid='183b9796bc465a501065c2003fb650273ffef309', pr_url=None, repo_url=RepoUrl('https://huggingface.co/junmeng-sf/distilbert-base-category-classifier', endpoint='https://huggingface.co', repo_type='model', repo_id='junmeng-sf/distilbert-base-category-classifier'), pr_revision=None, pr_num=None)

### Inference (Stage 1 + Stage 2 with Brand Tagging)

In [None]:
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel

# --- Define Hub Model IDs ---
hub_id_stage1 = "sainoforce/distilbert-base-product-related"
hub_id_stage2 = "sainoforce/distilbert-base-category-classifier"
base_model_name = "distilbert-base-uncased"

# --- Load Stage 1 Model ---
print("Loading Stage 1: Product Relevance Classifier...")
# 1. Load the correct base model architecture with the right number of labels
#    (no id2label is passed, so predictions come back as generic 'LABEL_<n>' strings)
base_model_stage1 = AutoModelForSequenceClassification.from_pretrained(
    base_model_name,
    num_labels=2  # For binary classification (isProductRelated or not)
)
# 2. Wrap the base model with the LoRA adapter from the Hub
#    (no merge call is made here; the adapter stays a wrapper around the base model)
model_stage1 = PeftModel.from_pretrained(base_model_stage1, hub_id_stage1)
# 3. Load the tokenizer associated with the model
tokenizer_stage1 = AutoTokenizer.from_pretrained(hub_id_stage1)
# 4. Now, create the pipeline with the fully-formed model and tokenizer
classifier_stage1 = pipeline("text-classification", model=model_stage1, tokenizer=tokenizer_stage1)

# --- Load Stage 2 Model ---
print("\nLoading Stage 2: Category Classifier...")
# 1. Load the base model, but this time with the correct number of category labels
#    (must equal the number of categories used at training time)
base_model_stage2 = AutoModelForSequenceClassification.from_pretrained(
    base_model_name,
    num_labels=5 # For your 5 categories (makeup, skincare, etc.)
)
# 2. Load the LoRA adapter for the category classifier
model_stage2 = PeftModel.from_pretrained(base_model_stage2, hub_id_stage2)
# 3. Load its tokenizer
tokenizer_stage2 = AutoTokenizer.from_pretrained(hub_id_stage2)
# 4. Create the second pipeline
classifier_stage2 = pipeline("text-classification", model=model_stage2, tokenizer=tokenizer_stage2)


print("\nModels loaded successfully!")

Loading Stage 1: Product Relevance Classifier...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0



Loading Stage 2: Category Classifier...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0



Models loaded successfully!


In [None]:
# Define Brand Keywords and Extraction Function
brand_keywords = [
    "loreal", "l'oréal", "maybelline", "garnier", "nyx", "lancôme",
    "kiehl's", "cerave", "infallible", "elvive" # You can add product line names too!
]

def extract_brands(text, brands):
    """Return the entries of `brands` that are mentioned in `text`.

    Matching is case-insensitive, ignores accents ("lancome" matches
    "lancôme"), treats typographic apostrophes like plain ones ("Kiehl’s"
    matches "kiehl's"), and requires the brand to start at a word boundary so
    that "onyx" does not count as a mention of "nyx".

    Args:
        text: Comment text. Anything that is not a str is treated as having
            no brand mention.
        brands: Iterable of brand / product-line names to look for.

    Returns:
        The matching entries, spelled and ordered as in `brands`, or
        ["Brand Not Specified"] when none is found.
    """
    import re
    import unicodedata

    def _normalize(value):
        # Lowercase, straighten curly apostrophes, then drop combining accents
        value = value.lower().replace("\u2019", "'").replace("\u2018", "'")
        decomposed = unicodedata.normalize("NFKD", value)
        return "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    if not isinstance(text, str):
        return ["Brand Not Specified"]

    haystack = _normalize(text)
    mentioned_brands = [
        brand
        for brand in brands
        # (?<!\w): the brand must not be glued to a preceding word character
        if re.search(r"(?<!\w)" + re.escape(_normalize(brand)), haystack)
    ]
    # If the list is empty, return a default value
    return mentioned_brands if mentioned_brands else ["Brand Not Specified"]

In [None]:
# Index -> category names for the Stage 2 classifier.
# This must be the exact mapping the model was trained with, i.e. the one
# printed under "--- Label Mappings ---" in the training section. The previous
# hand-written order swapped skincare/haircare and fragrance/haircolor, which
# is why moisturizers were reported as haircare and shampoo as skincare.
stage2_id2label = {
    0: 'makeup',
    1: 'haircare',
    2: 'skincare',
    3: 'fragrance',
    4: 'haircolor'
}

# Create End-to-End Pipeline Function
def analyze_comment(comment_text):
    """Run one comment through relevance -> category -> brand tagging.

    Args:
        comment_text: The raw comment string.

    Returns:
        A dict with "comment", "is_related" ("Yes"/"No"), "category",
        "brands" and "relevance_score". Product-related comments additionally
        carry "category_score"; unrelated ones get "N/A" for category and
        brands and skip Stage 2 entirely.
    """
    # Stage 1: Is the comment product-related?
    stage1_result = classifier_stage1(comment_text)[0]

    # The pipeline returns a string like 'LABEL_1'. We need to extract the number.
    label_id_stage1 = int(stage1_result['label'].split('_')[1])

    # The gatekeeper: if not product-related, we're done.
    if label_id_stage1 == 0: # class 0 is "Not Product Related"
        return {
            "comment": comment_text,
            "is_related": "No",
            "category": "N/A",
            "brands": "N/A",
            "relevance_score": stage1_result['score']
        }

    # If it IS product-related, proceed to Stage 2
    # Stage 2: What category is it?
    stage2_result = classifier_stage2(comment_text)[0]
    # Same 'LABEL_<n>' format as Stage 1; map the index to its category name.
    label_id_stage2 = int(stage2_result['label'].split('_')[1])
    category = stage2_id2label[label_id_stage2]

    # Stage 3: Which brands are mentioned?
    brands = extract_brands(comment_text, brand_keywords)

    return {
        "comment": comment_text,
        "is_related": "Yes",
        "category": category,
        "brands": ", ".join(brands), # Join list into a clean string
        "relevance_score": stage1_result['score'],
        "category_score": stage2_result['score']
    }

In [None]:
# Test Sample Comments
# Same probes as the Stage 1 check, with brand names added to two of them so
# the brand tagging has something to find.
comments = [
    "This foundation blends so smoothly, perfect for my skin tone.",
    "The lipstick color is beautiful but it fades too quickly.",
    "I love how this moisturizer keeps my skin hydrated all day.",
    "Is this serum safe for sensitive skin?",
    "The packaging is cute but the mascara dries out too fast.",
    "This sunscreen leaves a white cast, not good for darker skin tones.",
    "Best L'Oréal Elvive shampoo I’ve tried, makes my hair feel so soft.", # Added brands
    "I bought this CeraVe face wash and it really reduced my acne.", # Added brand
    "The fragrance of this body lotion is too strong for me.",
    "Does this eye cream actually reduce dark circles?",
    "Great video, thanks for the explanation!",
    "The background music is really calming.",
    "I laughed so hard at the blooper at the end.",
    "Can you make a tutorial about studying tips?",
    "Subscribed, looking forward to your next video!",
    "I tried this new thing and it was okay…" ,
    "The packaging looks nice but I haven’t used it yet.",
    "Not sure if this really works as advertised.",
    "My skin felt weird after using it…",
    "This color is amazing!" ,
    "I usually don’t like these things, but this is different." ,
    "Feels smooth, but not sure if I’ll buy again.",
    "It smells strong, not really my style." ,
    "The effect is subtle, but noticeable." ,
    "Everyone says this is good, but I don’t see the difference."
]

print("\n--- Analyzing Comments with the Full Pipeline ---")
# One result dict per comment, produced by the end-to-end function above
full_results = [analyze_comment(c) for c in comments]

# This creates a much more informative and clean final output
# (rows for unrelated comments have no "category_score" key, so that column is
# empty for them)
df_pipeline_results = pd.DataFrame(full_results)

print(df_pipeline_results)

# Save your final results to a new CSV
# NOTE(review): assumes the results/ folder already exists on the mounted Drive — confirm.
output_path = "/content/drive/MyDrive/datathon/results/stage1_stage2_distilbert_results.csv"
df_pipeline_results.to_csv(output_path, index=False)

print(f"\nResults successfully saved to {output_path}")


--- Analyzing Comments with the Full Pipeline ---
                                              comment is_related   category  \
0   This foundation blends so smoothly, perfect fo...        Yes     makeup   
1   The lipstick color is beautiful but it fades t...        Yes     makeup   
2   I love how this moisturizer keeps my skin hydr...        Yes   haircare   
3              Is this serum safe for sensitive skin?        Yes   haircare   
4   The packaging is cute but the mascara dries ou...        Yes     makeup   
5   This sunscreen leaves a white cast, not good f...        Yes   haircare   
6   Best L'Oréal Elvive shampoo I’ve tried, makes ...        Yes   skincare   
7   I bought this CeraVe face wash and it really r...        Yes   haircare   
8   The fragrance of this body lotion is too stron...        Yes  haircolor   
9   Does this eye cream actually reduce dark circles?        Yes     makeup   
10           Great video, thanks for the explanation!         No        N/A   
1