In [9]:
import pandas as pd
import torch
import numpy as np
import random
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch.optim as optim

In [4]:
# Notebook-wide configuration: label schema, data/model identifiers,
# hyper-parameters, RNG seed and compute device.

# The 12 label columns read from the CSV. The model further down is configured
# for multi-label classification with one output per entry in this list.
# NOTE(review): the earlier comment mentioned a "37-class dataset"; nothing in
# this notebook uses that figure - confirm what it referred to.
LABEL_COLUMNS = [
    'Power_Failure',
    'Battery_Charging',
    'Display_Visual',
    'Audio_Sound',
    'Overheating_Thermal',
    'Connectivity_Signal',
    'Water_Liquid_Damage',
    'Mechanical_Motor',
    'Input_Controls',
    'Software_Error',
    'Data_Storage',
    'Sensor_Accuracy'
]

CSV_FILE = r"electronics_complaints_1200.csv"
MODEL_NAME = 'distilbert-base-uncased'
MAX_LEN = 128          # tokenizer pads/truncates every text to this many tokens
BATCH_SIZE = 8
EPOCHS = 20
LEARNING_RATE = 2e-5
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed every RNG the notebook relies on so that Restart & Run All reproduces
# the same augmentations (`random`), batch order and classifier-head
# initialisation (`torch`). numpy is seeded for completeness.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

print(f"Using device: {DEVICE}")

Using device: cuda


In [5]:
class TextAugmenter:
    """
    Simple word-level text augmentation.

    Two perturbations are available:
    1. Random Swap: swaps the positions of two randomly chosen words.
    2. Random Deletion: drops each word independently with probability p.

    Args:
        swap_prob: Probability that ``augment`` applies a random swap.
        delete_prob: Probability that ``augment`` applies a random deletion.
            With the remaining probability the text is returned unchanged.
        rng: Optional ``random.Random`` instance used for every random draw.
            When omitted, the global ``random`` module is used, so seeding it
            with ``random.seed(...)`` keeps working exactly as before.

    Raises:
        ValueError: If a probability is negative or the two sum to more than 1.
    """
    def __init__(self, swap_prob=0.3, delete_prob=0.3, rng=None):
        if swap_prob < 0 or delete_prob < 0 or swap_prob + delete_prob > 1:
            raise ValueError(
                "swap_prob and delete_prob must be non-negative and sum to at most 1"
            )
        self.swap_prob = swap_prob
        self.delete_prob = delete_prob
        # Store only the optional Random instance (never the module itself) so
        # the augmenter stays picklable, e.g. for multi-worker DataLoaders.
        self._rng = rng

    def _source(self):
        """Return the object that supplies random draws (injected RNG or the global module)."""
        return random if self._rng is None else self._rng

    def random_swap(self, sentence, n=1):
        """Swap two distinct word positions, ``n`` times; texts with < 2 words are returned as-is."""
        words = sentence.split()
        if len(words) < 2:
            return sentence
        rng = self._source()
        for _ in range(n):
            idx1, idx2 = rng.sample(range(len(words)), 2)
            words[idx1], words[idx2] = words[idx2], words[idx1]
        return ' '.join(words)

    def random_deletion(self, sentence, p=0.15):
        """Drop each word with probability ``p``; always keeps at least one word."""
        words = sentence.split()
        if len(words) <= 1:
            return sentence
        rng = self._source()
        new_words = [w for w in words if rng.uniform(0, 1) > p]
        # Make sure we don't delete everything: fall back to one random word.
        if len(new_words) == 0:
            return words[rng.randint(0, len(words) - 1)]
        return ' '.join(new_words)

    def augment(self, text):
        """Return ``text`` swapped, word-deleted, or unchanged, chosen at random.

        With the default settings: 30% chance to swap, 30% chance to delete,
        40% chance to keep the original.
        """
        r = self._source().uniform(0, 1)
        if r < self.swap_prob:
            return self.random_swap(text)
        elif r < self.swap_prob + self.delete_prob:
            return self.random_deletion(text)
        return text

In [6]:
class ComplaintDataset(Dataset):
    """
    Torch dataset that tokenizes one text per item and pairs it with its label row.

    Args:
        texts: pandas Series of texts (anything with ``reset_index``/``iloc``).
        labels: pandas DataFrame with one row of label values per text.
        tokenizer: Hugging Face tokenizer; it is called directly on each text.
        max_len: Length every encoded text is padded/truncated to.
        augment: When True, each text is passed through ``TextAugmenter.augment``
            every time it is fetched (intended for the training split only).

    Each item is a dict with 1-D tensors ``input_ids`` and ``attention_mask``
    and a float tensor ``labels`` built from the label row.
    """
    def __init__(self, texts, labels, tokenizer, max_len, augment=False):
        # Discard the incoming index so items can be addressed by position 0..len-1.
        self.texts = texts.reset_index(drop=True)
        self.labels = labels.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.augment = augment
        self.augmenter = TextAugmenter()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        # Positional lookup for both text and labels keeps the two in lock-step.
        text = str(self.texts.iloc[item])
        label = self.labels.iloc[item].values  # Convert row to numpy array

        # Apply Augmentation only if enabled (Training set)
        if self.augment:
            text = self.augmenter.augment(text)

        # Calling the tokenizer directly is the supported replacement for the
        # deprecated `encode_plus`; the arguments are unchanged.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # return_tensors='pt' adds a leading batch axis; flatten it away so
            # the DataLoader can stack items into a batch itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.float)
        }

In [7]:
print("Loading data...")
df = pd.read_csv(CSV_FILE)

# Inputs come from the 'text' column; targets are the columns named in LABEL_COLUMNS.
X, Y = df['text'], df[LABEL_COLUMNS]

# Stage 1: hold back 20% of the rows, leaving 80% for training.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Stage 2: cut the held-back rows in half -> validation and test (10% each overall).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

print(f"Data Split: Train={len(X_train)}, Valid={len(X_val)}, Test={len(X_test)}")

Loading data...
Data Split: Train=1023, Valid=128, Test=128


In [10]:
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

# One dataset per split; augmentation is switched on for the training split only.
train_dataset, val_dataset, test_dataset = (
    ComplaintDataset(texts, labels, tokenizer, MAX_LEN, augment=use_augmentation)
    for texts, labels, use_augmentation in (
        (X_train, y_train, True),
        (X_val, y_val, False),
        (X_test, y_test, False),
    )
)

# Only the training loader shuffles; validation and test keep their order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader, test_loader = (
    DataLoader(split, batch_size=BATCH_SIZE)
    for split in (val_dataset, test_dataset)
)

# DistilBERT with a multi-label head: one output per label column.
model = DistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(LABEL_COLUMNS),
    problem_type="multi_label_classification",
).to(DEVICE)

# AdamW plus a linear decay (no warm-up) spanning every optimizer step of the run.
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)
total_steps = EPOCHS * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
def train_epoch(model, loader, optimizer, scheduler, device):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Each batch must be a dict with ``input_ids``, ``attention_mask`` and
    ``labels`` tensors. The model is put in training mode, and for every batch
    the loss reported by the model is back-propagated, followed by one
    optimizer step and one scheduler step.
    """
    model = model.train()
    batch_losses = []
    tensor_keys = ("input_ids", "attention_mask", "labels")

    for batch in loader:
        # Move just the tensors the model consumes onto the target device.
        model_inputs = {key: batch[key].to(device) for key in tensor_keys}

        loss = model(**model_inputs).loss
        batch_losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()
        # Clear gradients so the next batch starts from a clean slate.
        optimizer.zero_grad()

    return np.mean(batch_losses)

def eval_model(model, loader, device):
    """Return the mean per-batch loss of ``model`` over ``loader`` without updating it.

    The model is switched to evaluation mode and every forward pass runs under
    ``torch.no_grad()``, so no gradients are tracked and no parameters change.
    Each batch must be a dict with ``input_ids``, ``attention_mask`` and
    ``labels`` tensors; the loss is the one the model itself reports.
    """
    model = model.eval()
    losses = []

    with torch.no_grad():
        for d in loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            losses.append(outputs.loss.item())

    return np.mean(losses)

# Main loop: one training pass, then a validation pass, reporting both losses per epoch.
for epoch in range(EPOCHS):
    train_loss = train_epoch(
        model=model,
        loader=train_loader,
        optimizer=optimizer,
        scheduler=scheduler,
        device=DEVICE,
    )
    val_loss = eval_model(model=model, loader=val_loader, device=DEVICE)
    print(f"Epoch {epoch + 1}/{EPOCHS} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

Epoch 1/20 | Train Loss: 0.4165 | Val Loss: 0.3453
Epoch 2/20 | Train Loss: 0.2951 | Val Loss: 0.2163
Epoch 3/20 | Train Loss: 0.1739 | Val Loss: 0.1182
Epoch 4/20 | Train Loss: 0.1066 | Val Loss: 0.0757
Epoch 5/20 | Train Loss: 0.0759 | Val Loss: 0.0564
Epoch 6/20 | Train Loss: 0.0578 | Val Loss: 0.0400
Epoch 7/20 | Train Loss: 0.0459 | Val Loss: 0.0318
Epoch 8/20 | Train Loss: 0.0387 | Val Loss: 0.0259
Epoch 9/20 | Train Loss: 0.0319 | Val Loss: 0.0204
Epoch 10/20 | Train Loss: 0.0286 | Val Loss: 0.0186
Epoch 11/20 | Train Loss: 0.0228 | Val Loss: 0.0167
Epoch 12/20 | Train Loss: 0.0223 | Val Loss: 0.0146
Epoch 13/20 | Train Loss: 0.0214 | Val Loss: 0.0127
Epoch 14/20 | Train Loss: 0.0191 | Val Loss: 0.0118
Epoch 15/20 | Train Loss: 0.0200 | Val Loss: 0.0113
Epoch 16/20 | Train Loss: 0.0193 | Val Loss: 0.0110
Epoch 17/20 | Train Loss: 0.0161 | Val Loss: 0.0103
Epoch 18/20 | Train Loss: 0.0148 | Val Loss: 0.0097
Epoch 19/20 | Train Loss: 0.0156 | Val Loss: 0.0096
Epoch 20/20 | Train L

In [12]:
# Score the fine-tuned model on the held-out test split.
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for d in test_loader:
        input_ids = d["input_ids"].to(DEVICE)
        attention_mask = d["attention_mask"].to(DEVICE)
        # Labels are only compared on the host, so they are left on the CPU
        # instead of being copied to the device and straight back.
        labels = d["labels"]

        outputs = model(input_ids, attention_mask=attention_mask)
        # Convert logits to independent per-label probabilities
        probs = torch.sigmoid(outputs.logits)
        # Threshold at 0.5: a label is predicted when its probability exceeds one half
        preds = (probs > 0.5).int()

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.numpy())

# NOTE(review): the captured report shows 1.00 for every label. That is unusual;
# worth checking the CSV for duplicate or near-duplicate texts that could end
# up in both the training and the test split - TODO confirm.
print(classification_report(np.array(all_labels), np.array(all_preds), target_names=LABEL_COLUMNS, zero_division=0))

# ==========================================
# 9. SAVE MODEL
# ==========================================
# The model and its tokenizer are written to the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./electronics_nlp_model")
print("Model saved to ./electronics_nlp_model")

                     precision    recall  f1-score   support

      Power_Failure       1.00      1.00      1.00        10
   Battery_Charging       1.00      1.00      1.00        15
     Display_Visual       1.00      1.00      1.00        14
        Audio_Sound       1.00      1.00      1.00        19
Overheating_Thermal       1.00      1.00      1.00        14
Connectivity_Signal       1.00      1.00      1.00        17
Water_Liquid_Damage       1.00      1.00      1.00        15
   Mechanical_Motor       1.00      1.00      1.00        11
     Input_Controls       1.00      1.00      1.00        12
     Software_Error       1.00      1.00      1.00        13
       Data_Storage       1.00      1.00      1.00        12
    Sensor_Accuracy       1.00      1.00      1.00        16

          micro avg       1.00      1.00      1.00       168
          macro avg       1.00      1.00      1.00       168
       weighted avg       1.00      1.00      1.00       168
        samples avg   