In [14]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
import numpy as np
import torch

In [15]:
# Load the cleaned feedback data and integer-encode the string class labels
df = pd.read_csv("../data/feedback_cleaned.csv")
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])

# Preview label mapping (class name -> encoded id)
label_map = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label Map:", label_map)

# Keep only the two columns used downstream, renamed to "text" (read by the
# tokenization cell) and "label" (read when sizing the classification head)
df = df[['clean_text', 'label_encoded']].rename(columns={'clean_text': 'text', 'label_encoded': 'label'})

# Convert to Hugging Face Dataset and hold out 20% of the rows for evaluation.
# The split shuffles the rows; a fixed seed keeps the train/test partition
# identical across kernel restarts so evaluation results are reproducible.
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2, seed=42)


Label Map: {'complaint': np.int64(0), 'feature_request': np.int64(1), 'praise': np.int64(2), 'question': np.int64(3), 'technical_issue': np.int64(4)}


In [16]:
# Tokenizer for the same checkpoint that the model cells load
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(example):
    """Tokenize the ``"text"`` field of ``example``.

    Inputs are truncated and padded with ``padding="max_length"``, so every
    encoded row comes back with the same length. Called through
    ``dataset.map(..., batched=True)`` below, so ``example`` is presumably a
    batch of rows rather than a single row.
    """
    texts = example["text"]
    return tokenizer(texts, truncation=True, padding="max_length")


# Apply the tokenizer to every split of the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/118 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [17]:
# Build id <-> class-name mappings from the fitted LabelEncoder. The encoded id
# of a class is its position in `le.classes_`, so enumerate() reproduces the
# mapping printed in the label-encoding cell.
id2label = {idx: str(name) for idx, name in enumerate(le.classes_)}
label2id = {name: idx for idx, name in id2label.items()}
num_labels = len(id2label)

# Load DistilBERT with a classification head of `num_labels` outputs. Passing
# the mappings stores the class names in the model config, so predictions can
# be reported by name instead of by bare integer id.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# NOTE(review): the preceding cell also builds `model` from this checkpoint;
# this later definition is the one that stays bound to `model`, so one of the
# two cells can likely be removed.

# Build id <-> class-name mappings from the fitted LabelEncoder. The encoded id
# of a class is its position in `le.classes_`, so enumerate() reproduces the
# mapping printed in the label-encoding cell.
id2label = {idx: str(name) for idx, name in enumerate(le.classes_)}
label2id = {name: idx for idx, name in id2label.items()}
num_labels = len(id2label)

# Load DistilBERT with a classification head of `num_labels` outputs. Passing
# the mappings stores the class names in the model config, so predictions can
# be reported by name instead of by bare integer id.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Environment check: show which Python interpreter backs this kernel
import sys

print(f"{sys.executable}")


d:\DS-Project\email-feedback-classifier\env\Scripts\python.exe


In [20]:
# Environment check: report the installed transformers release
import transformers

print(f"{transformers.__version__}")


4.52.4


In [21]:
from transformers import TrainingArguments

# Fine-tuning configuration for the Trainer
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",
    # Evaluate and checkpoint on the same per-epoch schedule; the two
    # strategies are kept aligned because the best checkpoint is reloaded
    # once training finishes
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Optimisation hyperparameters
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)
