In [11]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
import numpy as np
import torch

In [2]:
# Load and encode labels
df = pd.read_csv("../data/feedback_cleaned.csv")
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])

# Preview label mapping
label_map = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label Map:", label_map)

# Rename columns for Hugging Face
df = df[['clean_text', 'label_encoded']].rename(columns={'clean_text': 'text', 'label_encoded': 'label'})

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2)


Label Map: {'complaint': np.int64(0), 'feature_request': np.int64(1), 'praise': np.int64(2), 'question': np.int64(3), 'technical_issue': np.int64(4)}


In [3]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(example):
    return tokenizer(example["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/118 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [7]:
num_labels = len(df['label'].unique())

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", 
    num_labels=num_labels
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
import sys
print(sys.executable)


D:\DS-Project\email-feedback-classifier\env\Scripts\python.exe


In [6]:
import transformers
print(transformers.__version__)


4.52.4


In [13]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",  # Added to match eval_strategy
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True,
)


In [15]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics 
)


  trainer = Trainer(


In [16]:
eval_result = trainer.evaluate()
print("Evaluation Results:", eval_result)


Evaluation Results: {'eval_loss': 1.6061993837356567, 'eval_model_preparation_time': 0.003, 'eval_accuracy': 0.2, 'eval_precision': 0.33722222222222226, 'eval_recall': 0.2, 'eval_f1': 0.15989648033126294, 'eval_runtime': 18.5696, 'eval_samples_per_second': 1.616, 'eval_steps_per_second': 0.215}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [24]:
# Create a folder and save model + tokenizer
model_path = "distilbert-email-feedback-model"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)


('distilbert-email-feedback-model\\tokenizer_config.json',
 'distilbert-email-feedback-model\\special_tokens_map.json',
 'distilbert-email-feedback-model\\vocab.txt',
 'distilbert-email-feedback-model\\added_tokens.json',
 'distilbert-email-feedback-model\\tokenizer.json')

In [25]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np

# Load saved model and tokenizer
model_path = "distilbert-email-feedback-model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Set model to eval mode
model.eval()


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [18]:
def predict_feedback(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()
        return le.inverse_transform([predicted_class])[0]

# Example
predict_feedback("The app keeps crashing when I upload a file.")


'question'

In [19]:
# Create a DataFrame with predictions
feedback_texts = df["text"].tolist()
predicted_labels = []

for text in feedback_texts:
    label = predict_feedback(text)
    predicted_labels.append(label)

# Save results
df["predicted_label"] = predicted_labels
df.to_csv("predicted_feedback.csv", index=False)
