Final code (undersampled training set)

In [12]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
from transformers import DistilBertForSequenceClassification
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from imblearn.under_sampling import RandomUnderSampler

# Load the dataset
data = pd.read_csv("final_dataset.csv")
print(f"Original dataset size: {data.shape[0]}")

# Map labels to integers.
# Series.map() yields NaN for any value missing from the mapping, which would
# otherwise surface much later as a confusing stratify/tensor error, so
# validate up front and fail with the offending values.
label_mapping = {"chit-chat": 0, "topic-specific": 1}
mapped_labels = data["label"].map(label_mapping)
unmapped_rows = mapped_labels.isna()
if unmapped_rows.any():
    unknown_values = sorted(data.loc[unmapped_rows, "label"].astype(str).unique())
    raise ValueError(
        f"Labels not covered by label_mapping {sorted(label_mapping)}: {unknown_values}"
    )
data["label"] = mapped_labels.astype("int64")

# Train-test split (stratified so both splits keep the original class ratio)
X_train, X_test, y_train, y_test = train_test_split(
    data["text"], data["label"], test_size=0.2, random_state=42, stratify=data["label"]
)

# Check dataset sizes before undersampling
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

# Undersample the majority class using RandomUnderSampler.
# Only the training split is resampled; the test split keeps its original
# class distribution.
undersampler = RandomUnderSampler(sampling_strategy="auto", random_state=42)

# fit_resample expects a 2-D feature matrix, so the text column is reshaped
# into a single-column array before resampling.
X_train_resampled, y_train_resampled = undersampler.fit_resample(
    X_train.values.reshape(-1, 1), y_train
)

# Convert the resampled arrays back into Series (TextDataset indexes with .iloc)
X_train_resampled = pd.Series(X_train_resampled.flatten())
y_train_resampled = pd.Series(y_train_resampled)

# Check the size and class balance of the undersampled training set
print(f"Training set size after undersampling: {len(X_train_resampled)}")
print(f"Resampled class distribution: {y_train_resampled.value_counts()}")

# Dataset Class
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Indexable collection of texts. A pandas Series is indexed
            positionally (``.iloc``); any other sequence (list, tuple, NumPy
            array) is indexed with ``[]``.
        labels: Integer class ids aligned position-by-position with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding="max_length",
            max_length=max_len, return_tensors="pt")`` that returns a mapping
            with ``"input_ids"`` and ``"attention_mask"`` tensors carrying a
            leading batch dimension of 1.
        max_len: Length every example is truncated/padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    @staticmethod
    def _item_at(values, idx):
        """Return the element at position ``idx`` for pandas and plain sequences alike."""
        # Series may carry a shuffled, non-contiguous index after the
        # train/test split, so position-based access must go through .iloc.
        if hasattr(values, "iloc"):
            return values.iloc[idx]
        return values[idx]

    def __getitem__(self, idx):
        text = str(self._item_at(self.texts, idx))
        # int() rejects NaN (e.g. from an unmapped label) instead of letting
        # torch silently cast it to an arbitrary integer.
        label = int(self._item_at(self.labels, idx))
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),  # Remove the batch dimension
            "attention_mask": encoding["attention_mask"].squeeze(0),  # Remove the batch dimension
            "labels": torch.tensor(label, dtype=torch.long),
        }

# Shared settings for the tokenizer, both datasets and the model
CHECKPOINT = "distilbert-base-uncased"
MAX_LEN = 128

# Tokenizer, then one dataset per split (undersampled train, untouched test)
tokenizer = DistilBertTokenizer.from_pretrained(CHECKPOINT)
train_dataset = TextDataset(
    X_train_resampled,
    y_train_resampled,
    tokenizer,
    max_len=MAX_LEN,
)
test_dataset = TextDataset(
    X_test,
    y_test,
    tokenizer,
    max_len=MAX_LEN,
)

# Sanity check: show the first encoded training example
first_example = train_dataset[0]
print("Sample from train_dataset:", first_example)

# Two-class classification head on top of the pretrained DistilBERT encoder
model = DistilBertForSequenceClassification.from_pretrained(
    CHECKPOINT,
    num_labels=2,
)

# Collator used by the Trainer to assemble batches
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training Arguments
# Mixed precision is only requested when a CUDA device is present; hard-coding
# fp16=True makes the script fail on CPU-only machines.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version before upgrading.
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # Save cadence matches the evaluation cadence (both "epoch"), as
    # load_best_model_at_end requires.
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_dir="./logs",
    save_total_limit=2,
    fp16=use_fp16,
)

# Metrics
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class for each
            row is the argmax over the last axis of ``logits``.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``
        (precision/recall/F1 use ``average="binary"``).
    """
    raw_scores, gold = eval_pred
    # Highest-scoring class per row, handed back to scikit-learn as a NumPy array
    predicted = torch.tensor(raw_scores).argmax(dim=-1).numpy()
    prec, rec, f_score, _support = precision_recall_fscore_support(
        gold, predicted, average="binary"
    )
    return {
        "accuracy": accuracy_score(gold, predicted),
        "precision": prec,
        "recall": rec,
        "f1": f_score,
    }

# Trainer: bundle the model, hyper-parameters, both datasets, the batch
# collator and the metric callback
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": test_dataset,
    "data_collator": data_collator,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning; as the last expression of the cell, its return value is displayed
trainer.train()


Original dataset size: 333694
Train size: 266955, Test size: 66739
Training set size after undersampling: 120878
Resampled class distribution: label
0    60439
1    60439
Name: count, dtype: int64
Sample from train_dataset: {'input_ids': tensor([  101,  3398,  2049,  2183,  2000,  2022, 12476,   999,  1045,  2228,
         2008,  1996,  2117,  2746,  2003, 23454,  2130,  2062,  2138,  2167,
         4420,  1998,  2148,  4420,  2024,  2085,  2012,  3521,  3574,  2008,
         2574,  2057,  2453,  2031,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,    

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0014,0.000272,0.999955,0.999801,1.0,0.999901
2,0.0007,0.000771,0.999895,0.999537,1.0,0.999768
3,0.0,0.000896,0.999895,0.999603,0.999934,0.999768


TrainOutput(global_step=22665, training_loss=0.001881267974590964, metrics={'train_runtime': 2746.3627, 'train_samples_per_second': 132.042, 'train_steps_per_second': 8.253, 'total_flos': 1.2009295661054976e+16, 'train_loss': 0.001881267974590964, 'epoch': 3.0})

In [13]:
# Score the trained model on the eval_dataset given to the Trainer (the test split)
metrics = trainer.evaluate()
print("Evaluation Metrics:", metrics)

# Persist model weights and tokenizer files side by side so both can be
# reloaded from the same directory
save_dir = "./distilbert_classifier_new"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"Model saved to {save_dir}")

Evaluation Metrics: {'eval_loss': 0.0002716244198381901, 'eval_accuracy': 0.9999550487720823, 'eval_precision': 0.9998014954013101, 'eval_recall': 1.0, 'eval_f1': 0.9999007378486583, 'eval_runtime': 152.9597, 'eval_samples_per_second': 436.317, 'eval_steps_per_second': 27.275, 'epoch': 3.0}
Model saved to ./distilbert_classifier_new


In [14]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.metrics import accuracy_score

# Load the fine-tuned model and tokenizer
model = DistilBertForSequenceClassification.from_pretrained('./distilbert_classifier_new')
tokenizer = DistilBertTokenizer.from_pretrained('./distilbert_classifier_new')

# Set model to evaluation mode
model.eval()

# Human-readable names for the two class ids (0 = "chit-chat", 1 = "topic-specific")
id2label = {0: "chit-chat", 1: "topic-specific"}

# Sample sentences for evaluation (3-5 samples)
sample_texts = [
    "What's your favorite hobby?",
    "Tell me a joke about a chicken crossing the road.",
    "What is the impact of artificial intelligence on jobs and automation?",
    "How does the stock market work?",
    "Can you tell me more about the Deepwater Horizon oil spill?"
]

# Manually set the expected labels for the samples above.
# These are just examples; update them based on your actual data labels.
expected_labels = [0, 0, 1, 1, 1]  # Example true labels for the above samples

# Guard against editing one list without the other
if len(expected_labels) != len(sample_texts):
    raise ValueError(
        f"expected_labels has {len(expected_labels)} entries "
        f"but sample_texts has {len(sample_texts)}"
    )

# Tokenize the inputs
inputs = tokenizer(sample_texts, padding=True, truncation=True, return_tensors="pt", max_length=128)

# Perform prediction (no gradients needed for inference)
with torch.no_grad():
    outputs = model(**inputs)

# Get predictions: highest-scoring class per sample
predictions = torch.argmax(outputs.logits, dim=-1).numpy()

# Calculate accuracy for this small test set
accuracy = accuracy_score(expected_labels, predictions)

# NOTE(review): in the recorded run every sample was predicted as chit-chat
# (2/5 correct) despite near-perfect metrics on the held-out split; this may
# indicate a mismatch between the training texts and short hand-written
# prompts - unverified, worth investigating.

# Print the results for each sample and the overall accuracy
for idx, (text, predicted, expected) in enumerate(
    zip(sample_texts, predictions, expected_labels), start=1
):
    print(f"Sample {idx}: {text}")
    print(f"Predicted Label: {id2label[int(predicted)]}")
    print(f"True Label: {id2label[int(expected)]}")
    print("-" * 50)

# Report the real sample count instead of a hard-coded "5"
print(f"Overall accuracy on {len(sample_texts)} samples: {accuracy * 100:.2f}%")


Sample 1: What's your favorite hobby?
Predicted Label: chit-chat
True Label: chit-chat
--------------------------------------------------
Sample 2: Tell me a joke about a chicken crossing the road.
Predicted Label: chit-chat
True Label: chit-chat
--------------------------------------------------
Sample 3: What is the impact of artificial intelligence on jobs and automation?
Predicted Label: chit-chat
True Label: topic-specific
--------------------------------------------------
Sample 4: How does the stock market work?
Predicted Label: chit-chat
True Label: topic-specific
--------------------------------------------------
Sample 5: Can you tell me more about the Deepwater Horizon oil spill?
Predicted Label: chit-chat
True Label: topic-specific
--------------------------------------------------
Overall accuracy on 5 samples: 40.00%
