In [None]:
%pip install -q transformers datasets scikit-learn torch

Collecting sympy==1.13.1 (from torch)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Downloading sympy-1.13.1-py3-none-any.whl (6.2 MB)
   ---------------------------------------- 0.0/6.2 MB ? eta -:--:--
   ---------------------------------------- 6.2/6.2 MB 54.4 MB/s eta 0:00:00
Installing collected packages: sympy
  Attempting uninstall: sympy
    Found existing installation: sympy 1.14.0
    Uninstalling sympy-1.14.0:
      Successfully uninstalled sympy-1.14.0
Successfully installed sympy-1.13.1
Note: you may need to restart the kernel to use updated packages.


In [None]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import pandas as pd
import numpy as np
import torch
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pre-split train / validation / test CSV files
train_df = pd.read_csv('../../../data/train_data.csv')
val_df = pd.read_csv('../../../data/val_data.csv')
test_df = pd.read_csv('../../../data/test_data.csv')

# Map 'Analysis' column (Negative, Neutral, Positive) to numeric labels
label_mapping = {'Negative': 0, 'Neutral': 1, 'Positive': 2}

for split_name, split_df in (('train', train_df), ('val', val_df), ('test', test_df)):
    mapped_labels = split_df['Analysis'].map(label_mapping)
    # Series.map yields NaN for any value missing from label_mapping, which would
    # silently turn the whole label column into floats. Fail here, with the
    # offending values, instead of much later inside the training loop.
    unmapped = split_df.loc[mapped_labels.isna(), 'Analysis']
    if not unmapped.empty:
        raise ValueError(
            f"{split_name} split has {len(unmapped)} row(s) whose 'Analysis' value "
            f"is not in label_mapping: {unmapped.unique().tolist()}"
        )
    split_df['label'] = mapped_labels.astype('int64')


In [3]:
# Ensure text is string and no NaN.
# Fill missing values BEFORE casting: astype(str) converts NaN into the literal
# string "nan", after which fillna("") has nothing left to replace.
for split_df in (train_df, val_df, test_df):
    split_df['text'] = split_df['text'].fillna("").astype(str)

In [4]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')


def tokenize_function(examples):
    """Tokenize the 'text' entries of a batch.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        examples: mapping with a 'text' entry, as passed by ``Dataset.map``.

    Returns:
        The tokenizer output for ``examples['text']``.
    """
    texts = examples['text']
    encoded = tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=128,
    )
    return encoded

# Convert DataFrames to HuggingFace Dataset
from datasets import Dataset


def build_torch_dataset(split_df):
    """Turn one DataFrame split into a tokenized Dataset in torch format.

    Steps: wrap the frame in a Dataset, tokenize it in batches with
    ``tokenize_function``, then expose only the 'input_ids', 'attention_mask'
    and 'label' columns as torch values.
    """
    tokenized = Dataset.from_pandas(split_df).map(tokenize_function, batched=True)
    # set_format works in place and returns None, so it cannot be chained
    tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    return tokenized


train_dataset = build_torch_dataset(train_df)
val_dataset = build_torch_dataset(val_df)
test_dataset = build_torch_dataset(test_df)


Map: 100%|██████████| 9040/9040 [00:01<00:00, 6886.44 examples/s]
Map: 100%|██████████| 1937/1937 [00:00<00:00, 7199.38 examples/s]
Map: 100%|██████████| 1938/1938 [00:00<00:00, 7204.80 examples/s]


In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Load the pretrained checkpoint with a 3-class classification head and place it
# on the selected device
DistilBert_model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=3,
).to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class is the argmax
            of the logits over the last axis.

    Returns:
        dict with the keys 'accuracy' and 'f1_macro' (plain Python floats).
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Pin the label set to the three classes (0=Negative, 1=Neutral, 2=Positive).
    # classification_report with three target_names but no `labels=` raises a
    # ValueError whenever labels and predictions together contain fewer than three
    # classes; a class that never occurs now simply scores 0.
    _, _, f1_macro, _ = precision_recall_fscore_support(
        labels,
        predictions,
        labels=[0, 1, 2],
        average='macro',
        zero_division=0,
    )
    return {
        'accuracy': float(accuracy_score(labels, predictions)),
        'f1_macro': float(f1_macro)
    }

training_args = TrainingArguments(
    # Where checkpoints are written
    output_dir="./results",
    # Optimisation settings
    num_train_epochs=3,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch; both strategies are set to the
    # same value alongside load_best_model_at_end
    eval_strategy="epoch",
    save_strategy="epoch",
    # Checkpoint selection is driven by the 'f1_macro' value from compute_metrics
    metric_for_best_model="f1_macro",
    load_best_model_at_end=True,
)

# Bundle the model, hyper-parameters, train/validation splits and the metric
# callback into a single Trainer
trainer = Trainer(
    args=training_args,
    model=DistilBert_model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Start the clock for recording TRAINING time (this cell does not run inference).
# perf_counter() is a monotonic clock meant for measuring elapsed time, so the
# result is not affected by system clock adjustments during the run.
start = time.perf_counter()

# Train the model; `history` is the value returned by trainer.train()
history = trainer.train()

end = time.perf_counter()
training_time = end - start  # elapsed seconds

Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.5998,0.371406,0.858028,0.84394
2,0.3038,0.321029,0.889004,0.879001
3,0.1875,0.383414,0.894166,0.883293


TrainOutput(global_step=1695, training_loss=0.3395059006052383, metrics={'train_runtime': 1984.3829, 'train_samples_per_second': 13.667, 'train_steps_per_second': 0.854, 'total_flos': 898144979742720.0, 'train_loss': 0.3395059006052383, 'epoch': 3.0})

In [None]:
# Write the measured training duration to a one-row CSV
time_df = pd.DataFrame([{"distilbert_TrainingTime (s)": training_time}])
time_df.to_csv("computation_time_distilbert.csv", index=False)

# Report the duration and confirm the file was written
print("Training time: ", training_time, "seconds")
print("Training time is saved")

In [9]:
# Save the model and the tokenizer files into the same directory
model_dir = "./saved_model_distilbert"
trainer.save_model(model_dir)
tokenizer.save_pretrained(model_dir)

('./saved_model_distilbert\\tokenizer_config.json',
 './saved_model_distilbert\\special_tokens_map.json',
 './saved_model_distilbert\\vocab.txt',
 './saved_model_distilbert\\added_tokens.json',
 './saved_model_distilbert\\tokenizer.json')

In [None]:
from matplotlib import pyplot as plt

# trainer.train() returns a TrainOutput named tuple, which has no Keras-style
# `.history` dict, so `history.history[...]` raises AttributeError. The values
# recorded during training are in trainer.state.log_history (a list of dicts);
# evaluation entries carry the compute_metrics keys prefixed with "eval_".
eval_logs = [entry for entry in trainer.state.log_history if 'eval_accuracy' in entry]

# Accuracy (compute_metrics only runs on the validation set, so there is no
# train-accuracy curve to draw)
fig, ax = plt.subplots()
ax.plot(
    [entry['epoch'] for entry in eval_logs],
    [entry['eval_accuracy'] for entry in eval_logs],
    marker='o',
    label='Val Accuracy',
)
ax.set_title("Accuracy Over Epochs")
ax.set_xlabel("Epochs")
ax.set_ylabel("Accuracy")
ax.legend()
fig.savefig("accuracy_epochs_distilbert.png")
plt.show()

In [None]:
# Loss
# trainer.train() returns a TrainOutput named tuple with no `.history` attribute,
# so the curves are read from trainer.state.log_history instead: training entries
# hold a 'loss' key, evaluation entries an 'eval_loss' key, and both record the
# 'epoch' at which they were logged.
train_logs = [entry for entry in trainer.state.log_history if 'loss' in entry]
eval_logs = [entry for entry in trainer.state.log_history if 'eval_loss' in entry]

fig, ax = plt.subplots()
ax.plot(
    [entry['epoch'] for entry in train_logs],
    [entry['loss'] for entry in train_logs],
    marker='o',
    label='Train Loss',
)
ax.plot(
    [entry['epoch'] for entry in eval_logs],
    [entry['eval_loss'] for entry in eval_logs],
    marker='o',
    label='Val Loss',
)
ax.set_title("Loss Over Epochs")
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.legend()
fig.tight_layout()
fig.savefig("loss_epochs_distilbert.png")
plt.show()