In [None]:
import shutil

import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast
from transformers import TrainingArguments, Trainer

In [None]:
# Load the labelled sentences and turn the text sentiment into an integer label.
df = pd.read_csv("/combined_sentiment_data.csv")
sentiment_mapping = {'negative': 0, 'positive': 1}
df['label'] = df['sentiment'].map(sentiment_mapping)

# Series.map yields NaN for every value that is not a key of the mapping, which
# would silently turn `label` into a float column and only fail much later inside
# the training loop. Fail here instead, naming the offending values.
unmapped_rows = df['label'].isna()
if unmapped_rows.any():
    unexpected_values = sorted(df.loc[unmapped_rows, 'sentiment'].astype(str).unique())
    raise ValueError(
        f"{int(unmapped_rows.sum())} row(s) have a 'sentiment' value outside "
        f"{sorted(sentiment_mapping)}: {unexpected_values}"
    )
# Every row is mapped at this point, so the column can be pinned to an integer dtype.
df['label'] = df['label'].astype('int64')
dataset = Dataset.from_pandas(df)

# Split the dataset into training and testing sets (80% train, 20% test)
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']
print(f"Train samples: {len(train_dataset)}, Evaluation samples: {len(eval_dataset)}")

In [None]:
MODEL_NAME = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)

# Store the class names in the model config so the saved model reports
# "negative"/"positive" rather than the generic LABEL_0/LABEL_1.
# The ids must stay in sync with the sentiment -> label mapping used to build
# the `label` column (negative -> 0, positive -> 1).
ID2LABEL = {0: "negative", 1: "positive"}
LABEL2ID = {name: idx for idx, name in ID2LABEL.items()}

model = DistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(ID2LABEL),
    id2label=ID2LABEL,
    label2id=LABEL2ID,
)

In [None]:
def tokenize_function(examples):
    """Tokenize the 'sentence' field of `examples` with the module-level tokenizer.

    Sequences are truncated and padded to a fixed length of 128 tokens.

    Args:
        examples: Mapping with a 'sentence' entry holding the text to encode.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    sentences = examples['sentence']
    return tokenizer(
        sentences,
        max_length=128,
        padding='max_length',
        truncation=True,
    )

In [None]:
# Only these columns are kept on the tokenized datasets.
columns_to_keep = ["input_ids", "attention_mask", "label"]

# Tokenize each split in batches, then drop every column not listed above.
tokenized_train_dataset = (
    train_dataset
    .map(tokenize_function, batched=True)
    .select_columns(columns_to_keep)
)
tokenized_eval_dataset = (
    eval_dataset
    .map(tokenize_function, batched=True)
    .select_columns(columns_to_keep)
)
print(f"Final Training Dataset Columns: {tokenized_train_dataset.column_names}")

In [None]:
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for binary predictions.

    Args:
        pred: Object exposing `predictions` (per-class scores, one row per
            example) and `label_ids` (the reference labels).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    # Pick the top-scoring class per example; the scores need not be
    # probabilities for argmax to select the same class.
    preds = np.argmax(pred.predictions, axis=1)
    # zero_division=0 keeps the returned values the same as the default (0.0)
    # when a class is never predicted, but without emitting a warning on every
    # evaluation pass.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
import os

# Switch off Weights & Biases through the environment; report_to="none" below
# likewise requests no reporting integration.
os.environ["WANDB_DISABLED"] = "true"

# Hyper-parameters and bookkeeping for the fine-tuning run, grouped by purpose.
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=3e-5,
    warmup_steps=500,
    weight_decay=0.05,
    # Batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Logging, evaluation and checkpoint cadence
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    report_to="none",
    # Best-checkpoint selection is driven by eval_loss
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
)

# Wire the model, the tokenized splits and the metric function into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
)

# Run fine-tuning; the call is left as the last expression so its return value
# is shown as the cell output.
print("\nStarting fine-tuning...")
trainer.train()

In [None]:
# Run the trainer's evaluation pass and print the metrics it returns.
# No dataset is passed, so the trainer uses the evaluation set it was built with.
print("Evaluating model...")
results = trainer.evaluate()
print(f"Final Evaluation Metrics: {results}")

In [None]:
SAVE_DIRECTORY = "./sentiment_model_assets"

# Write the fine-tuned model first, then the tokenizer, into the same folder.
for artifact in (trainer.model, tokenizer):
    artifact.save_pretrained(SAVE_DIRECTORY)

print(f"DistilBERT Model and Tokenizer saved to the folder: {SAVE_DIRECTORY}")

In [None]:
FOLDER_NAME = "sentiment_model_assets"
ZIP_NAME = "distilbert_sentiment_model.zip"

# Build the archive with the standard library rather than shelling out to `zip`:
#   * no dependency on an external binary being installed,
#   * a missing folder raises, so the success message below is only printed
#     when the archive was actually written,
#   * the archive is rewritten from scratch, so re-running the cell cannot leave
#     stale entries from an earlier run inside it.
# Entries are stored under "sentiment_model_assets/", matching `zip -r`.
shutil.make_archive(
    ZIP_NAME[:-len(".zip")],  # make_archive appends the ".zip" extension itself
    "zip",
    root_dir=".",
    base_dir=FOLDER_NAME,
)
print(f"Folder successfully zipped as {ZIP_NAME}")

In [None]:
# Colab-specific step: the import below is only expected to resolve inside a
# Google Colab runtime, which is why it lives in this cell rather than with the
# other imports.
from google.colab import files
# Hand the zipped model archive to Colab's download helper.
files.download(ZIP_NAME)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>