<a href="https://colab.research.google.com/github/RyuichiSaito1/inflation-reddit-usa/blob/main/notebooks/deberta_large_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import auth, drive

# Make Google Drive available under /content/drive, then run the Colab
# user-authentication flow for this session.
drive.mount('/content/drive')
auth.authenticate_user()

In [None]:
# Replace whatever transformers build is present with the pinned 4.27.0 release.
!pip uninstall -y transformers
!pip install transformers==4.27.0
# Remaining libraries used by this notebook (versions are not pinned here).
!pip install datasets scikit-learn matplotlib
# NOTE(review): fsspec is pinned to 2024.10.0, presumably to stay compatible
# with `datasets` — confirm the reason for this pin.
!pip install fsspec==2024.10.0

# Fine-tuning on 1,040 samples (training + validation)

In [None]:

import torch
from transformers import DebertaV2ForSequenceClassification, DebertaV2Tokenizer, Trainer, TrainingArguments, TrainerCallback
from datasets import Dataset
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import matplotlib.pyplot as plt
import numpy as np
import time
import pandas as pd
from sklearn.model_selection import train_test_split

class TimeTrackerCallback(TrainerCallback):
    """Trainer callback that prints how long each training epoch took.

    ``start_time`` holds the ``time.perf_counter()`` reading taken when the
    current epoch began, or ``None`` if no epoch has started yet.
    """

    def __init__(self):
        super().__init__()
        # Initialised up front so on_epoch_end never hits a missing attribute.
        self.start_time = None

    def on_epoch_begin(self, args, state, control, model=None, **kwargs):
        """Remember when the epoch started."""
        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments during a long epoch.
        self.start_time = time.perf_counter()

    def on_epoch_end(self, args, state, control, model=None, **kwargs):
        """Print the elapsed time for the epoch that just finished."""
        if self.start_time is None:
            # No matching on_epoch_begin was seen; nothing meaningful to report.
            return
        elapsed_time = time.perf_counter() - self.start_time
        print(f"Epoch {state.epoch} training time: {elapsed_time:.2f} seconds")

# Read the labelled Reddit data from the single CSV stored on Google Drive.
df = pd.read_csv(
    '/content/drive/MyDrive/world-inflation/data/reddit/production/main-prod-1040.csv',
    sep=',',
)

# Hold out 25% of the rows for validation; the fixed random_state keeps the
# split reproducible between runs.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.25)

# Wrap each pandas frame in a HuggingFace Dataset.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df)
)

# Load the tokenizer belonging to the DeBERTa-v3-large checkpoint.
tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-large')

# 3. Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of examples and attach their labels.

    Reads the text from ``examples['body']`` and the class label from
    ``examples['inflation']``. Every sequence is truncated or padded to
    exactly 512 tokens. Uses the module-level ``tokenizer``.

    Returns the tokenizer output with an added ``labels`` entry.
    """
    encoded = tokenizer(
        examples['body'],
        max_length=512,
        truncation=True,
        padding="max_length",
    )
    encoded['labels'] = examples['inflation']
    return encoded

# Encode both splits in batches. All original columns are dropped, so only the
# tokenizer outputs and `labels` remain.
tokenized_train, tokenized_val = (
    split.map(
        tokenize_function,
        batched=True,
        remove_columns=split.column_names,
    )
    for split in (train_dataset, val_dataset)
)

# 4. Define the evaluation metrics (Accuracy, Precision, Recall, F1)
def compute_metrics(p):
    """Compute accuracy and macro-averaged precision/recall/F1 for an evaluation pass.

    Args:
        p: A ``(predictions, labels)`` pair as passed by the Trainer.
            ``predictions`` holds the per-class scores (or a tuple whose first
            element does); ``labels`` holds the true class ids.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    predictions, labels = p
    # Some models return several outputs; the logits are the first element.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    preds = np.argmax(predictions, axis=-1)
    # zero_division=0 yields the same scores as sklearn's default but without an
    # UndefinedMetricWarning each time a class is never predicted, which is
    # common on small validation sets.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    accuracy = accuracy_score(labels, preds)
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Training configuration for the 1,040-sample run.
import transformers

# Show the installed transformers version to help with debugging.
print(f"Transformers version: {transformers.__version__}")

training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="/content/drive/MyDrive/world-inflation/data/model/deberta-large-fine-tuning-1040",
    logging_dir="/content/drive/MyDrive/world-inflation/data/model/deberta-large-fine-tuning-1040/logs",
    run_name="deberta-inflation-1040",
    report_to="none",
    logging_steps=20,

    # Evaluate and checkpoint once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,

    # Reload the checkpoint with the highest validation accuracy when training ends
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,

    # Optimisation hyper-parameters
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    learning_rate=2e-5,
    lr_scheduler_type='linear',
    weight_decay=0.01,
    # warmup_ratio=0.1,  # currently disabled

    # Reproducibility
    seed=42,
)

# Put a 3-class classification head on the pretrained checkpoint and wire the
# model, data, metrics and timing callback into a Trainer.
model = DebertaV2ForSequenceClassification.from_pretrained(
    'microsoft/deberta-v3-large',
    num_labels=3,
)
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,  # held-out validation split
    compute_metrics=compute_metrics,
    callbacks=[TimeTrackerCallback()],  # must be an instance of the callback class
)

In [None]:
# 7. Start the training process
trainer.train()

# 8. Collect the logged losses together with the global step at which each one
# was recorded. Training loss is logged every `logging_steps` steps while
# validation loss is logged once per epoch, so the two series have different
# lengths and must not share an implicit 0..N index.
logs = trainer.state.log_history
train_steps = [log['step'] for log in logs if 'loss' in log]
train_losses = [log['loss'] for log in logs if 'loss' in log]
eval_steps = [log['step'] for log in logs if 'eval_loss' in log]
eval_losses = [log['eval_loss'] for log in logs if 'eval_loss' in log]

# 9. Plot both curves against the actual training step
plt.figure(figsize=(10, 6))
plt.plot(train_steps, train_losses, label='Train Loss', marker='o')
plt.plot(eval_steps, eval_losses, label='Validation Loss', marker='x')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.title('Learning Curve')
plt.legend()
plt.show()

In [None]:
# 7. Start the training process.
# This cell repeats the training step, so only train when this trainer has not
# run yet. Otherwise a top-to-bottom run would fine-tune the already trained
# model a second time.
if trainer.state.global_step == 0:
    trainer.train()

# 8. Collect the logged losses together with the global step at which each one
# was recorded. Training loss is logged every `logging_steps` steps while
# validation loss is logged once per epoch, so the two series have different
# lengths and must not share an implicit 0..N index.
logs = trainer.state.log_history
train_steps = [log['step'] for log in logs if 'loss' in log]
train_losses = [log['loss'] for log in logs if 'loss' in log]
eval_steps = [log['step'] for log in logs if 'eval_loss' in log]
eval_losses = [log['eval_loss'] for log in logs if 'eval_loss' in log]

# 9. Plot both curves against the actual training step
plt.figure(figsize=(10, 6))
plt.plot(train_steps, train_losses, label='Train Loss', marker='o')
plt.plot(eval_steps, eval_losses, label='Validation Loss', marker='x')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.title('Learning Curve')
plt.legend()
plt.show()

# Fine-tuning on 65 samples (training + validation)

In [None]:

import torch
from transformers import DebertaV2ForSequenceClassification, DebertaV2Tokenizer, Trainer, TrainingArguments, TrainerCallback
from datasets import Dataset
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import matplotlib.pyplot as plt
import numpy as np
import time
import pandas as pd
from sklearn.model_selection import train_test_split

class TimeTrackerCallback(TrainerCallback):
    """Trainer callback that prints how long each training epoch took.

    ``start_time`` holds the ``time.perf_counter()`` reading taken when the
    current epoch began, or ``None`` if no epoch has started yet.
    """

    def __init__(self):
        super().__init__()
        # Initialised up front so on_epoch_end never hits a missing attribute.
        self.start_time = None

    def on_epoch_begin(self, args, state, control, model=None, **kwargs):
        """Remember when the epoch started."""
        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments during a long epoch.
        self.start_time = time.perf_counter()

    def on_epoch_end(self, args, state, control, model=None, **kwargs):
        """Print the elapsed time for the epoch that just finished."""
        if self.start_time is None:
            # No matching on_epoch_begin was seen; nothing meaningful to report.
            return
        elapsed_time = time.perf_counter() - self.start_time
        print(f"Epoch {state.epoch} training time: {elapsed_time:.2f} seconds")

# Read the labelled Reddit data from the single CSV stored on Google Drive.
df = pd.read_csv(
    '/content/drive/MyDrive/world-inflation/data/reddit/production/main-prod-65.csv',
    sep=',',
)

# Hold out 25% of the rows for validation; the fixed random_state keeps the
# split reproducible between runs.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.25)

# Wrap each pandas frame in a HuggingFace Dataset.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df)
)

# Load the tokenizer belonging to the DeBERTa-v3-large checkpoint.
tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-large')

# 3. Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of examples and attach their labels.

    Reads the text from ``examples['body']`` and the class label from
    ``examples['inflation']``. Every sequence is truncated or padded to
    exactly 512 tokens. Uses the module-level ``tokenizer``.

    Returns the tokenizer output with an added ``labels`` entry.
    """
    encoded = tokenizer(
        examples['body'],
        max_length=512,
        truncation=True,
        padding="max_length",
    )
    encoded['labels'] = examples['inflation']
    return encoded

# Encode both splits in batches. All original columns are dropped, so only the
# tokenizer outputs and `labels` remain.
tokenized_train, tokenized_val = (
    split.map(
        tokenize_function,
        batched=True,
        remove_columns=split.column_names,
    )
    for split in (train_dataset, val_dataset)
)

# 4. Define the evaluation metrics (Accuracy, Precision, Recall, F1)
def compute_metrics(p):
    """Compute accuracy and macro-averaged precision/recall/F1 for an evaluation pass.

    Args:
        p: A ``(predictions, labels)`` pair as passed by the Trainer.
            ``predictions`` holds the per-class scores (or a tuple whose first
            element does); ``labels`` holds the true class ids.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    predictions, labels = p
    # Some models return several outputs; the logits are the first element.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    preds = np.argmax(predictions, axis=-1)
    # zero_division=0 yields the same scores as sklearn's default but without an
    # UndefinedMetricWarning each time a class is never predicted, which is
    # common on small validation sets.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    accuracy = accuracy_score(labels, preds)
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Training configuration for the 65-sample run.
import transformers

# Show the installed transformers version to help with debugging.
print(f"Transformers version: {transformers.__version__}")

training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="/content/drive/MyDrive/world-inflation/data/model/deberta-large-fine-tuning-65",
    logging_dir="/content/drive/MyDrive/world-inflation/data/model/deberta-large-fine-tuning-65/logs",
    run_name="deberta-inflation-65",
    report_to="none",
    logging_steps=10,

    # Evaluate and checkpoint once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,

    # Reload the checkpoint with the lowest validation loss when training ends
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # Optimisation hyper-parameters
    num_train_epochs=8,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=2e-5,
    lr_scheduler_type='linear',
    weight_decay=0.01,
    # warmup_ratio=0.1,  # currently disabled

    # Reproducibility
    seed=42,
)

# Put a 3-class classification head on the pretrained checkpoint and wire the
# model, data, metrics and timing callback into a Trainer.
model = DebertaV2ForSequenceClassification.from_pretrained(
    'microsoft/deberta-v3-large',
    num_labels=3,
)
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,  # held-out validation split
    compute_metrics=compute_metrics,
    callbacks=[TimeTrackerCallback()],  # must be an instance of the callback class
)

In [None]:
# 7. Start the training process
trainer.train()

# 8. Collect the logged losses together with the global step at which each one
# was recorded. Training loss is logged every `logging_steps` steps while
# validation loss is logged once per epoch, so the two series have different
# lengths and must not share an implicit 0..N index.
logs = trainer.state.log_history
train_steps = [log['step'] for log in logs if 'loss' in log]
train_losses = [log['loss'] for log in logs if 'loss' in log]
eval_steps = [log['step'] for log in logs if 'eval_loss' in log]
eval_losses = [log['eval_loss'] for log in logs if 'eval_loss' in log]

# 9. Plot both curves against the actual training step
plt.figure(figsize=(10, 6))
plt.plot(train_steps, train_losses, label='Train Loss', marker='o')
plt.plot(eval_steps, eval_losses, label='Validation Loss', marker='x')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.title('Learning Curve')
plt.legend()
plt.show()