<a href="https://colab.research.google.com/github/botvoodoo/NLP/blob/main/nlp_raytune_hf" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install wandb
!pip install torch --upgrade
!pip install datasets evaluate
!pip install xformers
!pip install ray tune
!pip install kaggle --upgrade

In [None]:
# Put the Kaggle API token where the kaggle CLI looks for it (~/.kaggle).
!mkdir -p ~/.kaggle
# The token is expected to have been placed at this path beforehand.
!mv /content/sample_data/kaggle/kaggle.json ~/.kaggle/
# Make the token readable and writable by the owner only.
!chmod 600 ~/.kaggle/kaggle.json
# List datasets -- presumably a quick check that the credentials are accepted.
!kaggle datasets list

In [None]:
# Download the files of the "nlp-getting-started" Kaggle competition.
!kaggle competitions download -c nlp-getting-started

In [None]:
import os
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
import ray
from ray import tune
import pandas as pd
from datasets import Dataset, load_dataset, load_metric
from huggingface_hub import notebook_login
import wandb
import random
import ray

In [None]:
# Download the helper functions script (helper_functions.py) next to the notebook.

# Credit for helper functions: https://github.com/mrdbourke/tensorflow-deep-learning/blob/main/08_introduction_to_nlp_in_tensorflow.ipynb

!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

# Import a series of helper functions for the notebook.
# Of these, only unzip_data is called in the cells shown here.
from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys

In [None]:
# Extract the downloaded competition archive.
unzip_data("/content/nlp-getting-started.zip")

# Read the training CSV, keep only the tweet text and its target, and expose
# the target under the column name `label`.
df = (
    pd.read_csv("/content/train.csv")
    .drop(columns=['id', 'keyword', 'location'])
    .rename(columns={'text': 'text', 'target': 'label'})
)
df.head()

In [None]:
# Seed for the train/validation split, so every trial is scored on the same
# held-out rows.
random_seed = 13


def _load_checkpoint_state(checkpoint_dir):
    """Return the training state saved under ``checkpoint_dir``, or ``None``.

    ``None`` is returned both when no directory is given and when the
    directory does not contain a ``checkpoint`` file.
    """
    if not checkpoint_dir:
        return None
    checkpoint_path = os.path.join(checkpoint_dir, "checkpoint")
    if not os.path.exists(checkpoint_path):
        return None
    # Load onto CPU first; the model is moved to its device afterwards.
    return torch.load(checkpoint_path, map_location="cpu")


def _build_dataloaders(df, batch_size):
    """Split ``df`` 80/20, tokenize the ``text`` column and wrap both parts in DataLoaders.

    Returns a ``(trainloader, valloader)`` tuple whose batches expose
    ``input_ids``, ``attention_mask`` and ``label``.
    """
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=random_seed)

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    def tokenize_data(examples):
        return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

    tensor_columns = ['input_ids', 'attention_mask', 'label']

    # batched=True hands the tokenizer many rows per call instead of one.
    train_dataset = Dataset.from_pandas(train_df).map(tokenize_data, batched=True)
    val_dataset = Dataset.from_pandas(val_df).map(tokenize_data, batched=True)
    train_dataset.set_format('torch', columns=tensor_columns)
    val_dataset.set_format('torch', columns=tensor_columns)

    trainloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
    # Validation metrics do not depend on sample order, so no shuffling here.
    valloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2)
    return trainloader, valloader


def _evaluate(model, valloader, criterion, device):
    """Return ``(mean validation loss per batch, accuracy in percent)`` for ``model``."""
    model.eval()
    val_loss = 0.0
    val_steps = 0
    total = 0
    correct = 0
    with torch.no_grad():
        for batch in valloader:
            inputs = batch["input_ids"].to(device)
            attention_masks = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(inputs, attention_mask=attention_masks)
            val_loss += criterion(outputs.logits, labels).item()
            val_steps += 1

            predicted = outputs.logits.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return val_loss / val_steps, correct / total * 100


def train_model(config, checkpoint_dir=None):
    """Fine-tune ``bert-base-uncased`` as a two-label classifier for one Ray Tune trial.

    After every epoch the model is evaluated on the validation split, a
    checkpoint is written, and ``val_loss`` / ``val_accuracy`` are reported
    to Ray Tune.

    Args:
        config: Trial configuration. Keys read here:
            ``data_df`` (required): DataFrame with ``text`` and ``label`` columns.
            ``learning_rate`` (required): Adam learning rate.
            ``per_device_train_batch_size`` (default 16): batch size of both loaders.
            ``weight_decay`` (default 0.0): Adam weight decay.
            ``num_train_epochs`` (default 10): total number of epochs to train.
            Other keys (for example ``warmup_steps``, ``lr_scheduler_type``,
            ``dropout_rate``) may be present but are not consumed by this function.
        checkpoint_dir: Directory holding a ``checkpoint`` file written by a
            previous run of this function, or ``None`` to start from scratch.

    NOTE(review): the ``checkpoint_dir`` argument, ``tune.checkpoint_dir`` and
    keyword-style ``tune.report`` belong to Ray Tune's older function API --
    confirm they are available in the installed Ray release.
    """
    df = config["data_df"]
    learning_rate = config["learning_rate"]
    batch_size = int(config.get("per_device_train_batch_size", 16))
    weight_decay = config.get("weight_decay", 0.0)
    num_epochs = int(config.get("num_train_epochs", 10))

    checkpoint_state = _load_checkpoint_state(checkpoint_dir)
    # The stored epoch is the last one that completed, so resume with the next.
    start_epoch = 0 if checkpoint_state is None else checkpoint_state["epoch"] + 1

    trainloader, valloader = _build_dataloaders(df, batch_size)

    criterion = nn.CrossEntropyLoss()
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
    if checkpoint_state is not None:
        model.load_state_dict(checkpoint_state["model_state_dict"])

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    if checkpoint_state is not None:
        # Restore Adam's moment estimates so resumed training continues smoothly.
        optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"])

    for epoch in range(start_epoch, num_epochs):
        model.train()
        running_loss = 0.0
        window_steps = 0
        for i, batch in enumerate(trainloader):
            inputs = batch["input_ids"].to(device)
            attention_masks = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            optimizer.zero_grad()
            outputs = model(inputs, attention_mask=attention_masks, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            # Print the mean loss of each window of 2000 iterations.
            running_loss += loss.item()
            window_steps += 1
            if i % 2000 == 1999:
                print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, running_loss / window_steps))
                # Reset both accumulators so the next average covers only its own window.
                running_loss = 0.0
                window_steps = 0

        mean_val_loss, val_accuracy = _evaluate(model, valloader, criterion, device)

        # Save model and optimizer state at the end of every epoch.
        with tune.checkpoint_dir(epoch) as save_dir:
            torch.save({
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "epoch": epoch
            }, os.path.join(save_dir, "checkpoint"))

        # Report the epoch's validation metrics to Ray Tune.
        tune.report(val_loss=mean_val_loss, val_accuracy=val_accuracy)



# NOTE(review): this statement is at module level, so it prints as soon as the
# cell is executed (right after train_model is defined), not after a training run.
print("Finished Training")


In [None]:
# Start Ray, or reuse the running instance when the cell is re-executed.
ray.init(ignore_reinit_error=True)

# Hyperparameters exposed to train_model through `config`. Only the learning
# rate is searched; the remaining entries are fixed values.
config = {
    "data_df": df,
    "learning_rate": tune.loguniform(1e-5, 1e-1),
    "per_device_train_batch_size": 16,
    "weight_decay": 0.01,
    "num_train_epochs": 10,
    "warmup_steps": 0,
    "lr_scheduler_type": "linear",
    "dropout_rate": 0.1,
}


def stop_when_converged(trial_id, result):
    """Stop a trial once its validation loss has dropped to 0.01 or below.

    A dict such as ``stop={"val_loss": 0.01}`` stops a trial when the metric
    is greater than or equal to the threshold, which for a loss would end
    every trial after its first report.
    """
    return result["val_loss"] <= 0.01


analysis = tune.run(
    train_model,
    config=config,
    # Tell Tune which metric ranks trials so the analysis can pick a best one.
    metric="val_loss",
    mode="min",
    stop=stop_when_converged,
    # Only request a GPU when one exists; otherwise trials could never be scheduled.
    resources_per_trial={"cpu": 1, "gpu": 1 if torch.cuda.is_available() else 0},
    num_samples=10,
)
