### Load Data

In [1]:
import pandas as pd

dataset_loc = 'dataset.csv'

train_df = pd.read_csv(dataset_loc)
train_df.head()

Unnamed: 0,id,created_on,title,description,tag
0,6,2020-02-20 06:43:18,Comparison between YOLO and RCNN on real world...,Bringing theory to experiment is cool. We can ...,computer-vision
1,7,2020-02-20 06:47:21,"Show, Infer & Tell: Contextual Inference for C...",The beauty of the work lies in the way it arch...,computer-vision
2,9,2020-02-24 16:24:45,Awesome Graph Classification,"A collection of important graph embedding, cla...",other
3,15,2020-02-28 23:55:26,Awesome Monte Carlo Tree Search,A curated list of Monte Carlo tree search pape...,other
4,25,2020-03-07 23:04:31,AttentionWalk,"A PyTorch Implementation of ""Watch Your Step: ...",other


In [2]:
# Unique labels

tags = train_df.tag.unique().tolist()
tags

['computer-vision', 'other', 'natural-language-processing', 'mlops']

In [3]:
# Load inference dataset

holdout_dataset = 'holdout.csv'

test_df = pd.read_csv(holdout_dataset)
test_df.head()



Unnamed: 0,id,created_on,title,description,tag
0,19,2020-03-03 13:54:31,Diffusion to Vector,Reference implementation of Diffusion2Vec (Com...,other
1,26,2020-03-07 23:11:58,Graph Wavelet Neural Network,"A PyTorch implementation of ""Graph Wavelet Neu...",other
2,44,2020-03-08 00:32:58,Capsule Graph Neural Network,"A PyTorch implementation of ""Capsule Graph Neu...",other
3,80,2020-03-20 05:59:32,NeRF: Neural Radiance Fields,Representing scenes as neural radiance fields ...,computer-vision
4,84,2020-03-20 15:18:43,Mention Classifier,Category prediction model\r\nThis repo contain...,natural-language-processing


### Utilities


In [4]:
import json
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_theme()
from sklearn.metrics import precision_recall_fscore_support
import time
from tqdm import tqdm

In [8]:
! pip install -q openai

You should consider upgrading via the 'D:\MadeWithMl\venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [9]:
from openai import OpenAI

system_content = "you only answer in rhymes"  # system content (behavior)
assistant_content = ""  # assistant content (context)
user_content = "how are you"  # user content (message)

client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key="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",
)

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[        
        {"role": "system", "content": system_content},
        {"role": "assisstant" , "content": assistant_content},
        {"role": "user", "content": user_content},
    ],    
)
print(response['choices'][0].message.content)


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

### Create a function to predict tags

In [None]:
model = "gpt-3.5-turbo-0613"
system_context = f"""
    You are a NLP prediction service that predicts the label given an input's title and description.
    You must choose between one of the following labels for each input: {tags}.
    Only respond with the label name and nothing else.
    """
assistant_content = ""
user_context = "Transfer learning with transformers: Using transformers for transfer learning on text classification tasks."

def get_tag(model, system_content="", assistant_content="", user_content=""):
    try:

        client = OpenAI(
            # defaults to os.environ.get("OPENAI_API_KEY")
            api_key="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",
        )
        # Get response from OpenAI
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[        
                {"role": "system", "content": system_content},
                {"role": "assisstant" , "content": assistant_content},
                {"role": "user", "content": user_content},
        ],    
        )
        predicted_tag = response["choices"][0].message.content
        return predicted_tag

    except (OpenAI.error.ServiceUnavailableError, OpenAI.error.APIError) as e:
        return None
    
tag = get_tag(model=model, system_content=system_context, assistant_content=assistant_content, user_content=user_context)
print (tag)

### Create a function to predict tags for a list of inputs

In [None]:
samples = test_df[["title", "description"]].to_dict(orient="records")[:3]

def predict_tags(inputs, model, system_content='', assistant_content= ''):
    y_pred = []

    for item in tqdm(inputs):    
    # Convert item dict to string
        user_content = str(item)

        # Get prediction
        predicted_tag = get_tag(
            model=model, system_content=system_content,
            assistant_content=assistant_content, user_content=user_content)
    
    y_pred.append(predicted_tag)
    return y_pred

# Get predictions for a list of inputs
predict_tags(inputs=samples, model=model, system_content=system_context)


### Create a function to clean our predicted tags

In [18]:
def clean_tags(y_pred, tags, default = 'other'):
    for i, tag in enumerate(y_pred):

        if tag not in tags:
            y_pred[i] = default
        if tag.startswith("'") and tag.endswith("'"):
            y_pred[i] = tag[1:-1]
    
    return y_pred

In [19]:
def plot_tag_dist(y_true, y_pred):
    # Distribution of tags
    true_tag_freq = dict(Counter(y_true))
    pred_tag_freq = dict(Counter(y_pred))
    df_true = pd.DataFrame({"tag": list(true_tag_freq.keys()), "freq": list(true_tag_freq.values()), "source": "true"})
    df_pred = pd.DataFrame({"tag": list(pred_tag_freq.keys()), "freq": list(pred_tag_freq.values()), "source": "pred"})
    df = pd.concat([df_true, df_pred], ignore_index=True)

    # Plot
    plt.figure(figsize=(10, 3))
    plt.title("Tag distribution", fontsize=14)
    ax = sns.barplot(x="tag", y="freq", hue="source", data=df)
    ax.set_xticklabels(list(true_tag_freq.keys()), rotation=0, fontsize=8)
    plt.legend()
    plt.show()

### Create a function to combine all the utiitites above

In [22]:
def evaluate(test_df, model, system_content, tags, assistant_content=""):
    # Predictions
    y_test = test_df.tag.to_list()
    test_samples = test_df[["title", "description"]].to_dict(orient="records")
    y_pred = predict_tags(
        inputs=test_samples, model=model,
        system_content=system_content, assistant_content=assistant_content)
    y_pred = clean_tags(y_pred=y_pred, tags=tags)

    # Performance
    metrics = precision_recall_fscore_support(y_test, y_pred, average="weighted")
    performance = {"precision": metrics[0], "recall": metrics[1], "f1": metrics[2]}
    print(json.dumps(performance, indent=2))
    plot_tag_dist(y_true=y_test, y_pred=y_pred)
    return y_pred, performance

### Zero Shot learning

In [None]:
y_pred = {"zero_shot": {}, "few_shot": {}}
performance = {"zero_shot": {}, "few_shot": {}}

system_content = f"""
    You are a NLP prediction service that predicts the label given an input's title and description.
    You must choose between one of the following labels for each input: {tags}.
    Only respond with the label name and nothing else.
    """

# Zero-shot with GPT 3.5

method = "zero_shot"
model = "gpt-3.5-turbo-0613"
y_pred[method][model], performance[method][model] = evaluate(
    test_df=test_df, model=model, system_content=system_content, tags=tags)

### Few-Shot learning

In [None]:
# Create additional context with few samples from each class

num_samples = 2
additional_context = []
cols_to_keep = ["title", "description", "tag"]
for tag in tags:
    samples = train_df[cols_to_keep][train_df.tag == tag][:num_samples].to_dict(orient="records")
    additional_context.extend(samples)

assistant_content = f"""Here are some examples with the correct labels: {additional_context}"""

# Few-shot with GPT 3.5

method = "few_shot"
model = "gpt-3.5-turbo-0613"
y_pred[method][model], performance[method][model] = evaluate(
    test_df=test_df, model=model, system_content=system_content,
    assistant_content=assistant_content, tags=tags)

### Setup

In [5]:
import os
import random
import torch
from ray.data.preprocessor import Preprocessor
import numpy as np
import ray

In [6]:
def set_seed(seed=42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    random.seed(seed)
    eval("setattr(torch.backends.cudnn, 'deterministic', True)")
    eval("setattr(torch.backends.cudnn, 'benchmark', False)")
    os.environ["PYTHONHASHSEED"] = str(seed)

In [7]:
def load_data(num_samples=None):
    ds = ray.data.read_csv('dataset.csv')
    ds = ds.random_shuffle(seed=1234)
    ds = ray.data.from_items(ds.take(num_samples) if num_samples else ds)
    return ds

In [8]:
from madewithml.data import preprocess

class customPreprocessor(Preprocessor):
    def _fit(self, ds):
        tags = ds.unique(column='tag')
        self.class_to_index = {tag:i for i, tag in enumerate(tags)}
        self.index_to_class = {k:v for v, k in self.class_to_index.items()}
    
    def _transform_pandas(self, batch):
        return preprocess(batch, class_to_index=self.class_to_index)



### Model

In [9]:
from torch import nn
from transformers import BertModel

llm = BertModel.from_pretrained('allenai/scibert_scivocab_uncased', return_dict= False)
embedding_dim = llm.config.hidden_size

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased', returned_dict = False)

# Sample

text = "Transfer learning with transformers for text classification."
batch = tokenizer([text], return_tensors="np", padding="longest")
batch = {k:torch.tensor(v) for k,v in batch.items()}  # convert to torch tensors
seq, pool = llm(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])

In [11]:
class fineTunedLLM(nn.Module):
    def __init__(self, llm, num_classes, embedding_dim, drop_out):
        super(fineTunedLLM, self).__init__()
        self.llm = llm
        self.dropout = nn.Dropout(drop_out)
        # Use the length of num_classes as the number of output features
        self.fc1 = nn.Linear(embedding_dim, len(num_classes))
    
    def forward(self, batch):
        ids, mask = batch['ids'], batch['mask']
        seq, pool = self.llm(input_ids=ids, attention_mask=mask)
        z = self.dropout(pool)
        z = self.fc1(z)
        return z
    
    @torch.inference_mode()
    def predict(self, batch):
        self.eval()
        logits = self(batch)
        y_pred = torch.argmax(logits, dim=1).cpu().numpy()
        return y_pred
    
    @torch.inference_mode()
    def predict_proba(self, batch):
        self.eval()
        logits = self(batch)
        proba = torch.softmax(logits).cpu().numpy()
        return proba



In [12]:
# Initialize model

num_classes = train_df.tag.unique().tolist()
model = fineTunedLLM(llm=llm, drop_out=0.5, embedding_dim=embedding_dim, num_classes=num_classes)
print(model.named_parameters)


<bound method Module.named_parameters of fineTunedLLM(
  (llm): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Layer

### Batching

In [13]:
from ray.train.torch import get_device

def pad_array(arr, dtype = np.int32):
    max_len = max(len(row) for row in arr)
    pad_array = np.zeros((arr.shape[0], max_len), dtype=dtype)

    for i, row in enumerate(arr):
        pad_array[i][:len(row)] = row
        return pad_array

def collate_fn(batch):
    batch['ids'] = pad_array(batch['ids'])
    batch['masks'] = pad_array(batch['masks'])
    dtypes = {'ids': torch.int32, 'masks': torch.int32, 'targets': torch.int64}
    tensor_batch = {}

    for key, array in batch.items():
        tensor_batch[key] = torch.as_tensor(array, dtype=dtypes[key], device=get_device())
    return tensor_batch

In [14]:
from madewithml.data import stratify_split

ds = ray.data.read_csv('dataset.csv')
ds = ds.random_shuffle(seed=42)



TEST_SIZE = 0.2
train_ds, val_ds = stratify_split(ds, stratify='tag', test_size= TEST_SIZE)
# Mapping
tags = train_ds.unique(column='tag')
class_to_idx = {tag:i for i, tag in enumerate(tags)}

sample_ds = train_ds.map_batches(preprocess, fn_kwargs={"class_to_index": class_to_idx}, batch_format="pandas")

sample_batch = sample_ds.take_batch(batch_size=128)
collate_fn(sample_batch)


Unable to poll TPU GCE metadata: HTTPConnectionPool(host='metadata.google.internal', port=80): Max retries exceeded with url: /computeMetadata/v1/instance/attributes/accelerator-type (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001660287AD00>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Failed to detect number of TPUs: [WinError 3] The system cannot find the path specified: '/dev/vfio'


2023-11-13 14:09:24,835	INFO worker.py:1633 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m
2023-11-13 14:09:35,371	INFO read_api.py:406 -- To satisfy the requested parallelism of 8, each read task output is split into 8 smaller blocks.
2023-11-13 14:09:35,443	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(8)] -> AllToAllOperator[RandomShuffle] -> LimitOperator[limit=1]
2023-11-13 14:09:35,443	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-11-13 14:09:35,451	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- RandomShuffle 1:   0%|          | 0/64 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/64 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/64 [00:00<?, ?it/s]

Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

2023-11-13 14:09:41,939	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(8)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> LimitOperator[limit=1]
2023-11-13 14:09:41,947	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)
2023-11-13 14:09:41,947	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- RandomShuffle 1:   0%|          | 0/64 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/64 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/64 [00:00<?, ?it/s]

- Sort 4:   0%|          | 0/64 [00:00<?, ?it/s]

Sort Sample 5:   0%|          | 0/64 [00:00<?, ?it/s]

Shuffle Map 6:   0%|          | 0/64 [00:00<?, ?it/s]

Shuffle Reduce 7:   0%|          | 0/64 [00:00<?, ?it/s]

- MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle 8:   0%|          | 0/64 [00:00<?, ?it/s]

Shuffle Map 9:   0%|          | 0/64 [00:00<?, ?it/s]

Shuffle Reduce 10:   0%|          | 0/64 [00:00<?, ?it/s]

Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

Sort Sample 0:   0%|          | 0/8 [00:00<?, ?it/s]

2023-11-13 14:09:58,215	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(8)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> AllToAllOperator[Aggregate] -> TaskPoolMapOperator[MapBatches(<lambda>)]
2023-11-13 14:09:58,223	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)
2023-11-13 14:09:58,223	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- RandomShuffle 1:   0%|          | 0/64 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/64 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/64 [00:00<?, ?it/s]

- Sort 4:   0%|          | 0/64 [00:00<?, ?it/s]

Sort Sample 5:   0%|          | 0/64 [00:00<?, ?it/s]

Shuffle Map 6:   0%|          | 0/64 [00:00<?, ?it/s]

Shuffle Reduce 7:   0%|          | 0/64 [00:00<?, ?it/s]

- MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle 8:   0%|          | 0/64 [00:00<?, ?it/s]

Shuffle Map 9:   0%|          | 0/64 [00:00<?, ?it/s]

Shuffle Reduce 10:   0%|          | 0/64 [00:00<?, ?it/s]

- Aggregate 11:   0%|          | 0/64 [00:00<?, ?it/s]

Shuffle Map 12:   0%|          | 0/64 [00:00<?, ?it/s]

Shuffle Reduce 13:   0%|          | 0/64 [00:00<?, ?it/s]

Running 0:   0%|          | 0/64 [00:00<?, ?it/s]

Sort Sample 0:   0%|          | 0/8 [00:00<?, ?it/s]

Sort Sample 0:   0%|          | 0/8 [00:00<?, ?it/s]

2023-11-13 14:10:02,962	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(8)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> TaskPoolMapOperator[MapBatches(preprocess)] -> LimitOperator[limit=128]
2023-11-13 14:10:02,970	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)
2023-11-13 14:10:02,970	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- RandomShuffle 1:   0%|          | 0/64 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/64 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/64 [00:00<?, ?it/s]

- Sort 4:   0%|          | 0/64 [00:00<?, ?it/s]

Sort Sample 5:   0%|          | 0/64 [00:00<?, ?it/s]

Shuffle Map 6:   0%|          | 0/64 [00:00<?, ?it/s]

Shuffle Reduce 7:   0%|          | 0/64 [00:00<?, ?it/s]

- MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle 8:   0%|          | 0/64 [00:00<?, ?it/s]

Shuffle Map 9:   0%|          | 0/64 [00:00<?, ?it/s]

Shuffle Reduce 10:   0%|          | 0/64 [00:00<?, ?it/s]

Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

Sort Sample 0:   0%|          | 0/8 [00:00<?, ?it/s]

  tensor_batch[key] = torch.as_tensor(array, dtype=dtypes[key], device=get_device())


{'ids': tensor([[  102,  2848, 11695,  ...,     0,     0,     0],
         [    0,     0,     0,  ...,     0,     0,     0],
         [    0,     0,     0,  ...,     0,     0,     0],
         ...,
         [    0,     0,     0,  ...,     0,     0,     0],
         [    0,     0,     0,  ...,     0,     0,     0],
         [    0,     0,     0,  ...,     0,     0,     0]], dtype=torch.int32),
 'masks': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]], dtype=torch.int32),
 'targets': tensor([2, 3, 3, 2, 0, 0, 2, 0, 1, 3, 0, 2, 0, 2, 1, 2, 2, 2, 3, 2, 0, 0, 2, 0,
         0, 2, 2, 0, 3, 2, 0, 1, 1, 0, 2, 1, 0, 3, 2, 0, 0, 0, 0, 3, 2, 0, 2, 0,
         0, 2, 0, 2, 2, 0, 0, 2, 0, 3, 1, 2, 0, 2, 2, 2, 3, 2, 3, 3, 2, 1, 0, 0,
         2, 0, 0, 2, 2, 2, 2, 3, 3, 2, 0, 0, 2, 0, 1, 3, 0, 2, 0, 2, 1, 2, 2, 2,
         3, 2, 

### Utilitites

In [22]:
from ray.air import Checkpoint, session
from ray.air.config import CheckpointConfig, DatasetConfig, RunConfig, ScalingConfig
import ray.train as train
from ray.train.torch import TorchCheckpoint, TorchTrainer
import torch.nn.functional as F

In [16]:
def train_step(ds, batch_size, model, num_classes, loss_fn, optimizer):
    """Train step."""
    model.train()
    loss = 0.0
    ds_generator = ds.iter_torch_batches(batch_size=batch_size, collate_fn=collate_fn)
    for i, batch in enumerate(ds_generator):
        optimizer.zero_grad()  # reset gradients
        z = model(batch)  # forward pass
        targets = F.one_hot(batch["targets"], num_classes=num_classes).float()  # one-hot (for loss_fn)
        J = loss_fn(z, targets)  # define loss
        J.backward()  # backward pass
        optimizer.step()  # update weights
        loss += (J.detach().item() - loss) / (i + 1)  # cumulative loss
    return loss

In [17]:
def eval_step(ds, batch_size, model, num_classes, loss_fn):
    """Eval step."""
    model.eval()
    loss = 0.0
    y_trues, y_preds = [], []
    ds_generator = ds.iter_torch_batches(batch_size=batch_size, collate_fn=collate_fn)
    with torch.inference_mode():
        for i, batch in enumerate(ds_generator):
            z = model(batch)
            targets = F.one_hot(batch["targets"], num_classes=num_classes).float()  # one-hot (for loss_fn)
            J = loss_fn(z, targets).item()
            loss += (J - loss) / (i + 1)
            y_trues.extend(batch["targets"].cpu().numpy())
            y_preds.extend(torch.argmax(z, dim=1).cpu().numpy())
    return loss, np.vstack(y_trues), np.vstack(y_preds)

In [20]:
# Training loop
def train_loop_per_worker(config):
    # Hyperparameters
    dropout_p = config["dropout_p"]
    lr = config["lr"]
    lr_factor = config["lr_factor"]
    lr_patience = config["lr_patience"]
    num_epochs = config["num_epochs"]
    batch_size = config["batch_size"]
    num_classes = config["num_classes"]

    # Get datasets
    set_seed()
    train_ds = session.get_dataset_shard("train")
    val_ds = session.get_dataset_shard("val")

    # Model
    llm = BertModel.from_pretrained("allenai/scibert_scivocab_uncased", return_dict=False)
    model = fineTunedLLM(llm=llm, dropout_p=dropout_p, embedding_dim=llm.config.hidden_size, num_classes=num_classes)
    model = train.torch.prepare_model(model)

    # Training components
    loss_fn = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=lr_factor, patience=lr_patience)

    # Training
    batch_size_per_worker = batch_size // session.get_world_size()
    for epoch in range(num_epochs):
        # Step
        train_loss = train_step(train_ds, batch_size_per_worker, model, num_classes, loss_fn, optimizer)
        val_loss, _, _ = eval_step(val_ds, batch_size_per_worker, model, num_classes, loss_fn)
        scheduler.step(val_loss)

        # Checkpoint
        metrics = dict(epoch=epoch, lr=optimizer.param_groups[0]["lr"], train_loss=train_loss, val_loss=val_loss)
        checkpoint = TorchCheckpoint.from_model(model=model)
        session.report(metrics, checkpoint=checkpoint)

### Configurations

In [23]:
# Train loop config
train_loop_config = {
    "dropout_p": 0.5,
    "lr": 1e-4,
    "lr_factor": 0.8,
    "lr_patience": 3,
    "num_epochs": 10,
    "batch_size": 256,
    "num_classes": num_classes,
}

# Scaling config
scaling_config = ScalingConfig(
    num_workers=num_workers,
    use_gpu=bool(resources_per_worker["GPU"]),
    resources_per_worker=resources_per_worker,
    _max_cpu_fraction_per_node=0.8,
)

# Run config
checkpoint_config = CheckpointConfig(num_to_keep=1, checkpoint_score_attribute="val_loss", checkpoint_score_order="min")
run_config = RunConfig(name="llm", checkpoint_config=checkpoint_config, local_dir="~/ray_results")

NameError: name 'num_workers' is not defined

### Training

In [24]:
# Load and split data
ds = load_data()
train_ds, val_ds = stratify_split(ds, stratify="tag", test_size=TEST_SIZE)

# Preprocess
preprocessor = customPreprocessor()
train_ds =  preprocessor.fit_transform(train_ds)
val_ds = preprocessor.transform(val_ds)
train_ds = train_ds.materialize()
val_ds = val_ds.materialize()

2023-11-13 14:21:54,427	INFO read_api.py:406 -- To satisfy the requested parallelism of 8, each read task output is split into 8 smaller blocks.


AttributeError: Use `ds.count()` to compute the length of a distributed Dataset. This may be an expensive operation.

In [25]:
# Dataset config
dataset_config = {
    "train": DatasetConfig(fit=False, transform=False, randomize_block_order=False),
    "val": DatasetConfig(fit=False, transform=False, randomize_block_order=False),
}

In [None]:
# Trainer
trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    train_loop_config=train_loop_config,
    scaling_config=scaling_config,
    run_config=run_config,
    datasets={"train": train_ds, "val": val_ds},
    dataset_config=dataset_config,
    preprocessor=preprocessor,
)

# Train
results = trainer.fit()

In [None]:
results.metrics_dataframe

In [None]:
results.best_checkpoints

### Evaluation

In [26]:
from ray.train.torch import TorchPredictor
from sklearn.metrics import precision_recall_fscore_support



In [None]:
# Predictor
best_checkpoint = results.best_checkpoints[0][0]
predictor = TorchPredictor.from_checkpoint(best_checkpoint)
preprocessor = predictor.get_preprocessor()

In [None]:
# Test (holdout) dataset
HOLDOUT_LOC = 'holdout.csv'
test_ds = ray.data.read_csv(HOLDOUT_LOC)
preprocessed_ds = preprocessor.transform(test_ds)
preprocessed_ds.take(1)

In [None]:
# y_true
values = preprocessed_ds.select_columns(cols=["targets"]).take_all()
y_true = np.stack([item["targets"] for item in values])
print (y_true)

In [None]:
# y_pred
z = predictor.predict(data=test_ds.to_pandas())["predictions"]
y_pred = np.stack(z).argmax(1)
print (y_pred)

In [None]:
# Evaluate
metrics = precision_recall_fscore_support(y_true, y_pred, average="weighted")
{"precision": metrics[0], "recall": metrics[1], "f1": metrics[2]}

In [None]:
def evaluate(ds, predictor):
    # y_true
    preprocessor = predictor.get_preprocessor()
    preprocessed_ds = preprocessor.transform(ds)
    values = preprocessed_ds.select_columns(cols=["targets"]).take_all()
    y_true = np.stack([item["targets"] for item in values])

    # y_pred
    z = predictor.predict(data=ds.to_pandas())["predictions"]
    y_pred = np.stack(z).argmax(1)

    # Evaluate
    metrics = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    performance = {"precision": metrics[0], "recall": metrics[1], "f1": metrics[2]}
    return performance

### Inferences

In [27]:
import pandas as pd

def format_prob(prob, index_to_class):
    d = {}
    for i, item in enumerate(prob):
        d[index_to_class[i]] = item
    return d

def decode(indices, idx_to_label):
    return [idx_to_label[index] for index in indices]


def predict_with_proba(df, predictor):
    preprocessor = predictor.get_preprocessor()
    z = predictor.predict(data=df)["predictions"]
    y_prob = torch.tensor(np.stack(z)).softmax(dim=1).numpy()
    results = []
    for i, prob in enumerate(y_prob):
        tag = decode([z[i].argmax()], preprocessor.index_to_class)[0]
        results.append({"prediction": tag, "probabilities": format_prob(prob, preprocessor.index_to_class)})
    return results

In [None]:
# Preprocessor
predictor = TorchPredictor.from_checkpoint(best_checkpoint)
preprocessor = predictor.get_preprocessor()

In [None]:
# Predict on sample
title = "Transfer learning with transformers"
description = "Using transformers for transfer learning on text classification tasks."
sample_df = pd.DataFrame([{"title": title, "description": description, "tag": "other"}])
predict_with_proba(df=sample_df, predictor=predictor)