### Load Data

In [3]:
import pandas as pd

# Location of the training CSV, given relative to the notebook's working directory.
dataset_loc = 'dataset.csv'

# Read the CSV into a DataFrame and preview its first rows.
train_df = pd.read_csv(dataset_loc)
train_df.head()

Unnamed: 0,id,created_on,title,description,tag
0,6,2020-02-20 06:43:18,Comparison between YOLO and RCNN on real world...,Bringing theory to experiment is cool. We can ...,computer-vision
1,7,2020-02-20 06:47:21,"Show, Infer & Tell: Contextual Inference for C...",The beauty of the work lies in the way it arch...,computer-vision
2,9,2020-02-24 16:24:45,Awesome Graph Classification,"A collection of important graph embedding, cla...",other
3,15,2020-02-28 23:55:26,Awesome Monte Carlo Tree Search,A curated list of Monte Carlo tree search pape...,other
4,25,2020-03-07 23:04:31,AttentionWalk,"A PyTorch Implementation of ""Watch Your Step: ...",other


In [23]:
# Unique labels: distinct values of the `tag` column, in order of first appearance,
# as a plain Python list. Later cells interpolate this list into the prompts.

tags = train_df.tag.unique().tolist()
tags

['computer-vision', 'other', 'natural-language-processing', 'mlops']

In [10]:
# Load the holdout dataset that the prompting experiments below are evaluated on.

holdout_dataset = 'holdout.csv'

# Read the holdout CSV and preview its first rows.
test_df = pd.read_csv(holdout_dataset)
test_df.head()



Unnamed: 0,id,created_on,title,description,tag
0,19,2020-03-03 13:54:31,Diffusion to Vector,Reference implementation of Diffusion2Vec (Com...,other
1,26,2020-03-07 23:11:58,Graph Wavelet Neural Network,"A PyTorch implementation of ""Graph Wavelet Neu...",other
2,44,2020-03-08 00:32:58,Capsule Graph Neural Network,"A PyTorch implementation of ""Capsule Graph Neu...",other
3,80,2020-03-20 05:59:32,NeRF: Neural Radiance Fields,Representing scenes as neural radiance fields ...,computer-vision
4,84,2020-03-20 15:18:43,Mention Classifier,Category prediction model\r\nThis repo contain...,natural-language-processing


### Utilities


In [11]:
import json
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_theme()
from sklearn.metrics import precision_recall_fscore_support
import time
from tqdm import tqdm

In [8]:
! pip install -q openai

You should consider upgrading via the 'D:\MadeWithMl\venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [None]:
from openai import OpenAI

system_content = "you only answer in rhymes"  # system content (behavior)
assistant_content = ""  # assistant content (context)
user_content = "how are you"  # user content (message)

# With no api_key argument the client reads the key from the OPENAI_API_KEY
# environment variable, which keeps the secret out of the notebook.
client = OpenAI()

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": system_content},
        # The role must be spelled exactly "assistant"; any other value is rejected by the API.
        {"role": "assistant", "content": assistant_content},
        {"role": "user", "content": user_content},
    ],
)
# The v1 client returns a typed response object, so fields are read as attributes
# rather than with dict-style subscripting.
print(response.choices[0].message.content)


### Create a function to predict tags

In [None]:
def get_tag(model, system_content="", assistant_content="", user_content=""):
    """Ask an OpenAI chat model for the tag of a single input.

    Args:
        model: Name of the chat model to query (e.g. "gpt-3.5-turbo-0613").
        system_content: System message describing how the model should behave.
        assistant_content: Assistant message carrying extra context (for
            example few-shot examples); may be an empty string.
        user_content: User message containing the item to label.

    Returns:
        The text of the first completion choice, or None when the request
        fails with a server-side or connection error.
    """
    # Local import: gives access to the exception classes of the installed
    # client and keeps this function usable without relying on another cell.
    import openai

    # With no api_key argument the client reads OPENAI_API_KEY from the
    # environment, so no secret is stored in the notebook.
    client = openai.OpenAI()

    try:
        # Get response from OpenAI
        response = client.chat.completions.create(
            model=model,  # honor the caller's choice instead of a hard-coded model
            messages=[
                {"role": "system", "content": system_content},
                {"role": "assistant", "content": assistant_content},
                {"role": "user", "content": user_content},
            ],
        )
        # The v1 response is a typed object, not a dict.
        return response.choices[0].message.content

    except (openai.InternalServerError, openai.APIConnectionError):
        # Best-effort: transient service failures yield "no prediction" so a
        # long batch run is not aborted. Other errors (e.g. bad credentials)
        # still propagate.
        return None
    

# Smoke test: classify one hand-written example with get_tag.
model = "gpt-3.5-turbo-0613"
# System prompt; {tags} interpolates the Python list of label names built earlier in the notebook.
system_context = f"""
    You are a NLP prediction service that predicts the label given an input's title and description.
    You must choose between one of the following labels for each input: {tags}.
    Only respond with the label name and nothing else.
    """
# No additional assistant context for this call.
assistant_content = ""
# The item to classify, written as "title: description".
user_context = "Transfer learning with transformers: Using transformers for transfer learning on text classification tasks."
tag = get_tag(model=model, system_content=system_context, assistant_content=assistant_content, user_content=user_context)
print (tag)

### Create a function to predict tags for a list of inputs

In [None]:
# First three holdout rows as {"title": ..., "description": ...} dicts, used to try out predict_tags.
samples = test_df[["title", "description"]].to_dict(orient="records")[:3]

def predict_tags(inputs, model, system_content='', assistant_content= ''):
    """Predict one tag per input by calling get_tag on each item.

    Args:
        inputs: Iterable of items to classify; each item is converted to a
            string with str() and sent as the user message.
        model: Model name forwarded to get_tag.
        system_content: System message forwarded to get_tag.
        assistant_content: Assistant message forwarded to get_tag.

    Returns:
        A list with one entry per input, in input order. Each entry is
        whatever get_tag returned for that item. Empty input yields an
        empty list.
    """
    y_pred = []

    for item in tqdm(inputs):
        # Convert item dict to string
        user_content = str(item)

        # Get prediction
        predicted_tag = get_tag(
            model=model, system_content=system_content,
            assistant_content=assistant_content, user_content=user_content)

        # Collect inside the loop so every input contributes a prediction.
        y_pred.append(predicted_tag)

    return y_pred

# Get predictions for a list of inputs, using the system prompt defined for the
# single-item test above; assistant_content keeps its default empty string.
predict_tags(inputs=samples, model=model, system_content=system_context)


### Create a function to clean our predicted tags

In [18]:
def clean_tags(y_pred, tags, default = 'other'):
    """Normalize predicted tags so that every entry is one of the known tags.

    Args:
        y_pred: List of raw predictions. Entries may be None (get_tag
            returns None when the request fails). The list is updated in
            place.
        tags: Collection of valid tag names.
        default: Value used for any prediction that is not a valid tag.

    Returns:
        The same list object as y_pred, with each entry replaced by either
        a valid tag or the default.
    """
    for i, tag in enumerate(y_pred):
        # Strip one pair of surrounding single quotes first, so a quoted
        # valid label such as "'mlops'" is recognized.
        if isinstance(tag, str) and len(tag) >= 2 and tag.startswith("'") and tag.endswith("'"):
            tag = tag[1:-1]

        # Validate after unquoting: anything unknown (including None and
        # quoted-but-invalid labels) falls back to the default.
        y_pred[i] = tag if tag in tags else default

    return y_pred

In [19]:
def plot_tag_dist(y_true, y_pred):
    """Draw a grouped bar chart comparing true and predicted tag counts.

    Args:
        y_true: Iterable of ground-truth tags.
        y_pred: Iterable of predicted tags.

    The x-axis labels are the ones seaborn derives from the data, so a tag
    that only occurs in the predictions is still labelled correctly.
    """
    # Distribution of tags
    true_tag_freq = Counter(y_true)
    pred_tag_freq = Counter(y_pred)
    df_true = pd.DataFrame({"tag": list(true_tag_freq.keys()), "freq": list(true_tag_freq.values()), "source": "true"})
    df_pred = pd.DataFrame({"tag": list(pred_tag_freq.keys()), "freq": list(pred_tag_freq.values()), "source": "pred"})
    df = pd.concat([df_true, df_pred], ignore_index=True)

    # Plot
    fig, ax = plt.subplots(figsize=(10, 3))
    ax.set_title("Tag distribution", fontsize=14)
    sns.barplot(x="tag", y="freq", hue="source", data=df, ax=ax)
    # Style the existing tick labels instead of overwriting them: replacing
    # them with only the true tags mislabels (or fails on) prediction-only tags.
    ax.tick_params(axis="x", labelrotation=0, labelsize=8)
    ax.legend()
    plt.show()

### Create a function to combine all the utilities above

In [22]:
def evaluate(test_df, model, system_content, tags, assistant_content=""):
    """Predict tags for every row of test_df, then report and plot the results.

    Args:
        test_df: DataFrame with "title", "description" and "tag" columns.
        model: Model name forwarded to predict_tags.
        system_content: System message forwarded to predict_tags.
        tags: Valid tag names, forwarded to clean_tags.
        assistant_content: Assistant message forwarded to predict_tags.

    Returns:
        A (y_pred, performance) tuple: the cleaned predictions and a dict
        with "precision", "recall" and "f1" (weighted averages).
    """
    # Inputs for the model and the labels to score against, from the same frame.
    records = test_df[["title", "description"]].to_dict(orient="records")
    y_test = test_df.tag.to_list()

    # Raw predictions first, then cleaned against the list of valid tags.
    raw_predictions = predict_tags(
        inputs=records,
        model=model,
        system_content=system_content,
        assistant_content=assistant_content,
    )
    y_pred = clean_tags(y_pred=raw_predictions, tags=tags)

    # Only the first three entries of the metrics tuple are reported.
    precision, recall, f1 = precision_recall_fscore_support(y_test, y_pred, average="weighted")[:3]
    performance = {"precision": precision, "recall": recall, "f1": f1}

    print(json.dumps(performance, indent=2))
    plot_tag_dist(y_true=y_test, y_pred=y_pred)
    return y_pred, performance

### Zero Shot learning

In [None]:
# Result containers, keyed first by method ("zero_shot" / "few_shot") and then by model name.
y_pred = {"zero_shot": {}, "few_shot": {}}
performance = {"zero_shot": {}, "few_shot": {}}

# System prompt; {tags} interpolates the Python list of label names built earlier in the notebook.
system_content = f"""
    You are a NLP prediction service that predicts the label given an input's title and description.
    You must choose between one of the following labels for each input: {tags}.
    Only respond with the label name and nothing else.
    """

# Zero-shot with GPT 3.5: no assistant context is passed, so evaluate uses its default "".

method = "zero_shot"
model = "gpt-3.5-turbo-0613"
y_pred[method][model], performance[method][model] = evaluate(
    test_df=test_df, model=model, system_content=system_content, tags=tags)

### Few-Shot learning

In [None]:
# Create additional context with few samples from each class

num_samples = 2
additional_context = []
cols_to_keep = ["title", "description", "tag"]
for tag in tags:
    samples = train_df[cols_to_keep][train_df.tag == tag][:num_samples].to_dict(orient="records")
    additional_context.extend(samples)

assistant_content = f"""Here are some examples with the correct labels: {additional_context}"""

# Few-shot with GPT 3.5

method = "few_shot"
model = "gpt-3.5-turbo-0613"
y_pred[method][model], performance[method][model] = evaluate(
    test_df=test_df, model=model, system_content=system_content,
    assistant_content=assistant_content, tags=tags)

### Setup

In [6]:
import os
import random
import torch
from ray.data.preprocessor import Preprocessor
import numpy as np
import ray

In [3]:
def set_seed(seed=42):
    """Seed the random number generators used in this notebook.

    Seeds NumPy, PyTorch (CPU and CUDA) and Python's `random` module,
    switches cuDNN to deterministic mode with benchmarking disabled, and
    sets the PYTHONHASHSEED environment variable.

    Args:
        seed: Integer seed applied to every generator.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    random.seed(seed)
    # Plain attribute assignment; going through eval() on a string adds nothing
    # and hides the statement from linters and readers.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Only affects Python processes started after this point (e.g. workers),
    # since the running interpreter has already fixed its hash seed.
    os.environ["PYTHONHASHSEED"] = str(seed)

In [7]:
def load_data(num_samples=None, dataset_loc='dataset.csv'):
    """Load the CSV dataset as a shuffled Ray dataset.

    Args:
        num_samples: If given (and non-zero), only this many rows of the
            shuffled data are kept; otherwise the full dataset is returned.
        dataset_loc: Path of the CSV file to read.

    Returns:
        A Ray dataset: the full shuffled dataset, or a new dataset built
        from the first `num_samples` shuffled rows.
    """
    ds = ray.data.read_csv(dataset_loc)
    ds = ds.random_shuffle(seed=1234)  # fixed seed so the order is repeatable
    if num_samples:
        # take() returns a list of rows, which from_items turns back into a dataset.
        # from_items is only for that list; the full dataset is returned as-is.
        ds = ray.data.from_items(ds.take(num_samples))
    return ds

In [8]:
from madewithml.data import preprocess

class customPreprocessor(Preprocessor):
    """Preprocessor that learns tag/index lookup tables and applies `preprocess` to batches."""

    def _fit(self, ds):
        """Build both lookup tables from the unique values of the `tag` column of `ds`."""
        unique_tags = ds.unique(column='tag')

        # Each tag gets the position at which it appears in `unique_tags`.
        label_to_id = {}
        for position, label in enumerate(unique_tags):
            label_to_id[label] = position

        self.class_to_index = label_to_id
        # Reverse lookup: index -> tag.
        self.index_to_class = {position: label for label, position in label_to_id.items()}

    def _transform_pandas(self, batch):
        """Run `preprocess` on a batch, passing the fitted tag -> index mapping."""
        # presumably `batch` is a pandas DataFrame, given the method name — TODO confirm
        encoded = preprocess(batch, class_to_index=self.class_to_index)
        return encoded



### Model

In [9]:
from torch import nn
from transformers import BertModel

# Load the pretrained SciBERT encoder. With return_dict=False the forward pass returns a
# plain tuple, which is why the sample call later in the notebook unpacks `seq, pool`.
llm = BertModel.from_pretrained('allenai/scibert_scivocab_uncased', return_dict= False)
# Hidden size of the encoder, read from the model's config.
embedding_dim = llm.config.hidden_size

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the ch

In [10]:
from transformers import BertTokenizer

# Tokenizer matching the SciBERT checkpoint used for `llm`.
# NOTE(review): `returned_dict` looks like a misspelling of `return_dict`; confirm whether
# the tokenizer needs this argument at all.
tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased', returned_dict = False)

# Sample: tokenize one sentence and run it through the encoder.

text = "Transfer learning with transformers for text classification."
# Tokenize as a batch of one; the tokenizer is asked for NumPy arrays.
batch = tokenizer([text], return_tensors="np", padding="longest")
batch = {k:torch.tensor(v) for k,v in batch.items()}  # convert to torch tensors
# The model output is unpacked into two values — presumably per-token states and a
# pooled sentence vector; TODO confirm.
seq, pool = llm(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])