In [1]:
import sys, json, re
import pandas as pd
from pathlib import Path
from decouple import config
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForMaskedLM,  file_utils
# Directory that transformers reports as its default download cache.
cache_dir = Path(file_utils.default_cache_path)

# Raise pandas' display limits so wide/long frames are not truncated in the output.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 1000)

# The data folder is a sibling of the directory the notebook runs from.
data_dir = Path.cwd().parent / "data"


def ls(p):
    """Print every entry of directory `p`, one path per line."""
    print("\n".join(str(entry) for entry in p.iterdir()))


ls(data_dir)
# Alternative, much smaller checkpoint for quick experiments:
#hf_model_name = "gpt2"
hf_model_name = "mistralai/Mistral-7B-v0.1"

  from .autonotebook import tqdm as notebook_tqdm


C:\Users\TamirBracha\LLM\llm_workshop\data\sample_apps.parquet


In [2]:
# Read the whole table once; sampling happens only after the category
# vocabulary has been built.
apps_raw = pd.read_parquet(data_dir / "sample_apps.parquet")

# Count categories over *all* apps. Building this from a 9-row random sample
# made the label set (used further down as the allowed options for `select`)
# tiny and different on every run.
categories = apps_raw["category_names"].str.lower().str.split(',').explode().value_counts()

# `df` stays a 9-row sample as before, but seeded so the displayed rows are
# reproducible after Restart & Run All.
df = apps_raw.sample(9, random_state=0)
df

Unnamed: 0,bundle_id,title,description,store_url,category_names,ios
6956,930574573,Sniper 3D: Gun Shooting Games,Ready to have FUN? Download now the best shoot...,https://apps.apple.com/us/app/sniper-3d-gun-sh...,"Games,Action,Casual,Entertainment",True
5959,664575829,Fishdom,Never Fishdomed before? Then take a deep breat...,https://apps.apple.com/us/app/fishdom/id664575...,"Games,Puzzle,Entertainment,Simulation",True
27431,com.inspiredsquare.jupiter,2248 - Number Puzzle Game,2248 Number Block Puzzle Game2248 Puzzle Game:...,https://play.google.com/store/apps/details?id=...,"GAME_PUZZLE,GAME",False
39423,com.pinterest,Pinterest,Pinterest is the place to explore inspiration....,https://play.google.com/store/apps/details?id=...,"LIFESTYLE,APPLICATION",False
36869,com.nexters.herowars,Hero Wars – Fantasy Battles,Unlock skills and battle enemies with Hero War...,https://play.google.com/store/apps/details?id=...,"GAME_ROLE_PLAYING,GAME",False
7478,com.alibaba.intl.android.apps.poseidon,Alibaba.com - B2B marketplace,What is Alibaba.com?\nAlibaba.com is one of th...,https://play.google.com/store/apps/details?id=...,"SHOPPING,APPLICATION",False
60188,net.supertreat.solitaire,Solitaire Grand Harvest,Welcome to Solitaire Grand Harvest! Play this ...,https://play.google.com/store/apps/details?id=...,"GAME_CARD,GAME",False
21686,com.futureplay.mergematch,Merge Gardens,Do you dream about your own gentle garden? You...,https://play.google.com/store/apps/details?id=...,"GAME_PUZZLE,GAME",False
56472,in.playsimple.wordtrip,Word Trip,WINNER OF THE PRESTIGIOUS ACADEMICS' CHOICE MI...,https://play.google.com/store/apps/details?id=...,"GAME_WORD,GAME",False


# Verbalizers

## Verbalizers as masks

Most of the generation models we have used so far are `CausalLM` models, trained to predict the next token.

However, we can use `MaskedLM` models (which tend to be smaller) when we are looking for a completion in the middle of a sentence.

In [3]:
def masked_lm_yes_or_no(txt, model_str):
  """Ask a masked language model whether "yes" or "no" best fills the mask in `txt`.

  Args:
    txt: Prompt containing the tokenizer's mask token (e.g. "<mask>" for
      BART). Only the first mask position is scored.
    model_str: Model name or path handed to the `from_pretrained` loaders.

  Returns:
    A 1-D integer tensor with one entry per encoded row: 1 when the "yes"
    logit at the mask position is larger than the "no" logit, otherwise 0
    (a tie resolves to 0).

  Raises:
    AssertionError: if `txt` does not contain the tokenizer's mask token.
    ValueError: if "yes" or "no" is not a single token for this tokenizer.
  """
  tokenizer = AutoTokenizer.from_pretrained(model_str)

  # Check against the tokenizer's own mask token instead of a hard-coded
  # "<mask>", and do it before the (expensive) model load.
  mask_token = tokenizer.mask_token
  assert mask_token is not None and mask_token in txt, (
    f"txt must contain the mask token {mask_token!r}")

  # Read both verbalizer ids from the *middle* of a carrier phrase. Tokenizers
  # that fold the leading space into the token encode a phrase-initial "yes"
  # differently from a space-preceded "no"; taking both words from non-initial
  # positions keeps the two candidates comparable. `add_special_tokens=False`
  # avoids having to filter out BOS/EOS/CLS/SEP ids by hand.
  carrier_ids = tokenizer.encode("answer yes or no", add_special_tokens=False)
  yes, no = carrier_ids[-3], carrier_ids[-1]
  for word, token_id in (("yes", yes), ("no", no)):
    if tokenizer.decode([token_id]).strip() != word:
      raise ValueError(f"{word!r} is not a single token for {model_str!r}")

  model = AutoModelForMaskedLM.from_pretrained(model_str)
  X = tokenizer.encode(txt, return_tensors="pt")

  # Pure inference: no autograd graph is needed.
  with torch.no_grad():
    logits = model(X).logits

  # Column index of the first mask token in the (single) encoded row.
  mask_idx = (X[0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0][0].item()

  # Candidates are ordered [no, yes], so the argmax is 0 for "no", 1 for "yes".
  return logits[:, mask_idx, [no, yes]].argmax(dim=-1)

In [None]:
# The answer slot is marked with the mask token; the result is shown as the cell output.
masked_lm_yes_or_no(
  txt="Is an apple a fruit? answer: <mask>",
  model_str="facebook/bart-large",
)

## Verbalizers from generation models

In [5]:
def causal_lm_yes_or_no(txt, model_str):
  """Ask a causal language model whether "yes" or "no" is the likelier next token after `txt`.

  Args:
    txt: Prompt that ends right where the answer should start.
    model_str: Model name or path handed to the `from_pretrained` loaders.

  Returns:
    A 1-D integer tensor with one entry per encoded row: 1 when the "yes"
    logit at the last position is larger than the "no" logit, otherwise 0
    (a tie resolves to 0).

  Raises:
    ValueError: if "yes" or "no" is not a single token for this tokenizer.
  """
  tokenizer = AutoTokenizer.from_pretrained(model_str)

  # Read both verbalizer ids from the *middle* of a carrier phrase. Tokenizers
  # that fold the leading space into the token encode a phrase-initial "yes"
  # differently from a space-preceded "no"; taking both words from non-initial
  # positions keeps the two candidates comparable. `add_special_tokens=False`
  # avoids having to filter out BOS/EOS ids by hand.
  carrier_ids = tokenizer.encode("answer yes or no", add_special_tokens=False)
  yes, no = carrier_ids[-3], carrier_ids[-1]
  for word, token_id in (("yes", yes), ("no", no)):
    if tokenizer.decode([token_id]).strip() != word:
      raise ValueError(f"{word!r} is not a single token for {model_str!r}")

  model = AutoModelForCausalLM.from_pretrained(model_str)
  X = tokenizer.encode(txt, return_tensors="pt")

  # Pure inference: no autograd graph is needed.
  with torch.no_grad():
    logits = model(X).logits

  # Logits at the last position score the next token. Candidates are ordered
  # [no, yes], so the argmax is 0 for "no" and 1 for "yes".
  return logits[:, -1, [no, yes]].argmax(dim=-1)

In [6]:
# A causal LM has no mask token: the prompt simply stops where the answer
# should begin, and the function scores the next token. (The previous prompt
# ended in a literal "<mask>", so the scores were read after those characters.)
causal_lm_yes_or_no("Is an apple a fruit? answer:", "gpt2")

Downloading config.json: 100%|██████████| 665/665 [00:00<00:00, 753kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 6.27MB/s]
Downloading merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 1.73MB/s]
Downloading tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 7.97MB/s]
Downloading model.safetensors: 100%|██████████| 548M/548M [00:49<00:00, 11.1MB/s] 
Downloading generation_config.json: 100%|██████████| 124/124 [00:00<?, ?B/s] 


tensor([0])

# JSONFormer
JSONFormer constrains the decoder to output only the most likely tokens that result in valid JSON according to a predefined schema.

In [None]:
from jsonformer import Jsonformer

# Tokenizer and weights both come from the checkpoint named in `hf_model_name`.
tokenizer = AutoTokenizer.from_pretrained(hf_model_name)
model = AutoModelForCausalLM.from_pretrained(hf_model_name)

# Shape of the object we want back: three scalar fields plus a list of strings.
json_schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "age": {"type": "number"},
        "is_for_kids": {"type": "boolean"},
        "categories": {"type": "array", "items": {"type": "string"}},
    },
}

prompt = "Please describe 'Candy crush' with the following schema"

# Build the schema-constrained generator, then call it to get the generated data.
jsonformer = Jsonformer(model, tokenizer, json_schema, prompt)
generated_data = jsonformer()

print(generated_data)

Downloading config.json: 100%|██████████| 571/571 [00:00<?, ?B/s] 
Downloading (…)model.bin.index.json: 100%|██████████| 23.9k/23.9k [00:00<00:00, 22.4MB/s]
Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]
Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s][A
Downloading (…)l-00001-of-00002.bin:   0%|          | 10.5M/9.94G [00:00<14:14, 11.6MB/s][A
Downloading (…)l-00001-of-00002.bin:   0%|          | 21.0M/9.94G [00:01<14:22, 11.5MB/s][A
Downloading (…)l-00001-of-00002.bin:   0%|          | 31.5M/9.94G [00:02<14:18, 11.5MB/s][A
Downloading (…)l-00001-of-00002.bin:   0%|          | 41.9M/9.94G [00:03<14:16, 11.6MB/s][A
Downloading (…)l-00001-of-00002.bin:   1%|          | 52.4M/9.94G [00:04<14:14, 11.6MB/s][A
Downloading (…)l-00001-of-00002.bin:   1%|          | 62.9M/9.94G [00:05<14:11, 11.6MB/s][A
Downloading (…)l-00001-of-00002.bin:   1%|          | 73.4M/9.94G [00:06<14:11, 11.6MB/s][A
Downloading (…)l-00001-of-00002.bin:   1%|        

# Guidance
Guidance is a very popular library for constrained decoding, and it is much more "user-friendly" than JSONFormer.

In [None]:
from guidance import models, select, gen
# Build a Guidance model object for the checkpoint named in `hf_model_name`.
# NOTE(review): presumably this loads the weights again, independently of any
# `model` object created in an earlier cell - confirm memory headroom.
llm = models.Transformers(hf_model_name)

In [None]:
prompt = "Please categorize the mobile app 'slotomania'"
# The first positional argument of `gen` is the capture name, not the prompt,
# so `gen(prompt, ...)` generated from an empty context. Append the prompt to
# the model state first, then generate up to 10 tokens.
llm + prompt + gen(max_tokens=10)

In [26]:
app = "Solitaire Grand Harvest"

# Restrict the completion to one of the category labels held in `categories`.
llm + f'{app} is ' + select(categories.index.tolist())

# Exercise 3
Answer the questions in exercise 1 with `Mistral-7B`