In [17]:
import sys, json, re, collections
import pandas as pd
from pathlib import Path
from decouple import config
sys.path.append("../src/")
from llm_helpers import openai_client, chat_gpt_ask_functions_multiclass, chat_gpt_ask_functions_most_likley

pd.set_option('display.max_columns', 1000, 'display.width', 1000, 'display.max_rows',1000)

data_dir = Path(".").absolute().parent/"data"
ls = lambda p:print("\n".join(map(str,p.iterdir())))

ls(data_dir)

C:\Users\TamirBracha\LLM\llm_workshop\data\sample_apps.parquet


In [18]:
df = pd.read_parquet(data_dir / "sample_apps.parquet").sample(9)
df

Unnamed: 0,bundle_id,title,description,store_url,category_names,ios
5315,530168168,Paramount+,Welcome to A Mountain of Entertainment. Stream...,https://apps.apple.com/us/app/paramount/id5301...,Entertainment,True
24056,com.gramgames.mergedragons,Merge Dragons!,Discover a magical land of entertainment and m...,https://play.google.com/store/apps/details?id=...,"GAME_PUZZLE,GAME",False
35146,com.moonactive.coinmaster,Coin Master,Join your Facebook friends and millions of pla...,https://play.google.com/store/apps/details?id=...,"GAME_CASUAL,GAME",False
60188,net.supertreat.solitaire,Solitaire Grand Harvest,Welcome to Solitaire Grand Harvest! Play this ...,https://play.google.com/store/apps/details?id=...,"GAME_CARD,GAME",False
54216,de.cellular.ottohybrid,OTTO – Shopping & Möbel,Install the OTTO app now 📲 and shop fashion tr...,https://play.google.com/store/apps/details?id=...,"SHOPPING,APPLICATION",False
383,air.com.buffalo_studios.newflashbingo,Bingo Blitz™️ - Bingo Games,Experience your free online bingo game as you ...,https://play.google.com/store/apps/details?id=...,"GAME_BOARD,GAME",False
24041,com.grabtaxi.passenger,Grab Superapp,Grab is Southeast Asia’s leading superapp. We ...,https://play.google.com/store/apps/details?id=...,"TRAVEL_AND_LOCAL,APPLICATION",False
7478,com.alibaba.intl.android.apps.poseidon,Alibaba.com - B2B marketplace,What is Alibaba.com?\nAlibaba.com is one of th...,https://play.google.com/store/apps/details?id=...,"SHOPPING,APPLICATION",False
659,air.com.playtika.slotomania,Slotomania™ Slots Casino Games,Slotomania wishes you a Happy New Year! Play n...,https://play.google.com/store/apps/details?id=...,"GAME_CASINO,GAME",False


In [19]:
categories = df["category_names"].str.lower().str.split(',').explode().value_counts()
categories

category_names
game                5
application         3
shopping            2
entertainment       1
game_puzzle         1
game_casual         1
game_card           1
game_board          1
travel_and_local    1
game_casino         1
Name: count, dtype: int64

## Naive approach, just Ask nicely.

In [20]:
def openai_ask(prompts):
    response = openai_client.completions.create(
        model="text-davinci-003",
        prompt=prompts,
    )
    ret = [choice.text.strip().lower() for choice in response.choices]
    return ret


In [21]:
prompt = "please choose the most likely category that apply to 'Crossword Jam' from the following list:\n"
prompt+="\n".join(categories.index)
openai_ask(prompt)

['game_puzzle']

In [22]:
openai_ask(prompt)

['game_puzzle']

In [23]:
openai_ask(prompt)

['game_puzzle']

Seem to work pretty, well - let's try multi label?

In [24]:
prompt = "please choose all the categories that apply to 'Crossword Jam' from the following list:\n"
prompt+="\n".join(categories.index)
[sorted(map(str.strip,l.strip().replace(",", "\n").split("\n"))) for l in openai_ask([prompt]*10)]

[['application', 'entertainment', 'game', 'game', 'game_puzzle'],
 ['application', 'entertainment', 'game', 'game', 'game_puzzle'],
 ['application', 'entertainment', 'game', 'game', 'game_puzzle'],
 ['application', 'entertainment', 'game', 'game', 'game_puzzle'],
 ['game', 'game_', 'game_casual', 'game_puzzle'],
 ['game', 'game_', 'game_casual', 'game_puzzle'],
 ['application', 'entertainment', 'game', 'game', 'game_puzzle'],
 ['application', 'entertainment', 'game', 'game', 'game_puzzle'],
 ['application', 'entertainment', 'game', 'game', 'game_puzzle'],
 ['application', 'entertainment', 'game', 'game_cas', 'game_puzzle']]

In [25]:
"game casual" in categories

False

## Using function calls
### Choose most likely class

In [26]:
def classify_most_likely(prompt):
    messages = [{"role": "user", "content": prompt}]
    tools = [
        {
            "type": "function",
            "function": {
                "name": "classify_app",
                "description": "Classify to an enum type",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "class": {"type": "string", "enum": list(categories.index)},
                    },
                    "required": ["class"],
                },
            },
        }
    ]
    response = openai_client.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        messages=messages,
        tools=tools,
        tool_choice={"type": "function", "function": {"name": "classify_app"}}
    )
    ret =  response.choices[0].message.tool_calls[0].function.arguments
    return json.loads(ret)["class"]

classify_most_likely("please choose the most likely category that apply to 'Crossword Jam'")

'game_puzzle'

In [27]:
def classify_multiclass(prompt):
    messages = [{"role": "user", "content": prompt}]
    tools = [
        {
            "type": "function",
            "function": {
                "name": "classify_app",
                "description": "Classify to an enum type",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "classes": {"type": "array", "items": {"type": "string", "enum": list(categories.index)}},
                    },
                    "required": ["classes"],
                },
            },
        }
    ]
    response = openai_client.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        messages=messages,
        tools=tools,
        tool_choice={"type": "function", "function": {"name": "classify_app"}}
    )
    ret =  response.choices[0].message.tool_calls[0].function.arguments
    return json.loads(ret)["classes"]

classify_multiclass("please choose the all the categories that apply to 'Crossword Jam'")

['game', 'game_puzzle']

In [28]:
classify_multiclass("please choose the all the categories that apply to 'Crossword Jam'")

['game', 'game_puzzle']

## Question:
Please write ask the model to classify all of the apps using the methods we learnt.

Then create an additional column "jaccard" indicating whether intersection over union of the actual categories and the predicted ones.

Which method was most successful?

In [29]:
df["category_multiclass"] = df.apply(chat_gpt_ask_functions_multiclass("please choose the all the categories that apply to the title: {title}, and the description: {description}", categories), axis=1)
df

Unnamed: 0,bundle_id,title,description,store_url,category_names,ios,category_multiclass
5315,530168168,Paramount+,Welcome to A Mountain of Entertainment. Stream...,https://apps.apple.com/us/app/paramount/id5301...,Entertainment,True,"[application, entertainment, game, game_casual..."
24056,com.gramgames.mergedragons,Merge Dragons!,Discover a magical land of entertainment and m...,https://play.google.com/store/apps/details?id=...,"GAME_PUZZLE,GAME",False,"[game, game_puzzle, entertainment, game_casual]"
35146,com.moonactive.coinmaster,Coin Master,Join your Facebook friends and millions of pla...,https://play.google.com/store/apps/details?id=...,"GAME_CASUAL,GAME",False,"[game, application, game_casual, game_puzzle, ..."
60188,net.supertreat.solitaire,Solitaire Grand Harvest,Welcome to Solitaire Grand Harvest! Play this ...,https://play.google.com/store/apps/details?id=...,"GAME_CARD,GAME",False,"[game, application, game_puzzle, game_casual, ..."
54216,de.cellular.ottohybrid,OTTO – Shopping & Möbel,Install the OTTO app now 📲 and shop fashion tr...,https://play.google.com/store/apps/details?id=...,"SHOPPING,APPLICATION",False,"[shopping, application, fashion, technology, e..."
383,air.com.buffalo_studios.newflashbingo,Bingo Blitz™️ - Bingo Games,Experience your free online bingo game as you ...,https://play.google.com/store/apps/details?id=...,"GAME_BOARD,GAME",False,"[game, game_casual, game_board, game_casino, e..."
24041,com.grabtaxi.passenger,Grab Superapp,Grab is Southeast Asia’s leading superapp. We ...,https://play.google.com/store/apps/details?id=...,"TRAVEL_AND_LOCAL,APPLICATION",False,"[application, shopping, travel_and_local]"
7478,com.alibaba.intl.android.apps.poseidon,Alibaba.com - B2B marketplace,What is Alibaba.com?\nAlibaba.com is one of th...,https://play.google.com/store/apps/details?id=...,"SHOPPING,APPLICATION",False,"[application, shopping, game, travel_and_local]"
659,air.com.playtika.slotomania,Slotomania™ Slots Casino Games,Slotomania wishes you a Happy New Year! Play n...,https://play.google.com/store/apps/details?id=...,"GAME_CASINO,GAME",False,"[game, game_casino, entertainment]"


In [30]:
df["category_most_likley"] = df.apply(chat_gpt_ask_functions_most_likley("please choose the all the categories that apply to the title: {title}, and the description: {description}", categories), axis=1)
df

Unnamed: 0,bundle_id,title,description,store_url,category_names,ios,category_multiclass,category_most_likley
5315,530168168,Paramount+,Welcome to A Mountain of Entertainment. Stream...,https://apps.apple.com/us/app/paramount/id5301...,Entertainment,True,"[application, entertainment, game, game_casual...",application
24056,com.gramgames.mergedragons,Merge Dragons!,Discover a magical land of entertainment and m...,https://play.google.com/store/apps/details?id=...,"GAME_PUZZLE,GAME",False,"[game, game_puzzle, entertainment, game_casual]",game
35146,com.moonactive.coinmaster,Coin Master,Join your Facebook friends and millions of pla...,https://play.google.com/store/apps/details?id=...,"GAME_CASUAL,GAME",False,"[game, application, game_casual, game_puzzle, ...",game
60188,net.supertreat.solitaire,Solitaire Grand Harvest,Welcome to Solitaire Grand Harvest! Play this ...,https://play.google.com/store/apps/details?id=...,"GAME_CARD,GAME",False,"[game, application, game_puzzle, game_casual, ...",game
54216,de.cellular.ottohybrid,OTTO – Shopping & Möbel,Install the OTTO app now 📲 and shop fashion tr...,https://play.google.com/store/apps/details?id=...,"SHOPPING,APPLICATION",False,"[shopping, application, fashion, technology, e...",shopping
383,air.com.buffalo_studios.newflashbingo,Bingo Blitz™️ - Bingo Games,Experience your free online bingo game as you ...,https://play.google.com/store/apps/details?id=...,"GAME_BOARD,GAME",False,"[game, game_casual, game_board, game_casino, e...",game
24041,com.grabtaxi.passenger,Grab Superapp,Grab is Southeast Asia’s leading superapp. We ...,https://play.google.com/store/apps/details?id=...,"TRAVEL_AND_LOCAL,APPLICATION",False,"[application, shopping, travel_and_local]",application
7478,com.alibaba.intl.android.apps.poseidon,Alibaba.com - B2B marketplace,What is Alibaba.com?\nAlibaba.com is one of th...,https://play.google.com/store/apps/details?id=...,"SHOPPING,APPLICATION",False,"[application, shopping, game, travel_and_local]",application
659,air.com.playtika.slotomania,Slotomania™ Slots Casino Games,Slotomania wishes you a Happy New Year! Play n...,https://play.google.com/store/apps/details?id=...,"GAME_CASINO,GAME",False,"[game, game_casino, entertainment]",game_casino


In [31]:
def jaccard_set(list1, list2):
    """Define Jaccard Similarity function for two sets"""
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union