In [1]:
import os
import openai
import pandas as pd
from transformers import GPT2Tokenizer
from sklearn.metrics import adjusted_rand_score
import json

# The OpenAI API key is read from the OPENAI_API_KEY environment variable
# (it is never hard-coded in the notebook).
# Create a key here: https://platform.openai.com/account/api-keys
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Hand the key to the openai client (None when the variable is not set)
openai.api_key = OPENAI_API_KEY


  from .autonotebook import tqdm as notebook_tqdm


# The dataset :
#### Online shop items listed with their categories, name and description

We will mainly use the product name in this notebook to limit the number of tokens sent to the model.

In [3]:
# Load the shop items; the path is relative to the notebook's working directory
text_data = pd.read_csv('./text_data.csv')
# Preview the last rows to check which columns are available
text_data.tail()

Unnamed: 0,product_category_tree,product_name,description
1045,Baby Care,Oren Empower Extra Large Self Adhesive Sticker,Oren Empower Extra Large Self Adhesive Sticker...
1046,Baby Care,Wallmantra Large Vinyl Sticker Sticker,Wallmantra Large Vinyl Sticker Sticker (Pack o...
1047,Baby Care,Uberlyfe Extra Large Pigmented Polyvinyl Films...,Buy Uberlyfe Extra Large Pigmented Polyvinyl F...
1048,Baby Care,Wallmantra Medium Vinyl Sticker Sticker,Buy Wallmantra Medium Vinyl Sticker Sticker fo...
1049,Baby Care,Uberlyfe Large Vinyl Sticker,Buy Uberlyfe Large Vinyl Sticker for Rs.595 on...


**First goal is to create a string that we can pass to the model**

We will format it as **index** : **product name**

In [3]:
# Label the first five product names with their position: "<index>: <product name>"
formatted_product_names = [
    "{}: {}".format(position, name)
    for position, name in enumerate(text_data['product_name'].head(5))
]

# Join the labelled names into the single comma-separated string passed to the model
products_string = ', '.join(formatted_product_names)

In [30]:
# Display the list of "index: product name" entries
formatted_product_names

['0: Elegance Polyester Multicolor Abstract Eyelet Door Curtain',
 '1: Sathiyas Cotton Bath Towel',
 '2: Eurospa Cotton Terry Face Towel Set',
 '3: SANTOSH ROYAL FASHION Cotton Printed King sized Double Bedsheet',
 '4: Jaipur Print Cotton Floral King sized Double Bedsheet']

In [26]:
# Display the single comma-separated string that will be sent to the model
products_string

'0: Elegance Polyester Multicolor Abstract Eyelet Door Curtain, 1: Sathiyas Cotton Bath Towel, 2: Eurospa Cotton Terry Face Towel Set, 3: SANTOSH ROYAL FASHION Cotton Printed King sized Double Bedsheet, 4: Jaipur Print Cotton Floral King sized Double Bedsheet'

**One of the limitations of LLMs is the number of tokens they can take as input**

To get an idea of how many tokens our string contains, we can use the GPT2Tokenizer

In [31]:
# Number of characters in the prompt string (not the number of tokens)
len(products_string)

258

In [32]:
# Split the prompt string into tokens with the pretrained "gpt2-medium" tokenizer and count them.
# NOTE(review): presumably only an estimate — the chat models may not use the same
# tokenizer as GPT-2, so the count billed by the API can differ; confirm.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
tokens = tokenizer.tokenize(products_string)
token_count = len(tokens)
print(token_count)

75


# How does the GPT API work?
- create a ChatCompletion
- specify a model (gpt-3.5-turbo, gpt-4)
- temperature: higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
- messages: the context of the conversation. There are different roles:
    - system: sets the model's general role
    - user: that's you
    - assistant: that's the model; when building the context you can put words in its mouth to help it help you

In [8]:
    # Scripted conversation: the system message sets the task, the user messages give the
    # categories and the expected JSON answer format, and the assistant turns are written
    # by us to steer the model. The last user message carries the products to classify.
    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant tasked to classify text into 7 categories."},
        {"role": "user", "content": "here is a list of the categories: 1: Baby Care, 2: Beauty and Personal Care, 3: Computers, 4: Home Decor & Festive Needs, 5: Home Furnishing, 6: Kitchen & Dining, 7: Watches"},
        {"role": "assistant", "content": "How should I format my answer?"},
        {"role": "user", "content": 'Answer in a json formating style {"sentence index" : category number}'},
        {"role": "assistant", "content": "Okay give me sentences to categorize"},
        {"role": "user", "content": products_string},
    ]

    # A low temperature keeps the answer focused and close to deterministic
    response = openai.ChatCompletion.create(
        messages=chat_messages,
        temperature=0.2,
        model="gpt-3.5-turbo",
    )

    print(response)

{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "{\n  \"0\": 4,\n  \"1\": 1,\n  \"2\": 1,\n  \"3\": 4,\n  \"4\": 4\n}",
        "role": "assistant"
      }
    }
  ],
  "created": 1692697699,
  "id": "chatcmpl-7qIHjeA7i6Z6VAysA4TQE27iiU8vT",
  "model": "gpt-3.5-turbo-0613",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 37,
    "prompt_tokens": 196,
    "total_tokens": 233
  }
}


**To get the JSON we just need to access `message.content` of the first entry in `response.choices`**

In [19]:
# The model's reply text is the message content of the first choice (dict-style access)
response['choices'][0]['message']['content']

'{\n  "0": 4,\n  "1": 1,\n  "2": 1,\n  "3": 4,\n  "4": 4\n}'

In [18]:
# Same field through attribute access: keep the reply text in `content` and print it
content = response.choices[0].message.content
print(content)

'{\n  "0": 4,\n  "1": 1,\n  "2": 1,\n  "3": 4,\n  "4": 4\n}'

In [13]:
# Because we asked for a JSON-formatted answer, the reply text can be parsed directly
# into a dict. json.loads raises json.JSONDecodeError if the reply is not valid JSON.
result = json.loads(response.choices[0].message.content)
print(result)

{'0': 4, '1': 1, '2': 1, '3': 4, '4': 4}


# Pretty cool !
**Let's now do it for the full dataset**

In [116]:
# Accumulator for the classifications of all chunks, and the start row of the current chunk
result_json = dict()
i = 0


def merge_dicts(dict1, dict2):
    """Return a new dict containing the entries of dict1 followed by those of dict2.

    Neither argument is modified. When a key is present in both, the value
    from dict2 is kept.
    """
    merged_dict = dict(dict1)
    merged_dict.update(dict2)
    return merged_dict

# Classify the whole dataset chunk by chunk so that every request stays within the
# model's token limit, then save the combined {product index: category number} mapping.
CHUNK_SIZE = 100
n_rows = text_data.shape[0]

# The instructions are the same for every request, so build them once outside the loop
instruction_messages = [
    {"role": "system", "content": "You are a helpful assistant tasked to classify text into 7 categories."},
    {"role": "user", "content": "here is a list of the categories: 1: Baby Care, 2: Beauty and Personal Care, 3: Computers, 4: Home Decor & Festive Needs, 5: Home Furnishing, 6: Kitchen & Dining, 7: Watches"},
    {"role": "assistant", "content": "How should I format my answer?"},
    {"role": "user", "content": "Answer in a json formating style {\"sentence index\" : category number}"},
    {"role": "assistant", "content": "Okay give me sentences to categorize"},
]

for i in range(0, n_rows, CHUNK_SIZE):
    # The last chunk may hold fewer than CHUNK_SIZE rows
    end_index = min(i + CHUNK_SIZE, n_rows)
    current_chunk = text_data['product_name'].iloc[i:end_index]

    # Number each product with its position in the full dataset (not within the chunk)
    # so that the keys returned for different chunks never collide when merged
    formatted_product_names = [f"{idx}: {x}" for idx, x in enumerate(current_chunk, start=i)]
    products_string = ', '.join(formatted_product_names)

    response = openai.ChatCompletion.create(
        model="gpt-4",
        temperature=0.2,
        messages=instruction_messages + [{"role": "user", "content": products_string}],
    )

    # No `n` argument is passed, so a single completion is expected: read the first
    # choice explicitly instead of looping and keeping whichever choice comes last.
    # json.loads raises json.JSONDecodeError if the reply is not valid JSON.
    result = json.loads(response.choices[0].message.content)

    # JSON object keys are always strings. Report rows the model left out: later cells
    # compare predictions with the true categories row by row, so gaps must not go unnoticed.
    missing_indices = [idx for idx in range(i, end_index) if str(idx) not in result]
    if missing_indices:
        print(f"Warning: no category returned for rows {missing_indices}")

    # Merge this chunk's answers into the combined results
    result_json = merge_dicts(result_json, result)

# Save the final combined results.
# NOTE(review): the cell that reloads cached results reads './result_gpt3.5.json' and
# './result_gpt4.json'; presumably this file is renamed per model after each run — confirm.
with open('result.json', 'w') as f_out:
    json.dump(result_json, f_out)

### If you don't want to run it again (the API calls are not free), you can just load the saved results:

In [20]:
# Load the saved gpt-3.5 classifications from JSON.
# NOTE(review): the classification loop writes 'result.json'; these two file names are
# presumably renamed copies of that output, one per model — confirm.
with open('./result_gpt3.5.json', 'r') as f:
    result_gpt3 = json.load(f)


# Load the saved gpt-4 classifications from JSON
with open('./result_gpt4.json', 'r') as f:
    result_gpt4 = json.load(f)


In [21]:
# Turn each loaded mapping into a two-column DataFrame: one row per (key, value) pair
classification_columns = ['Product Index', 'Category Number']

df_classification_gpt4 = pd.DataFrame(
    list(result_gpt4.items()),
    columns=classification_columns,
)
df_classification_gpt3 = pd.DataFrame(
    list(result_gpt3.items()),
    columns=classification_columns,
)

print(df_classification_gpt4)

     Product Index  Category Number
0                0                5
1                1                5
2                2                5
3                3                5
4                4                5
...            ...              ...
1045          1045                4
1046          1046                4
1047          1047                4
1048          1048                4
1049          1049                4

[1050 rows x 2 columns]


# Moment of truth: how good is the result?

In [39]:
# Score each model's categories against the true categories with the adjusted Rand index.
# The comparison is positional — assumes row k of each classification frame describes
# row k of text_data; TODO confirm.
true_categories = text_data['product_category_tree']

rand_index = adjusted_rand_score(
    labels_true=true_categories,
    labels_pred=df_classification_gpt3['Category Number'],
)
print(f'Rand index for gpt3: {rand_index}')

rand_index = adjusted_rand_score(
    labels_true=true_categories,
    labels_pred=df_classification_gpt4['Category Number'],
)
print(f'Rand index for gpt4: {rand_index}')

Rand index for gpt3: 0.45872756532020414
Rand index for gpt4: 0.8027008491002728



## And compared to traditional Text Transformation Techniques

The traditional approach to handling text involves using algorithms to transform the text into vectors.

Numerous algorithms exist for this purpose, ranging from simple word counting methods to more intricate techniques. Commonly, these algorithms incorporate processes like stemming, lemmatization, and the removal of punctuation and stopwords.

For those interested, a small exploration can be found in the notebook titled `text_classification_p6`, where I experimented with `CountVectorizer`, `TfidfVectorizer`, `word2vec`, `USE` and `BERT`. This notebook is available in the same repository under the `image_text_classification` section.

While it might not be entirely equitable to draw direct comparisons, below are the Rand indices I obtained for each of the models:

- **TfidfVectorizer Rand index:** 0.49126859466672784 
- **Word2vec Rand index:** 0.1925855267844711  
- **BERT Rand index:** 0.4134534468309783  
- **USE Rand index:** 0.4660123193298835  



