### Importing libraries

In [240]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from typing import List, Literal, Annotated
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.tools import tool
import getpass
import numpy as np
from datasets import load_dataset
from prompt_poet import Prompt
from tqdm import tqdm
import pandas as pd

### Groq Init

In [2]:
groq_api_key = getpass.getpass()

 ········


In [3]:
llama_70b_llm = ChatGroq(api_key=groq_api_key, temperature=0, model_name="llama3-groq-70b-8192-tool-use-preview")

### Classification Models

###### Traditional Models

- Data Collection and Labelling
- Feature Engineering
- Model Training & Validation
- Once the model is built, inference is efficient because of small model sizes.

Approaches

- TfIdf Vectorization + Classifier (Navie Bayes, Logistic Regression etc.)
- Word2Vec + Classifier Head
- BERT encoder + Classification

Drawbacks

- Time to label datasets
- Out of Vocabulary Words
- Generalization Error

##### LLMs

- Zero Shot or few shot learners
- No feature engineering required
- Can be finetuned
- Great out-of-sample performance

Drawbacks

- Huge parameter LLMs at the backend
- Proprietary models, data security and privacy issues
- Inference is costly, takes time, needs high coumputing resources
- Smaller LLMs with similar performance on the level of GPT are needed for finetuning.

#### Zero shot Example

In [4]:
raw_template = """
- name: system instructions
  role: system
  content: |
   You are an expert in classifying a given text into {{text_classfication_classes}}

- name: user query
  role: user
  content: |
   Extract the label of the following text: {{text}}
"""

In [5]:
template_data = {"text_classfication_classes": "Spam and Ham", "text": "Win $1000000 NOW!!!"}

In [6]:
prompt = Prompt(
    raw_template=raw_template,
    template_data=template_data
)


In [7]:
prompt.messages

[{'role': 'system',
  'content': 'You are an expert in classifying a given text into Spam and Ham'},
 {'role': 'user',
  'content': 'Extract the label of the following text: Win $1000000 NOW!!!'}]

In [8]:
response = llama_70b_llm.invoke(prompt.messages)

In [9]:
print(response.content)

The label of the given text is Spam.


#### Adding Structured Outputs

In [61]:
class Classification(BaseModel):
    classification_label: str= Field(enum=["spam","ham"])
    explanation: str = Field(description="Explain why you gave that label to this text. Keep your answers short and precise. I will tip you $20 for a good explanation. ")

In [62]:
llama_70b_llm_cls_head = llama_70b_llm.with_structured_output(Classification)

In [63]:
prompt.messages

[{'role': 'system',
  'content': 'You are an expert in classifying a given text into Spam and Ham'},
 {'role': 'user',
  'content': 'Extract the label of the following text: Change in TER Schemes of quant mutual fund'}]

In [64]:
result = llama_70b_llm_cls_head.invoke(prompt.messages)

In [65]:
result

Classification(classification_label='ham', explanation='The text is about a change in the TER schemes of a mutual fund, which is a legitimate topic of discussion.')

In [66]:
template_data = {"text_classfication_classes": "Spam and Ham", "text": "Change in TER Schemes of quant mutual fund"}

non_spam_prompt = Prompt(
    raw_template=raw_template,
    template_data=template_data
)
prompt = Prompt(
    raw_template=raw_template,
    template_data=template_data
)


In [67]:
result = llama_70b_llm_cls_head.invoke(prompt.messages)

In [68]:
result

Classification(classification_label='ham', explanation='The text is about a change in the TER schemes of a mutual fund, which is a legitimate topic of discussion.')

#### Few Shot Examples

- In Zero shot learning, we are only relying on LLMs pretraining
- In a few shot approach, we feed the LLM with few examples from the training set and their labels.

In [69]:
ds = load_dataset("ucirvine/sms_spam")['train'].train_test_split(test_size=0.01)

In [70]:
np.random.randint(0, len(ds['train']), 5)

array([3836, 5232, 5160,  570,  128])

In [71]:
ds['train'][np.random.randint(0, len(ds['train']), 5)]

{'sms': ['Maybe i could get book out tomo then return it immediately ..? Or something.\n',
  "Ok... Help me ask if she's working tmr a not?\n",
  'Not yet chikku..wat abt u?\n',
  ':-) :-)\n',
  'Dear i am not denying your words please\n'],
 'label': [0, 0, 0, 0, 0]}

Let's select 5 examples from each class to train few shot model.

In [273]:
def generate_samples(dataset, num_samples_per_class=5, label_column=None, text_column=None):
    if label_column is None or text_column is None:
        raise ValueError("Both label_column and text_column must be provided.")

    # Get unique labels and shuffle the dataset
    unique_labels = dataset.unique(label_column)
    dataset = dataset.shuffle(seed=42)
    label_names = dataset.features[label_column].names

    # Initialize a dictionary to store samples per class name
    samples_per_class = {label_name: [] for label_name in label_names}

    # Collect samples for each class
    for example in dataset:
        label = example[label_column]
        label_name = label_names[label]
        if len(samples_per_class[label_name]) < num_samples_per_class:
            samples_per_class[label_name].append(example)

    # Create a list of {label, text} pairs
    label_text_pairs = []
    for label_name, samples in samples_per_class.items():
        for sample in samples:
            label_text_pairs.append({"label": label_name, "text": sample[text_column]})

    # Yield (text, label) pairs
    for each_sample in label_text_pairs:
        yield (each_sample['text'].strip(), each_sample['label'].strip())


In [276]:
samples = generate_samples(ds['train'],text_column='sms',label_column='label')

In [277]:
samples = list(samples)

In [278]:
samples

[('Waiting for your call.', 'ham'),
 ('Ok lor... Sony ericsson salesman... I ask shuhui then she say quite gd 2 use so i considering...',
  'ham'),
 ('So do you have samus shoulders yet', 'ham'),
 ('I attended but nothing is there.', 'ham'),
 ("Thank you so much. When we skyped wit kz and sura, we didnt get the pleasure of your company. Hope you are good. We've given you ultimatum oh! We are countin down to aburo. Enjoy!",
  'ham'),
 ('Great News! Call FREEFONE 08006344447 to claim your guaranteed £1000 CASH or £2000 gift. Speak to a live operator NOW!',
  'spam'),
 ('FREE for 1st week! No1 Nokia tone 4 ur mob every week just txt NOKIA to 8007 Get txting and tell ur mates www.getzed.co.uk POBox 36504 W45WQ norm150p/tone 16+',
  'spam'),
 ('Hey I am really horny want to chat or see me naked text hot to 69698 text charged at 150pm to unsubscribe text stop 69698',
  'spam'),
 ('No 1 POLYPHONIC tone 4 ur mob every week! Just txt PT2 to 87575. 1st Tone FREE ! so get txtin now and tell ur fr

In [199]:
few_shot_template = """
- name: system instructions
  role: system
  content: |
   You are an expert in classifying a given text into {{ text_classfication_classes }}.
   These are some of the examples that you can use to do this task.
   {% for each_example, each_label in samples %} 
   Text: {{ each_example }} Label: {{ each_label}}
   {% endfor %}

- name: user query
  role: user
  content: |
   Extract the label of the following text: 
   {{text}} Label: 
"""

In [200]:
template_data = {"text_classfication_classes": "Spam or Ham",
                "text": "No Deposit Required. Play for FREE and Win for Real!..-ettzhr.",
                "samples":samples}

In [201]:
few_shot_prompt = Prompt(
    raw_template=few_shot_template,
    template_data=template_data
)

In [202]:
few_shot_prompt.messages

[{'role': 'system',
  'content': "You are an expert in classifying a given text into Spam or Ham.\nThese are some of the examples that you can use to do this task.\n \nText: Waiting for your call. Label: ham\n \nText: Ok lor... Sony ericsson salesman... I ask shuhui then she say quite gd 2 use so i considering... Label: ham\n \nText: So do you have samus shoulders yet Label: ham\n \nText: I attended but nothing is there. Label: ham\n \nText: Thank you so much. When we skyped wit kz and sura, we didnt get the pleasure of your company. Hope you are good. We've given you ultimatum oh! We are countin down to aburo. Enjoy! Label: ham\n \nText: Great News! Call FREEFONE 08006344447 to claim your guaranteed £1000 CASH or £2000 gift. Speak to a live operator NOW! Label: spam\n \nText: FREE for 1st week! No1 Nokia tone 4 ur mob every week just txt NOKIA to 8007 Get txting and tell ur mates www.getzed.co.uk POBox 36504 W45WQ norm150p/tone 16+ Label: spam\n \nText: Hey I am really horny want to c

In [203]:
print(few_shot_prompt.messages[0]['content'])

You are an expert in classifying a given text into Spam or Ham.
These are some of the examples that you can use to do this task.
 
Text: Waiting for your call. Label: ham
 
Text: Ok lor... Sony ericsson salesman... I ask shuhui then she say quite gd 2 use so i considering... Label: ham
 
Text: So do you have samus shoulders yet Label: ham
 
Text: I attended but nothing is there. Label: ham
 
Text: Thank you so much. When we skyped wit kz and sura, we didnt get the pleasure of your company. Hope you are good. We've given you ultimatum oh! We are countin down to aburo. Enjoy! Label: ham
 
Text: Great News! Call FREEFONE 08006344447 to claim your guaranteed £1000 CASH or £2000 gift. Speak to a live operator NOW! Label: spam
 
Text: FREE for 1st week! No1 Nokia tone 4 ur mob every week just txt NOKIA to 8007 Get txting and tell ur mates www.getzed.co.uk POBox 36504 W45WQ norm150p/tone 16+ Label: spam
 
Text: Hey I am really horny want to chat or see me naked text hot to 69698 text charged 

In [204]:
prompt.messages

[{'role': 'system',
  'content': 'You are an expert in classifying a given text into Spam and Ham'},
 {'role': 'user',
  'content': 'Extract the label of the following text: Change in TER Schemes of quant mutual fund'}]

In [205]:
llama_70b_llm_cls_head = llama_70b_llm.with_structured_output(Classification)

In [206]:
result = llama_70b_llm_cls_head.invoke(few_shot_prompt.messages)

In [207]:
result

Classification(classification_label='spam', explanation='The text contains promotional language and a call to action, which is typical of spam messages.')

##### Validation

In [264]:
zero_shot_results= []
for each_sample in tqdm(ds['test']):
    try:
        template_data = {"text_classfication_classes": "Spam and Ham", "text": each_sample['sms'].strip()}
    
        zero_shot_prompt = Prompt(
            raw_template=raw_template,
            template_data=template_data
        )
        validation = llama_70b_llm_cls_head.invoke(zero_shot_prompt.messages)
        each_sample['classification_label'] = validation.classification_label
        each_sample['explanation'] = validation.explanation
        zero_shot_results.append(each_sample)
    except:
        import time
        time.sleep(10)

100%|███████████████████████████████████████████████████████████████████████████████████| 56/56 [01:35<00:00,  1.70s/it]


In [265]:
zero_shot_results = pd.DataFrame(zero_shot_results)

In [266]:
zero_shot_results['classification_id'] = zero_shot_results['classification_label'].apply(lambda x: 1 if x == "spam" else 0)

In [268]:
# Acuuracy
(zero_shot_results['label'] == zero_shot_results['classification_id']).sum() / zero_shot_results.shape[0]

0.9818181818181818

In [271]:
zero_shot_results[zero_shot_results['label'] != zero_shot_results['classification_id']]

Unnamed: 0,sms,label,classification_label,explanation,classification_id
15,"I've got &lt;#&gt; , any way I could pick up?\n",0,spam,The text contains a suspicious link and is ask...,1


In [None]:
few_shot_results['classification_id'] = few_shot_results['classification_label'].apply(lambda x: 1 if x == "spam" else 0)

In [238]:
few_shot_results= []
for each_sample in ds['test']:
    template_data = {"text_classfication_classes": "Spam or Ham",
                "text": each_sample['sms'].strip(),
                "samples":samples}
    few_shot_prompt = Prompt(
    raw_template=few_shot_template,
    template_data=template_data
    )
    validation = llama_70b_llm_cls_head.invoke(few_shot_prompt.messages)
    each_sample['classification_label'] = validation.classification_label
    each_sample['explanation'] = validation.explanation
    few_shot_results.append(each_sample)

In [242]:
few_shot_results = pd.DataFrame(few_shot_results)

In [246]:
few_shot_results['classification_id'] = few_shot_results['classification_label'].apply(lambda x: 1 if x == "spam" else 0)

In [251]:
# Acuuracy
(few_shot_results['label'] == few_shot_results['classification_id']).sum() / few_shot_results.shape[0]

0.9821428571428571

In [259]:
few_shot_results[few_shot_results['label'] != few_shot_results['classification_id']]

Unnamed: 0,sms,label,classification_label,explanation,classification_id
45,HCL chennai requires FRESHERS for voice proces...,0,spam,The text contains a job advertisement with a s...,1


In [256]:
few_shot_results[few_shot_results['label'] != few_shot_results['classification_id']]['sms'].iloc[0]

'HCL chennai requires FRESHERS for voice process.Excellent english needed.Salary upto  &lt;#&gt; .Call Ms.Suman  &lt;#&gt;  for Telephonic interview -via Indyarocks.com\n'

In [258]:
few_shot_results[few_shot_results['label'] != few_shot_results['classification_id']]['explanation'].iloc[0]

'The text contains a job advertisement with a salary offer and a contact number, which is a common pattern in spam messages.'

We developed a model with more than 98% accuracy by leveraging the capabilities of LLMs to generalize (it's probably fine tuned on this dataset as well).