In [1]:
import pandas as pd

In [2]:
from datasets import load_dataset

dataset = load_dataset('OpenAssistant/oasst2')

In [3]:
df = dataset['train'].to_pandas()

In [4]:
type(df["detoxify"].iloc[0])

dict

In [5]:
prompter_df = df[df['role'] == 'prompter']

In [6]:
from typing import Any, Dict
from langchain_ollama import ChatOllama


llm = ChatOllama(
    model="llama3.1",
    temperature=0,
)

In [10]:
llm.invoke("How are you?")

KeyboardInterrupt: 

In [13]:
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", temperature=0)

In [16]:
from langchain_groq import ChatGroq

llm = ChatGroq(
    model="llama-3.2-90b-text-preview",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    api_key="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"
)

In [17]:
from langchain_core.output_parsers import StrOutputParser
from pydantic import BaseModel
from langchain_core.pydantic_v1 import Field
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser

class ClassifyUserIntent(BaseModel):
    """An enum value to classify user intent."""
    intent: str = Field(
        description="The classified intent of the user query, must be one of: 'Summarization', 'Translation', 'Paraphrasing', 'Role-play', 'Miscellaneous'."
    )

system = """
You are an intent classification system. The correctness of the classification is crucial.

We provide you with the intents and their descriptions:
- Summarization: When the user asks for a summary of a document or text.
- Translation: When the user asks to translate text into another language.
- Paraphrasing: When the user asks to rephrase or reword a sentence or text.
- Role-play: When the user asks to simulate a conversation or scenario.
- Miscellaneous: For queries that do not fall under any of the above categories.

You are given a user query and you have to classify it into one of these intent categories. Only respond with the intent class. If the query does not match any of the descriptions, output 'Miscellaneous'.
You are not allowed to add a class on your own , JUST USE THE GIVEN INTENTS !

Now take a deep breath and classify the following user query.
"""

classify_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "User question: {question}"),
    ]
)

structured_classifier_llm = llm.with_structured_output(ClassifyUserIntent)

classifier_chain = classify_prompt |structured_classifier_llm




In [62]:
async for chunk in classifier_chain.astream("parrot"):
    print(chunk, end="", flush=True)

intent='Miscellaneous'

In [18]:
from time import time

start = time()

classifier_chain.invoke({
    "question" : "Can you summarize this article for me? "
})

#print(f"Time taken: {time() - start}")

ClassifyUserIntent(intent='Summarization')

In [22]:
from time import time

start = time()

classifier_chain.invoke({
    "question" : "¿Puedes parafrasear este texto para mí? "
})

#print(f"Time taken: {time() - start}")

ClassifyUserIntent(intent='Paraphrasing')

In [31]:
import pandas as pd
import time 

def classify_intent(text):

  result = classifier_chain.invoke({"question": text})
  
  return result.intent

df_head = prompter_df.head()

start= time.time()

df_head['intent'] = df_head['text'].apply(classify_intent)

print(time.time()-start )
print(df_head[['text', 'intent']].head())


38.41150212287903
                                                text         intent
0  Dame los pasos de las cosas que debería de apr...  Miscellaneous
3  Vale, quiero asumir yo sólo todas las tareas. ...  Miscellaneous
5  ¿Cuánto tiempo me va a llevar cada uno de los ...    Translation
7  Para un principiante sin experiencia, ¿Cual es...  Miscellaneous
9  Algo mas específico, ¿devería empezar con Unit...  Miscellaneous


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_head['intent'] = df_head['text'].apply(classify_intent)


In [32]:
batch_results = classifier_chain.batch(df_head['text'].tolist())
print(batch_results) 
print(len(batch_results))
print(len(df_head))

[ClassifyUserIntent(intent='Miscellaneous'), ClassifyUserIntent(intent='Miscellaneous'), ClassifyUserIntent(intent='Translation'), ClassifyUserIntent(intent='Miscellaneous'), ClassifyUserIntent(intent='Miscellaneous')]
5
5


In [38]:
edge_cases = [
    "Can you summarize the plot without revealing the ending?",  # Ambiguous: Paraphrasing or Summarization?
    "Translaye this sentense into englisch, por favor!",        # Multiple misspellings and a mix of languages
    "ThIs Is JuSt A sTrInG oF RaNdom ChArAcTeRs!!! 9876543210",  # Extreme noise with mixed casing
    "Summarize this: Once upon a time in a faraway land, there lived a king and a queen who wanted to have a child. After many years of waiting, they finally had a daughter who was destined for greatness...",  # Long input with storytelling
    "b",                                                       # Very short input (one character)
    " ",                                                      # Input with only whitespace
    "How do I say 'thank you' in French? Also, can you explain why it's important?",  # Mixed intent: Translation and additional request
    "Can you help with the following: \n1) Task a \n2) Task b \n3) Task c? It's pretty straightforward, but I need a hand!",  # Noisy input with formatting
    "Give me a summary of the best way to learn programming, but don't just list resources; explain why each is useful.",  # Complex request with multiple aspects
    "I have a great idea, but I'm not sure how to put it into words! Like, can you suggest some synonyms for 'wonderful' or something?",  # Ambiguous: Seeking synonyms but could be paraphrasing
]

# Stress test on edge cases
for i, case in enumerate(edge_cases):
    try:
        result = classifier_chain.invoke({"question": case})
        print(f"Edge Case {i+1}: '{case}' -> Classified as: {result.intent}")
    except Exception as e:
        print(f"Error processing edge case {i+1}: {e}")


Edge Case 1: 'Can you summarize the plot without revealing the ending?' -> Classified as: Summarization
Edge Case 2: 'Translaye this sentense into englisch, por favor!' -> Classified as: Translation
Edge Case 3: 'ThIs Is JuSt A sTrInG oF RaNdom ChArAcTeRs!!! 9876543210' -> Classified as: Miscellaneous
Edge Case 4: 'Summarize this: Once upon a time in a faraway land, there lived a king and a queen who wanted to have a child. After many years of waiting, they finally had a daughter who was destined for greatness...' -> Classified as: Summarization
Edge Case 5: 'b' -> Classified as: Miscellaneous
Edge Case 6: ' ' -> Classified as: Miscellaneous
Edge Case 7: 'How do I say 'thank you' in French? Also, can you explain why it's important?' -> Classified as: Translation
Edge Case 8: 'Can you help with the following: 
1) Task a 
2) Task b 
3) Task c? It's pretty straightforward, but I need a hand!' -> Classified as: Miscellaneous
Edge Case 9: 'Give me a summary of the best way to learn programm

In [8]:
import pandas as pd
import time

BATCH_SIZE = 10

async def classify_intent_batch(prompter_df, batch_size):
    start = time.time()

    total_instances = len(prompter_df)
    for i in range(0, total_instances, batch_size):
        batch_df = prompter_df.iloc[i:i + batch_size]

        try:
            batch_results = await classifier_chain.abatch(batch_df['text'].tolist())
        except Exception as e:
            print(f"Error during batch classification: {e}")
            batch_results = ['Miscellaneous'] * len(batch_df)

        if isinstance(batch_results[0], str):
            intents = batch_results
        else:
            intents = [result.intent if result is not None else 'Miscellaneous' for result in batch_results]

        prompter_df.loc[batch_df.index, 'intent'] = intents

        for j, intent in enumerate(intents):
            print(f"Processed: {batch_df['text'].iloc[j]} -> Classified as: {intent}")

        print(f"Processed {i + len(batch_df)} / {total_instances} instances")

    print(f"Total time taken: {time.time() - start} seconds")
    return prompter_df



In [9]:
prompter_df_en_fr = prompter_df[prompter_df['lang'].isin(['en', 'fr'])]
prompter_df_en_fr_head = prompter_df_en_fr[41:81]

In [10]:
prompter_df_en_fr_head.loc[:, 'intent'] = None  

prompter_df_en_fr_head = await classify_intent_batch(prompter_df_en_fr_head, BATCH_SIZE)

print(prompter_df_en_fr_head[['text', 'intent']].head())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prompter_df_en_fr_head.loc[:, 'intent'] = None  # Safely initialize a new column


Error during batch classification: 'Summarization'
Processed: I tried it and it didn't work. The problem is that when I do the first two steps, I have a white cross and the other sides are not done which makes it impossible to solve it just using your instructions. 
The proper way to solve a rubiks cube would be to start with the white side, then fill in the second layer, then do a yellow cross on the top that lines up with the middle of the colors on the sides, then to fill in the proper corners. -> Classified as: Miscellaneous
Processed: What are the controversies surrounding the Dayan company? -> Classified as: Miscellaneous
Processed: Sure, the name in Chinese characters is 大雁. -> Classified as: Miscellaneous
Processed: Mon amie est considérée comme "harceleuse" dans sa communication avec une soignante. Comment puis-je réagir pour l'aider ?
D'où peut venir un tel comportement, à qui puis-je en parler ? -> Classified as: Miscellaneous
Processed: Can you help me understand how I can 

CancelledError: 

In [42]:
prompter_df_500 = prompter_df[:500]

In [44]:
prompter_df_500.loc[:, 'intent'] = None 

prompter_df_500 = classify_intent_batch(prompter_df_500, BATCH_SIZE)

print(prompter_df_500[['text', 'intent']].head())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prompter_df_500.loc[:, 'intent'] = None  # Safely initialize a new column


Processed: Dame los pasos de las cosas que debería de aprender para ser un desarrollador de videojuegos. -> Classified as: Miscellaneous
Processed: Vale, quiero asumir yo sólo todas las tareas. Dame una lista ordenada de las cosas que debo ir haciendo para aprender -> Classified as: Miscellaneous
Processed: ¿Cuánto tiempo me va a llevar cada uno de los pasos? -> Classified as: Summarization
Processed: Para un principiante sin experiencia, ¿Cual es el lenguaje y motor de juegos mas recomendable? -> Classified as: Miscellaneous
Processed: Algo mas específico, ¿devería empezar con Unity?, ¿devería apuntarme a clases de programación o a algún bootcamp o sería mejor aprender por mi cuenta usando internet? me gustaría usar texturas avanzadas para crear juegos en 3 dimensiones. -> Classified as: Miscellaneous
Processed: 能介绍一下什么是特摄吗？ -> Classified as: Miscellaneous
Processed: 你的介绍不错，但是我发现你的回答里有一个意义不明的数字，你能自己把它找出来并解释一下吗？ -> Classified as: Miscellaneous
Processed: 说几个最有名的日本特摄作品，并简要介绍一下其剧情。 -> Cl