## Import libraries

In [1]:
import sys, os

# Make the parent of the working directory and its `src` subfolder importable,
# so the local `prompt` and `utils` modules can be found.
sys.path.extend(
    os.path.join(os.getcwd(), relative_dir) for relative_dir in ('..', '../src')
)

In [2]:
import re
import time
import json
from tqdm.notebook import tqdm
import pandas as pd
from pandas import DataFrame
from threading import Lock
from typing import Dict

from langchain_groq import ChatGroq
from langchain_google_genai import GoogleGenerativeAI
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate

from prompt import inquiry_classifying_prompt, keyword_extracting_prompt
from utils import *
from dotenv import load_dotenv

In [3]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the LLM API keys come from — confirm.
load_dotenv()

True

## Analysing Pipeline

In this step, we apply two methods to extract insights from customer messages:

- **Meaningful inquiries**: Use an LLM to detect important, insightful customer inquiries about products.
- **Extracting keywords**: Use an LLM to distil the important keywords from each message.

We combine these two methods into a single pipeline that extracts valuable information from customer messages. The pipeline first scores each message to decide whether it is an insightful inquiry, and then extracts keywords only from the messages that pass. This approach lets us focus on the most relevant information and gain deeper insight into customer needs and preferences.


**LOAD DATA**

In [4]:
# Read every stored message, then keep only those whose sender is the customer.
messages = load_json('../backup_data/total_message.json')
customer_messages = list(
    filter(lambda record: record['from'] == 'customer', messages)
)

In [5]:
# Window of customer messages analysed in this run: N items starting at START.
START, N = 0, 1000

sample = customer_messages[slice(START, START + N)]

**LOAD LLM**

We use Google's ***Gemini-1.5-flash***, one of the current state-of-the-art LLMs (it is in fact a multimodal model). Its free tier also offers a generous API quota.

Because the task requires precise and consistent output, we also adjust `temperature`, `top_p`, and `top_k` so that the model answers as deterministically as possible.

In [6]:
# Project settings, read from the YAML file in the parent directory.
config = load_yaml('../config.yaml')
# LLM settings; indexed by provider name ('google' / 'groq') when a caller is built.
LLM_CONFIG = config['llm']

In [7]:
class LLMCaller:
    """
    Base class that throttles requests to an LLM API.

    Requests are counted inside a fixed 60-second window. When a new request
    would push the count above ``max_request_per_minute``, the caller sleeps
    until the window is over and a fresh window is started.

    Every instance owns its counter, window start and lock, so limiters for
    different providers never block each other. While one caller of an
    instance is sleeping it keeps the lock, which makes other callers of the
    *same* instance wait as well.

    Attributes:
        max_request_per_minute (int): The maximum number of requests allowed per window.
        _request_counter (int): The number of requests made in the current window.
        _last_reset_time (float): ``time.monotonic()`` value at which the current
            window started; ``0.0`` until the first request is made.
        _state_lock (Lock): Guards the counter and the window start.
    """
    _WINDOW_SECONDS = 60

    # Class-level fallbacks, only reached by a subclass that does not call
    # ``super().__init__``; normal instances get their own state below.
    _request_counter = 0
    _last_reset_time = 0.0
    _state_lock = Lock()

    def __init__(self, max_request_per_minute: int):
        self.max_request_per_minute = max_request_per_minute
        self._request_counter = 0
        self._last_reset_time = 0.0
        self._state_lock = Lock()

    def _reset_counter(self):
        """Start a new window if none was started yet or the current one has expired."""
        # A monotonic clock is used so that wall-clock adjustments cannot
        # stretch or shrink the window.
        current_time = time.monotonic()
        if (self._last_reset_time == 0.0
                or current_time - self._last_reset_time >= self._WINDOW_SECONDS):
            self._request_counter = 0
            self._last_reset_time = current_time

    def _increment_counter(self, num_request):
        """
        Register ``num_request`` upcoming requests, sleeping first if they do not fit.

        Args:
            num_request (int): Number of requests about to be sent.
        """
        with self._state_lock:
            self._reset_counter()
            if self._request_counter + num_request > self.max_request_per_minute:
                remaining = (self._last_reset_time + self._WINDOW_SECONDS
                             - time.monotonic())
                if remaining > 0:
                    time.sleep(remaining)
                # The wait covered the rest of the window, so open a new one
                # unconditionally instead of re-checking the clock.
                self._request_counter = 0
                self._last_reset_time = time.monotonic()
            self._request_counter += num_request

In [8]:
class GroqAICaller(LLMCaller):
    """Rate-limited caller for a ``ChatGroq`` model (30 requests per minute)."""

    def __init__(self, llm_config: dict, prompt: ChatPromptTemplate):
        super().__init__(max_request_per_minute=30)

        # Pipe the prompt into the chat model to build the chain.
        self.chain = prompt | ChatGroq(**llm_config)

    def invoke(self, input: dict):
        """Count one request against the limit, run the chain and return the reply's ``content``."""
        self._increment_counter(1)

        reply = self.chain.invoke(input)
        return reply.content


class GoogleAICaller(LLMCaller):
    """Rate-limited caller for a ``GoogleGenerativeAI`` model (15 requests per minute)."""

    def __init__(self, llm_config: dict, prompt: PromptTemplate):
        super().__init__(max_request_per_minute=15)

        # Pipe the prompt into the model to build the chain.
        self.chain = prompt | GoogleGenerativeAI(**llm_config)

    def invoke(self, input: dict):
        """Count one request against the limit, then return whatever the chain produces."""
        self._increment_counter(1)

        return self.chain.invoke(input)

In [9]:
def _parse_llm_output(output: str):
    """
    Parse the output of the LLM.
    
    The output of the LLM is expected to be in either '```python' or '```json' format.
    This function will parse the output and return the result as a dictionary.
    
    Args:
        output (str): The output of the LLM.
    
    Returns:
        dict: The parsed output of the LLM.
    
    Raises:
        Exception: If the output is not in the expected format.
    """
    start = output.index('[')
    end =  len(output) - output[::-1].index(']')

    error_comma = end - 2 if output[end - 1] == ',' else end - 3
    if output[error_comma] == ',':
        output = output[:error_comma] + output[error_comma + 1:]

    try:
        res = json.loads(output[start:end])
    except Exception:
        try:
            res = json.loads(output[start:end].lower())
        except Exception:
            raise Exception(f"Could not parse output. Received: \n{output}")
    return res

### Important Inquiries

In [10]:
def keyword_filter_message(patterns: List[str], messages: List[dict]) -> List[dict]:
    """
    Drop messages whose text contains any of the given keywords.

    The keywords are joined with ``|`` without escaping, so each one is
    interpreted as a regular-expression fragment, and the combined pattern is
    anchored on word boundaries. Matching is case-sensitive.

    Args:
        patterns (List[str]): Keywords (regex fragments) that mark a message for removal.
        messages (List[dict]): Messages to filter; the text is read from the ``'message'`` key.

    Returns:
        List[dict]: The messages that do not contain any of the given keywords.
        With no patterns, every message is kept.
    """
    # An empty alternation would match at every word boundary and discard
    # almost every message, so "no patterns" must short-circuit.
    if not patterns:
        return list(messages)

    synthetic_pattern = re.compile(r'\b(' + '|'.join(patterns) + r')\b')
    return [m for m in messages if not synthetic_pattern.search(m['message'])]

In [11]:
def classify_inquiry_pipeline(messages: List[dict],
                              min_score: float,
                              batch_size: int = 50,
                              provider: Literal['google', 'groq'] = 'groq') -> Tuple[List[dict], List[dict]]:
    """
    Score messages with the LLM and keep those that reach ``min_score``.

    Messages are sent in batches. Each parsed reply item is expected to be a
    dict whose first value is the score of the corresponding message. A batch
    is marked as failed as a whole when the request fails, the reply cannot be
    parsed, a score is not numeric, or the number of scores differs from the
    number of messages sent (otherwise scores would be attached to the wrong
    messages).

    Args:
        messages (List[dict]): Messages to classify; text is read from ``'message'``.
        min_score (float): Minimum score for a message to be kept.
        batch_size (int): Number of messages per LLM request.
        provider: Which LLM backend to use.

    Returns:
        Tuple[List[dict], List[dict]]: ``(classified_messages, error_messages)`` —
        messages scoring at least ``min_score``, and messages from failed batches.
    """
    # classify by LLM
    if provider == 'google':
        chain = GoogleAICaller(LLM_CONFIG[provider], inquiry_classifying_prompt)
    else:
        chain = GroqAICaller(LLM_CONFIG[provider], inquiry_classifying_prompt)

    # One entry per input message: a float score, or None for a failed batch.
    scores = []
    for i in tqdm(range(0, len(messages), batch_size), desc='Detecting insightful inquiry'):
        end_idx = min(len(messages), i + batch_size)
        batch = messages[i:end_idx]
        try:
            response = chain.invoke({'input': str([m['message'] for m in batch])})
        except Exception:
            print(f'Error while generating response for batch {i} - {end_idx}')

            scores += [None] * len(batch)
            continue

        try:
            parsed_response = _parse_llm_output(response)
            batch_scores = [float(next(iter(r.values()))) for r in parsed_response]
            if len(batch_scores) != len(batch):
                raise ValueError(
                    f'expected {len(batch)} scores, received {len(batch_scores)}'
                )
        except Exception:
            print(f'Error while parsing LLM output for batch {i} - {end_idx}')
            print(response)
            batch_scores = [None] * len(batch)
        scores += batch_scores

    # get output to return; failed entries are excluded before the numeric
    # comparison so they can never be compared against ``min_score``.
    classified_messages = [m for m, s in zip(messages, scores)
                           if s is not None and s >= min_score]
    error_messages = [m for m, s in zip(messages, scores) if s is None]

    return classified_messages, error_messages

### Extracting keywords

In [12]:
def handle_template_message(templates: Dict[str, Dict[str, str]], messages: List[dict]) -> Tuple[List[dict], List[dict]]:
    """
    Split messages into known template messages and everything else.

    A message is a template message when its lower-cased text is a key of
    ``templates``. Such a message is returned as a *copy* enriched with the
    template's fields, so the caller's message dicts are left untouched and the
    same input can be processed again with different templates.

    Args:
        templates (Dict[str, Dict[str, str]]): Maps a lower-cased message text to the
            fields (e.g. user and purpose information) to attach to that message.
        messages (List[dict]): Messages to process; text is read from ``'message'``.

    Returns:
        Tuple[List[dict], List[dict]]: A tuple containing two lists:
            - `template_message`: Enriched copies of the messages found in `templates`.
            - `other_message`: The messages that were not found in `templates`.
    """
    template_message = []
    other_message = []
    for m in messages:
        key = m['message'].lower()
        if key in templates:
            # Merge into a new dict instead of mutating the input in place.
            template_message.append({**m, **templates[key]})
        else:
            other_message.append(m)

    return template_message, other_message

In [13]:
def extract_keyword_pipeline(messages: List, 
                             batch_size: int = 50, 
                             provider: Literal['google', 'groq'] = 'groq') -> Tuple[List[dict], List[dict]]:
    """
    Extract keyword fields from messages with the LLM.

    Messages are sent in batches. Each parsed reply item is expected to be a
    dict whose first value is itself a dict of extracted fields. A batch is
    marked as failed as a whole when the request fails, the reply cannot be
    parsed, or the number of items differs from the number of messages sent
    (otherwise keywords would be attached to the wrong messages). An individual
    item that does not have the expected shape is reported as an error for that
    message only.

    Args:
        messages (List): Messages to process; text is read from ``'message'``.
        batch_size (int): Number of messages per LLM request.
        provider: Which LLM backend to use.

    Returns:
        Tuple[List[dict], List[dict]]: ``(extracted_messages, error_messages)`` —
        copies of messages enriched with their extracted fields (only when every
        field is non-empty), and copies of messages that could not be processed.
        Messages with an empty field appear in neither list.
    """
    if provider == 'google':
        chain = GoogleAICaller(LLM_CONFIG[provider], keyword_extracting_prompt)
    else:
        chain = GroqAICaller(LLM_CONFIG[provider], keyword_extracting_prompt)

    # One entry per input message: the parsed item, or None for a failed batch.
    keywords = []
    for i in tqdm(range(0, len(messages), batch_size), desc='Extracting keywords'):
        end_idx = min(len(messages), i + batch_size)
        batch = messages[i:end_idx]
        try:
            response = chain.invoke({'input': str([m['message'] for m in batch])})

        except Exception:
            print(f'Error while generating response for batch {i} - {end_idx}')
            keywords += [None] * len(batch)
            continue

        try:
            parsed_response = _parse_llm_output(response)
            if not isinstance(parsed_response, list) or len(parsed_response) != len(batch):
                raise ValueError(f'expected a list of {len(batch)} items')
            keywords += parsed_response

        except Exception as exc:
            print(f'Error while parsing LLM output for batch {i} - {end_idx}: {exc}')
            keywords += [None] * len(batch)

    extracted_messages = []
    error_messages = []
    for mess, kw_item in zip(messages, keywords):
        try:
            fields = next(iter(kw_item.values()))
            is_complete = all(len(x) > 0 for x in fields.values())
        except (AttributeError, StopIteration, TypeError):
            # Failed batch (None) or an item that is not a dict-of-dict:
            # report it instead of aborting after all requests were made.
            error_messages.append(mess.copy())
            continue

        if is_complete:
            enriched = mess.copy()
            enriched.update(fields)
            extracted_messages.append(enriched)

    return extracted_messages, error_messages

### Entire Pipeline

In [14]:
def analyse_message_pipeline(messages: List[dict],
                             filter_patterns: Optional[List[str]] = None, 
                             template_messages: Optional[dict] = None,
                             important_score: Optional[float] = 0.7,
                             batch_size: int = 50,
                             provider: Literal['google', 'groq'] = 'groq') -> Tuple[List[dict], List[dict]]:
    """
    Run the complete analysis: filter, template lookup, classification, keyword extraction.

    Every optional stage is skipped when its argument is not provided.

    Args:
        messages (List[dict]): Messages to analyse.
        filter_patterns (Optional[List[str]]): Keywords; messages containing one are
            removed. ``None`` or an empty list disables the filter.
        template_messages (Optional[dict]): Known template messages and the fields to
            attach to them. ``None`` disables template handling.
        important_score (Optional[float]): Minimum classification score for a message
            to reach keyword extraction. ``None`` skips classification, so every
            remaining message goes to keyword extraction.
        batch_size (int): Number of messages per LLM request.
        provider: Which LLM backend to use.

    Returns:
        Tuple[List[dict], List[dict]]: ``(processed_messages, error_messages)`` —
        template messages plus messages with extracted keywords, and the messages
        for which an LLM stage failed.
    """
    # Initialize results
    processed_messages = []
    error_messages = []

    # start processing
    # Truthiness check: an empty pattern list means there is nothing to filter.
    if filter_patterns:
        messages = keyword_filter_message(filter_patterns, messages)

    if template_messages is not None:
        template, messages = handle_template_message(template_messages, messages)
        processed_messages += template

    if important_score is not None:
        messages, error = classify_inquiry_pipeline(messages, important_score, batch_size, provider)
        error_messages += error

    messages, error = extract_keyword_pipeline(messages, batch_size, provider)
    error_messages += error
    processed_messages += messages

    return processed_messages, error_messages

## Analysing

In [15]:
# Run the whole pipeline on the sample with the Google provider, keeping both
# the returned tuple and its two unpacked parts.
result = analyse_message_pipeline(
    sample,
    filter_patterns=config['filter-message-keywords'],
    template_messages=config['template-message'],
    important_score=config['important-score'],
    provider='google',
)
processed_messages, error_messages = result

Detecting insightful inquiry:   0%|          | 0/16 [00:00<?, ?it/s]

Error while parsing LLM output for batch 0 - 50
```json
[{"message": "0363776710": 0.0}, {"message": "dạ ko cần ạ": 0.0}, {"message": "dạ 10h30 đến 11h nhé": 0.0}, {"message": "dạ vâng": 0.0}, {"message": "Bếp hnay hoạt động lại chưa ạ": 0.0}, {"message": "Loại tiểu bảo gồm có thành phần gì ạ": 0.3}, {"message": "Bình long tân phú nhận kịp 4h30 ko ạ": 0.0}, {"message": "196k pk ạ": 0.0}, {"message": "Kịp ko ạ": 0.0}, {"message": "Tại em làm ở cty 4h30 em ra ca về rồi ạ": 0.0}, {"message": "Tại 4h30 em mới ra ca": 0.0}, {"message": "32 bình long quận tân phú\n0377428748": 0.0}, {"message": "Ko ạ": 0.0}, {"message": "Ship sao tới đây 4h 30 dùm em nha": 0.0}, {"message": "Sớm quá em ko nhận đc": 0.0}, {"message": "Đơn hàng của em đi chưa ạ": 0.0}, {"message": "Ok shop": 0.0}, {"message": "Không ạ , em rất hài lòng về món ăn và cách đóng gói ạ 🥰": 0.7}, {"message": "Vâng ạ em nhận đủ": 0.0}, {"message": "Vâng ạ lần sau em sẽ ủng hộ tiếp món khác ạ": 0.7}, {"message": "bồi bổ ăn cái nào ạ":

KeyboardInterrupt: 