## Import libraries

In [1]:
import sys, os

# Make the parent of the working directory and its `src` folder importable
# (presumably where the local `prompt` and `utils` modules live — confirm).
sys.path.append(os.path.join(os.getcwd(), '..'))
sys.path.append(os.path.join(os.getcwd(), '../src'))

In [2]:
import json
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
from typing import Dict, List, Literal, Optional, Tuple

import pandas as pd
from dotenv import load_dotenv
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain_google_genai import GoogleGenerativeAI
from langchain_groq import ChatGroq
from pandas import DataFrame
from tqdm.notebook import tqdm

from prompt import classifying_inquiry_prompt, reclassifying_inquiry_prompt, extracting_user_purpose_prompt
from utils import *

In [3]:
# Load variables from a `.env` file into the process environment
# (presumably the LLM provider API keys — confirm).
load_dotenv()

True

## Analysing Pipeline

In this step, we will apply two methods to extract insightful data from customers' messages:

- **Meaningful inquiries**: Use an LLM to detect important, insightful customer inquiries about products.
- **Extracting keywords**: Use an LLM to distil important keywords from the messages.

We will combine these two methods into a complete pipeline to extract valuable information from customer messages. This pipeline will first classify messages as insightful inquiries, and then extract keywords from those classified messages. This approach allows us to focus on the most relevant information and gain deeper insights into customer needs and preferences.


### Load data

In [4]:
# Load the message dump, then keep only the entries whose 'from' field is 'customer'.
messages = load_json('../backup_data/total_message.json')
customer_messages = [m for m in messages if m['from'] == 'customer']

In [5]:
# Window of customer messages analysed in this run: N messages starting at index START.
START = 0
N = 1000

sample = customer_messages[START : START + N]

### Load LLM

We will use Google's ***Gemini-1.5-flash***, one of the current state-of-the-art LLMs (it is in fact a multimodal model). This model also comes with a generous API quota on the free tier.

Because this task requires precise and stable output, we also need to adjust `temperature`, `top_p`, and `top_k` to ensure the model works accurately.

In [6]:
# Project configuration; the 'llm' section is later indexed by provider name
# (`LLM_CONFIG[provider]`) when the LLM callers are built.
config = load_yaml('../config.yaml')
LLM_CONFIG = config['llm']

In [7]:
class LLMCaller:
    """
    A class to manage the rate of requests to an LLM.

    This class implements a simple client-side rate limiting mechanism: requests are counted inside
    a window of ``WINDOW_SECONDS`` seconds, and a request that would push the count above
    ``max_request_per_minute`` sleeps until the current window is over before it is counted.

    Attributes:
        max_request_per_minute (int): The maximum number of requests allowed per window.
        _request_counter (int): The number of requests counted in the current window.
        _last_reset_time (float): ``time.monotonic()`` reading taken when the current window was
            opened; ``0.0`` means that no window has been opened yet.
        _state_lock (Lock): A lock to protect the request counter and last reset time from
            concurrent access in ``_increment_counter``.
    """
    # Length of one rate-limit window, in seconds.
    WINDOW_SECONDS = 60

    # Class-level defaults, kept so the attributes still exist if a subclass skips ``__init__``.
    _request_counter = 0
    _last_reset_time = 0.0
    _state_lock = Lock()

    def __init__(self, max_request_per_minute: int):
        self.max_request_per_minute = max_request_per_minute
        # Every caller owns its own state and lock: the counters were already per-instance in
        # practice (the first write shadows the class attribute), so sharing one class-level lock
        # only made unrelated callers block each other while one of them was sleeping.
        self._request_counter = 0
        self._last_reset_time = 0.0
        self._state_lock = Lock()

    def _start_new_window(self, now: float) -> None:
        """Unconditionally open a new counting window that starts at ``now``."""
        self._request_counter = 0
        self._last_reset_time = now

    def _reset_counter(self) -> None:
        """Open a new window if none is open yet or the current one is at least one window old."""
        # Monotonic clock: interval arithmetic must not be affected by wall-clock adjustments.
        now = time.monotonic()
        if self._last_reset_time == 0.0 or now - self._last_reset_time >= self.WINDOW_SECONDS:
            self._start_new_window(now)

    def _wait_to_next_minute(self) -> None:
        """
        Sleep until the current window is over, then open a new one.

        This method does not acquire ``_state_lock`` itself; ``_increment_counter`` calls it while
        already holding the lock.
        """
        wait_time = max(0.0, self._last_reset_time + self.WINDOW_SECONDS - time.monotonic())
        time.sleep(wait_time)
        # Reset unconditionally: re-checking the elapsed time here could skip the reset when the
        # clock reading lands marginally short of the window, leaving a full counter in place.
        self._start_new_window(time.monotonic())

    def _increment_counter(self, num_request: int) -> None:
        """
        Account for ``num_request`` upcoming requests, sleeping first if the budget is exhausted.

        Args:
            num_request (int): Number of requests about to be sent.
        """
        with self._state_lock:
            self._reset_counter()
            if self._request_counter + num_request > self.max_request_per_minute:
                self._wait_to_next_minute()
            self._request_counter += num_request

In [8]:
class GroqAICaller(LLMCaller):
    """Rate-limited wrapper around a ``prompt | ChatGroq`` chain, budgeted at 30 requests per minute."""

    def __init__(self, llm_config: dict, prompt: ChatPromptTemplate):
        """
        Build the chain.

        Args:
            llm_config (dict): Keyword arguments forwarded to ``ChatGroq``; they override the
                ``max_retries=0`` default set here.
            prompt (ChatPromptTemplate): Prompt piped into the model.
        """
        super().__init__(max_request_per_minute=30)

        # NOTE(review): max_retries defaults to 0, presumably so that retrying is handled by
        # `invoke` below rather than by the client library — confirm.
        config = {'max_retries': 0}
        config.update(llm_config)

        llm = ChatGroq(**config)
        self.chain = prompt | llm

    def _extract_error_code(self, exception: Exception) -> Optional[int]:
        """Return ``exception.status_code``, or None when the attribute cannot be read."""
        try:
            error_code = exception.status_code
        except Exception:
            error_code = None

        return error_code

    def invoke(self, input: dict) -> str:
        """
        Run the chain on ``input`` and return the ``content`` of the response.

        A failed first attempt is retried exactly once. When the failure carries status code 429,
        the retry is delayed until the current rate-limit window is over. Both attempts are counted
        against the request budget.

        Args:
            input (dict): Variables for the prompt.

        Returns:
            str: The ``content`` attribute of the chain's response.

        Raises:
            Exception: Whatever the second attempt raises.
        """
        self._increment_counter(1)
        try:
            return self.chain.invoke(input).content
        except Exception as exc:
            if self._extract_error_code(exc) == 429:
                print('Reaching maximum resources, wait to next minutes!')
                self._wait_to_next_minute()

        # The retry is a real request too, so it has to go through the rate limiter as well.
        # It runs outside the `except` block so that a second failure is reported on its own.
        self._increment_counter(1)
        return self.chain.invoke(input).content


class GoogleAICaller(LLMCaller):
    """Rate-limited wrapper around a ``prompt | GoogleGenerativeAI`` chain, budgeted at 15 requests per minute."""

    def __init__(self, llm_config: dict, prompt: PromptTemplate):
        """
        Build the chain.

        Args:
            llm_config (dict): Keyword arguments forwarded to ``GoogleGenerativeAI``; they override
                the ``max_retries=0`` default set here.
            prompt (PromptTemplate): Prompt piped into the model.
        """
        super().__init__(max_request_per_minute=15)

        # NOTE(review): max_retries defaults to 0, presumably so that retrying is handled by
        # `invoke` below rather than by the client library — confirm.
        config = {'max_retries': 0}
        config.update(llm_config)

        llm = GoogleGenerativeAI(**config)
        self.chain = prompt | llm

    def _extract_error_code(self, exception: Exception) -> Optional[int]:
        """Return ``exception.code.value``, or None when it cannot be read."""
        try:
            error_code = exception.code.value
        except Exception:
            error_code = None

        return error_code

    def invoke(self, input: dict) -> str:
        """
        Run the chain on ``input`` and return its output.

        A failure with error code 429 is retried once after waiting for the current rate-limit
        window to end; any other failure is re-raised immediately. Both attempts are counted
        against the request budget.

        Args:
            input (dict): Variables for the prompt.

        Returns:
            str: The output of the chain.

        Raises:
            Exception: The original error when it is not a 429, or whatever the retry raises.
        """
        self._increment_counter(1)
        try:
            return self.chain.invoke(input)
        except Exception as exc:
            if self._extract_error_code(exc) != 429:
                raise
            print('Reaching maximum resources, wait to next minutes!')
            self._wait_to_next_minute()

        # Previously the original error was re-raised even after a successful retry, so the
        # retried result was always discarded. The retry also goes through the rate limiter.
        self._increment_counter(1)
        return self.chain.invoke(input)

In [9]:
def _parse_llm_output(output: str):
    """
    Parse the output of the LLM.
    
    The output of the LLM is expected to be in either '```python' or '```json' format.
    This function will parse the output and return the result as a dictionary.
    
    Args:
        output (str): The output of the LLM.
    
    Returns:
        dict: The parsed output of the LLM.
    
    Raises:
        Exception: If the output is not in the expected format.
    """
    start = output.index('[')
    end =  len(output) - output[::-1].index(']')

    error_comma = end - 2 if output[end - 1] == ',' else end - 3
    if output[error_comma] == ',':
        output = output[:error_comma] + output[error_comma + 1:]

    try:
        res = json.loads(output[start:end])
    except Exception:
        try:
            res = json.loads(output[start:end].lower())
        except Exception:
            raise Exception(f"Could not parse output. Received: \n{output}")
    return res

### Filter by pattern

In [10]:
def keyword_filter(patterns: List[str], messages: List[dict], get_keyword: Optional[bool] = True) -> List[dict]:
    """
    Filter messages based on the presence or absence of specified keywords.

    The keywords are joined into a single alternation anchored on word boundaries and searched in
    the lower-cased text of each message. Matching is case-insensitive, so a keyword written with
    capital letters still matches. The keywords are inserted into the regular expression as-is,
    i.e. they are interpreted as regex fragments.

    Args:
        patterns (List[str]): A list of keywords (regex fragments) to filter by. When it is empty,
            no message is considered to contain a keyword.
        messages (List[dict]): A list of messages to filter; each needs a string under 'message'.
        get_keyword (Optional[bool], optional): If True, returns messages containing the keywords.
            If False, returns messages not containing the keywords. Defaults to True.

    Returns:
        List[dict]: A list of messages that meet the filtering criteria.
    """
    # An empty alternation would compile to r'\b()\b', which matches at every word boundary and
    # would therefore flag almost every message as containing a keyword.
    matcher = None
    if patterns:
        matcher = re.compile(r'\b(' + '|'.join(patterns) + r')\b', re.IGNORECASE)

    def _has_keyword(text: str) -> bool:
        return matcher is not None and matcher.search(text.lower()) is not None

    return [m for m in messages if _has_keyword(m['message']) == get_keyword]

### Important Inquiries

In [11]:
def classify_inquiry_pipeline(
    messages: List[dict],
    min_score: float,
    batch_size: int = 50,
    provider: Literal["google", "groq"] = "groq",
) -> Tuple[List[dict], List[dict]]:
    """
    Score messages with the classifying prompt and keep those scoring at least ``min_score``.

    The messages are sent to the LLM in batches of ``batch_size`` through a thread pool. For each
    batch the parsed response is indexed as ``parsed[k][text]``: entry ``k`` must be a mapping that
    contains the text of the k-th message of the batch, and the value must be convertible to
    ``float``. If a request fails, or a response cannot be parsed or is incomplete, the whole batch
    is reported as failed.

    Args:
        messages (List[dict]): Messages to classify; each needs a 'message' entry.
        min_score (float): Minimum score for a message to be kept.
        batch_size (int): Number of messages per LLM request.
        provider (Literal["google", "groq"]): Which LLM caller to use.

    Returns:
        Tuple[List[dict], List[dict]]: ``(classified_messages, error_messages)`` — the messages
        whose score reached ``min_score``, and the messages belonging to failed batches.
    """
    # classify by LLM
    if provider == "google":
        chain = GoogleAICaller(LLM_CONFIG[provider], classifying_inquiry_prompt)
    else:
        chain = GroqAICaller(LLM_CONFIG[provider], classifying_inquiry_prompt)

    # One slot per message; None marks a message whose batch failed.
    scores = [None] * len(messages)
    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        batch_start_of = {}
        for start in range(0, len(messages), batch_size):
            # Build the payload eagerly and pass it as an argument: a lambda closing over the loop
            # variable is evaluated later in the worker thread and may see another batch's index.
            batch_texts = [m["message"] for m in messages[start : start + batch_size]]
            task = executor.submit(chain.invoke, {"input": str(batch_texts)})
            batch_start_of[task] = start

        for task in tqdm(as_completed(batch_start_of), total=len(batch_start_of), desc="Detecting insightful inquiry"):
            start = batch_start_of[task]
            end_idx = min(start + batch_size, len(messages))
            try:
                response = task.result()
            except Exception as exc:
                print(f"Error while generating response for batch {start} - {end_idx - 1}: {exc!r}")
                continue

            try:
                parsed_response = _parse_llm_output(response)
                # Collect the whole batch before storing anything, so that a failure half-way
                # through never leaves a partially filled batch behind. Distinct loop names keep
                # `start` intact for the error message.
                batch_scores = [
                    float(parsed_response[offset][messages[idx]["message"]])
                    for offset, idx in enumerate(range(start, end_idx))
                ]
            except Exception as exc:
                print(f"Error while parsing LLM output for batch {start} - {end_idx - 1}: {exc!r}")
                continue

            scores[start:end_idx] = batch_scores

    # get output to return
    classified_messages = [
        m for m, score in zip(messages, scores)
        if score is not None and score >= min_score
    ]
    error_messages = [m for m, score in zip(messages, scores) if score is None]

    return classified_messages, error_messages

In [12]:
def reclassify_inquiry_pipeline(
    messages: List[dict],
    min_score: float,
    batch_size: int = 50,
    provider: Literal["google", "groq"] = "groq",
) -> Tuple[List[dict], List[dict]]:
    """
    Score messages with the reclassifying prompt and keep those scoring at least ``min_score``.

    The messages are sent to the LLM in batches of ``batch_size`` through a thread pool. For each
    batch the parsed response is indexed as ``parsed[k][text]``: entry ``k`` must be a mapping that
    contains the text of the k-th message of the batch, and the value must be convertible to
    ``float``. If a request fails, or a response cannot be parsed or is incomplete, the whole batch
    is reported as failed.

    Args:
        messages (List[dict]): Messages to reclassify; each needs a 'message' entry.
        min_score (float): Minimum score for a message to be kept.
        batch_size (int): Number of messages per LLM request.
        provider (Literal["google", "groq"]): Which LLM caller to use.

    Returns:
        Tuple[List[dict], List[dict]]: ``(classified_messages, error_messages)`` — the messages
        whose score reached ``min_score``, and the messages belonging to failed batches.
    """
    # classify by LLM
    if provider == "google":
        chain = GoogleAICaller(LLM_CONFIG[provider], reclassifying_inquiry_prompt)
    else:
        chain = GroqAICaller(LLM_CONFIG[provider], reclassifying_inquiry_prompt)

    # One slot per message; None marks a message whose batch failed.
    scores = [None] * len(messages)
    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        batch_start_of = {}
        for start in range(0, len(messages), batch_size):
            # Build the payload eagerly and pass it as an argument: a lambda closing over the loop
            # variable is evaluated later in the worker thread and may see another batch's index.
            batch_texts = [m["message"] for m in messages[start : start + batch_size]]
            task = executor.submit(chain.invoke, {"input": str(batch_texts)})
            batch_start_of[task] = start

        for task in tqdm(as_completed(batch_start_of), total=len(batch_start_of), desc="Detecting insightful inquiry"):
            start = batch_start_of[task]
            end_idx = min(start + batch_size, len(messages))
            try:
                response = task.result()
            except Exception as exc:
                print(f"Error while generating response for batch {start} - {end_idx - 1}: {exc!r}")
                continue

            try:
                parsed_response = _parse_llm_output(response)
                # Collect the whole batch before storing anything, so that a failure half-way
                # through never leaves a partially filled batch behind. Distinct loop names keep
                # `start` intact for the error message.
                batch_scores = [
                    float(parsed_response[offset][messages[idx]["message"]])
                    for offset, idx in enumerate(range(start, end_idx))
                ]
            except Exception as exc:
                print(f"Error while parsing LLM output for batch {start} - {end_idx - 1}: {exc!r}")
                continue

            scores[start:end_idx] = batch_scores

    # get output to return
    classified_messages = [
        m for m, score in zip(messages, scores)
        if score is not None and score >= min_score
    ]
    error_messages = [m for m, score in zip(messages, scores) if score is None]

    return classified_messages, error_messages

### Extracting keywords

In [13]:
def handle_template_message(templates: Dict[str, Dict[str, str]], messages: List[dict]) -> Tuple[List[dict], List[dict]]:
    """
    Split messages into known template messages and everything else.

    The lower-cased text of each message is looked up in ``templates`` (so the keys of
    ``templates`` have to be lower-case to ever match). A message that is found is returned as a
    copy enriched with the entries of its template; the input dictionaries are left untouched.
    Messages that are not found are returned unchanged.

    Args:
        templates (Dict[str, Dict[str, str]]): A dictionary of template messages, where the key is
            the lower-cased message string and the value holds the entries to merge into the message.
        messages (List[dict]): A list of messages to be processed; each needs a 'message' entry.

    Returns:
        Tuple[List[dict], List[dict]]: A tuple containing two lists:
            - `template_message`: Enriched copies of the messages found in `templates`.
            - `other_message`: The messages that were not found in `templates`.
    """
    template_message = []
    other_message = []
    for m in messages:
        key = m['message'].lower()
        if key in templates:
            # Merge into a copy: updating `m` in place would silently modify the caller's data,
            # which makes re-running the analysis on the same list depend on earlier runs.
            template_message.append({**m, **templates[key]})
        else:
            other_message.append(m)

    return template_message, other_message

In [14]:
def extract_user_purpose_pipeline(messages: List, 
                             batch_size: int = 50, 
                             provider: Literal['google', 'groq'] = 'groq') -> Tuple[List[dict], List[dict]]:
    """
    Enrich messages with the fields extracted by the user/purpose prompt.

    The messages are sent to the LLM in batches of ``batch_size`` through a thread pool. For each
    batch the parsed response is indexed as ``parsed[k][text]``: entry ``k`` must be a mapping that
    contains the text of the k-th message of the batch, and the value must be a dictionary, which
    is merged into a copy of the message. If a request fails, or a response cannot be parsed or is
    incomplete, the whole batch is reported as failed.

    Args:
        messages (List): Messages to process; each needs a 'message' entry.
        batch_size (int): Number of messages per LLM request.
        provider (Literal['google', 'groq']): Which LLM caller to use.

    Returns:
        Tuple[List[dict], List[dict]]: ``(extracted_messages, error_messages)`` — enriched copies
        of the messages that were processed successfully, and the messages belonging to failed
        batches (unmodified, so they can be retried).
    """
    if provider == 'google':
        chain = GoogleAICaller(LLM_CONFIG[provider], extracting_user_purpose_prompt)
    else:
        chain = GroqAICaller(LLM_CONFIG[provider], extracting_user_purpose_prompt)

    # One slot per message; None marks a message whose batch failed.
    user_and_purpose = [None] * len(messages)
    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        batch_start_of = {}
        for start in range(0, len(messages), batch_size):
            # Build the payload eagerly and pass it as an argument: a lambda closing over the loop
            # variable is evaluated later in the worker thread and may see another batch's index.
            batch_texts = [m["message"] for m in messages[start : start + batch_size]]
            task = executor.submit(chain.invoke, {"input": str(batch_texts)})
            batch_start_of[task] = start

        for task in tqdm(as_completed(batch_start_of), total=len(batch_start_of), desc='Extracting keywords'):
            start = batch_start_of[task]
            end_idx = min(len(messages), start + batch_size)
            try:
                response = task.result()
            except Exception as exc:
                print(f'Error while generating response for batch {start} - {end_idx - 1}: {exc!r}')
                continue

            try:
                parsed_response = _parse_llm_output(response)
                # Collect the whole batch before storing anything, so that a failure half-way
                # through never leaves a partially filled batch behind. Distinct loop names keep
                # `start` intact for the error message.
                batch_fields = []
                for offset, idx in enumerate(range(start, end_idx)):
                    fields = parsed_response[offset][messages[idx]['message']]
                    if not isinstance(fields, dict):
                        raise TypeError(f'expected a dict for message {idx}, got {type(fields).__name__}')
                    batch_fields.append(fields.copy())
            except Exception as exc:
                print(f'Error while parsing LLM output for batch {start} - {end_idx - 1}: {exc!r}')
                continue

            user_and_purpose[start:end_idx] = batch_fields

    extracted_messages = []
    error_messages = []
    for mess, fields in zip(messages, user_and_purpose):
        if fields is None:
            # Previously `error_messages` was never filled and failed messages were returned among
            # the extracted ones without any extracted field.
            error_messages.append(mess)
        else:
            extracted_messages.append({**mess, **fields})

    return extracted_messages, error_messages

### Entire Pipeline

In [15]:
def analyse_message_pipeline(messages: List[dict],
                             remove_keywords: Optional[List[str]] = None,
                             filter_keywords: Optional[List[str]] = None,
                             template: Optional[dict] = None,
                             important_score: Optional[float] = 0.7,
                             batch_size: int = 50,
                             provider: Literal['google', 'groq'] = 'groq',
                             reclassify_provider: Optional[Literal['google', 'groq']] = None
                             ) -> Tuple[Optional[List[dict]], List[dict], List[dict]]:
    """
    Run the complete analysis: keyword filtering, template handling, classification,
    reclassification and user/purpose extraction.

    Args:
        messages (List[dict]): Messages to analyse; each needs a 'message' entry.
        remove_keywords (Optional[List[str]]): Messages containing any of these keywords are
            dropped. Skipped when empty or None.
        filter_keywords (Optional[List[str]]): Only messages containing at least one of these
            keywords are kept. Skipped when empty or None.
        template (Optional[dict]): Template messages passed to ``handle_template_message``; messages
            matching a template bypass the LLM steps. Skipped when None.
        important_score (Optional[float]): Minimum score used by both classification steps.
        batch_size (int): Number of messages per LLM request.
        provider (Literal['google', 'groq']): LLM provider used by the pipeline steps.
        reclassify_provider (Optional[Literal['google', 'groq']]): Provider for the
            reclassification step only. Defaults to ``provider``.

    Returns:
        Tuple[Optional[List[dict]], List[dict], List[dict]]:
        ``(template_messages, processed_messages, error_messages)``. ``template_messages`` is None
        when no ``template`` was given; ``error_messages`` accumulates the failures reported by
        the three LLM steps.
    """
    # Initialize results
    processed_messages = []
    error_messages = []

    # start processing
    if remove_keywords:
        messages = keyword_filter(remove_keywords, messages, get_keyword=False)

    if filter_keywords:
        messages = keyword_filter(filter_keywords, messages, get_keyword=True)

    template_messages = None
    if template is not None:
        template_messages, messages = handle_template_message(template, messages)

    messages, error = classify_inquiry_pipeline(messages, important_score, batch_size, provider)
    error_messages += error

    # The provider used to be omitted here, so this step always fell back to 'groq' regardless of
    # the `provider` argument. Pass `reclassify_provider='groq'` to get the old behaviour back.
    messages, error = reclassify_inquiry_pipeline(
        messages, important_score, batch_size, reclassify_provider or provider
    )
    error_messages += error

    messages, error = extract_user_purpose_pipeline(messages, batch_size, provider)
    error_messages += error
    processed_messages += messages

    return template_messages, processed_messages, error_messages

## Analysing

In [16]:
# Keywords a message must contain to be analysed: product keywords plus important-message keywords.
important_keywords = config['product-keywords'] + config['important-message-keywords']

# Run the whole pipeline on the sample with the Google provider; every threshold and keyword list
# comes from the configuration file.
template_messages, processed_messages, error_messages = analyse_message_pipeline(
    sample,
    remove_keywords=config['unimportant-message-keywords'],
    filter_keywords=important_keywords,
    template=config['template-message'],
    important_score=config['important-score'],
    provider='google'
)

I0000 00:00:1726857239.865356 1157140 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers


Detecting insightful inquiry:   0%|          | 0/3 [00:00<?, ?it/s]

I0000 00:00:1726857240.515410 1157249 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers
I0000 00:00:1726857240.524004 1157248 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers


Detecting insightful inquiry:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting keywords:   0%|          | 0/1 [00:00<?, ?it/s]