## Import libraries

In [1]:
import sys, os

sys.path.append(os.path.join(os.getcwd(), '..'))
sys.path.append(os.path.join(os.getcwd(), '../src'))

In [2]:
import re
import time
import json
from tqdm.notebook import tqdm
import pandas as pd
from pandas import DataFrame
from threading import Lock

from langchain_groq import ChatGroq
from langchain_google_genai import GoogleGenerativeAI
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate

from prompt import inquiry_classifying_prompt, keyword_extracting_prompt
from utils import *
from dotenv import load_dotenv

In [3]:
load_dotenv()

True

## Extract

In this step, we will apply 2 methods to extract insightful data from customer's message:
- **Meaningful inquiries**: Use LLM to detect any important, insightful customer's inquiries about products.
- **Extracting keyword**: Use LLM to distil important keywords in messages

**LOAD DATA**

In [4]:
messages = load_json('../data/total_message.json')
customer_messages = [m for m in messages if m['from'] == 'customer']

In [5]:
N = 1000
START = 0

sample = [m['message'] for m in customer_messages[START : START + N]]

**LOAD LLM**

We will use ***Gemini-1.5-flash*** of Google, which is one of the state of the art LLMs (or even Multimodal model) in the present. Furthermore, this model is also provided a good API capacity for free tier.

Because of requirement of precision and static output, we also need to modify `temperature`, `top_p`, and `top_k` to ensure model work accurately.

In [6]:
class LLMCaller:
    """
    A class to manage the rate of requests to an LLM.
    
    This class implements a simple rate limiting mechanism to prevent exceeding the maximum number of requests per minute allowed by the LLM API.
    
    Attributes:
        max_request_per_minute (int): The maximum number of requests allowed per minute.
        _request_counter (int): The number of requests made in the current minute.
        _last_reset_time (float): The timestamp of the last time the request counter was reset.
        _state_lock (Lock): A lock to protect the request counter and last reset time from concurrent access.
    """
    _request_counter = 0
    _last_reset_time = 0.0
    _state_lock = Lock()

    def __init__(self, max_request_per_minute: int):
        self.max_request_per_minute = max_request_per_minute

    def _reset_counter(self):
        current_time = time.time()
        if self._last_reset_time == 0.0 or current_time - self._last_reset_time >= 60:
            self._request_counter = 0
            self._last_reset_time = current_time


    def _increment_counter(self, num_request):
        with self._state_lock:
            self._reset_counter()
            if self._request_counter + num_request > self.max_request_per_minute:
                time.sleep(max(0, self._last_reset_time + 60 - time.time()))
                self._reset_counter()
            self._request_counter += num_request

In [7]:
class GroqAICaller(LLMCaller):
    def __init__(self, llm_config: dict, prompt: ChatPromptTemplate):
        super().__init__(max_request_per_minute=30)

        llm = ChatGroq(**llm_config)
        self.chain = prompt | llm

    def invoke(self, input: dict):
        self._increment_counter(1)

        return self.chain.invoke(input).content


class GoogleAICaller(LLMCaller):
    def __init__(self, llm_config: dict, prompt: PromptTemplate):
        super().__init__(max_request_per_minute=15)

        llm = GoogleGenerativeAI(**llm_config)
        self.chain = prompt | llm


    def invoke(self, input: dict):
        self._increment_counter(1)

        result = self.chain.invoke(input)

        return result

In [8]:
def parse_output(output: str):
    """
    Parse the output of the LLM.
    
    The output of the LLM is expected to be in either '```python' or '```json' format.
    This function will parse the output and return the result as a dictionary.
    
    Args:
        output (str): The output of the LLM.
    
    Returns:
        dict: The parsed output of the LLM.
    
    Raises:
        Exception: If the output is not in the expected format.
    """
    start = output.index('[')
    end =  len(output) - output[::-1].index(']')

    error_comma = end - 2 if output[end - 1] == ',' else end - 3
    if output[error_comma] == ',':
        output = output[:error_comma] + output[error_comma + 1:]

    try:
        res = json.loads(output[start:end])
    except Exception:
        try:
            res = json.loads(output[start:end].lower())
        except Exception:
            raise Exception(f"Could not parse output. Expected output in either '```python' or '```json' format. Received: \n{output}")
    return res

In [9]:
provider = 'groq'
llm_config = load_yaml('../config.yaml')['llm'][provider]

### Important Inquiries

In [10]:
def parse_llm_output(output: str):
    """
    Parse the output of the LLM.
    
    The output of the LLM is expected to be in either '```python' or '```json' format.
    This function will parse the output and return the result as a dictionary.
    
    Args:
        output (str): The output of the LLM.
    
    Returns:
        dict: The parsed output of the LLM.
    
    Raises:
        Exception: If the output is not in the expected format.
    """
    start = output.index('[')
    end =  len(output) - output[::-1].index(']')

    error_comma = end - 2 if output[end - 1] == ',' else end - 3
    if output[error_comma] == ',':
        output = output[:error_comma] + output[error_comma + 1:]
        
    try:
        res = json.loads(output[start:end])
    except Exception:
        try:
            res = json.loads(output[start:end].lower())
        except Exception:
            raise Exception(f"Could not parse output. Received: \n{output}")
    return res

In [11]:
def prefilter_messages(patterns: List[str], messages: List[str]):
    synthetic_pattern = r'\b(' + '|'.join(patterns) + r')\b'
    result = []
    for m in messages:
        if re.search(synthetic_pattern, m.lower()):
            result.append(False)
        else:
            result.append(True)

    return result

In [12]:
# pre-filter common messages by keywords
common_keywords = ['giao', 'ship', 'địa chỉ', 'dia chi', 'giá', 'stk', 'số tài khoản', 
                    'so tai khoan', 'thanh toán', 'chuyển khoản', 'tiền mặt', 'menu', 'alo', 'hi',
                    'ok', 'shjp', 'quét']
result = prefilter_messages(common_keywords, sample)

redetect_message = [m for m, r in zip(sample, result) if r]
redetect_index = [i for i in range(len(result)) if result[i]]

In [13]:
# classify by LLM
mask = []
batch_size = 30
chain = GroqAICaller(llm_config, inquiry_classifying_prompt)
for i in tqdm(range(0, len(redetect_message), batch_size), desc='Detecting insightful inquiry'):
    end_idx = min(len(redetect_message), i + batch_size)
    try:
        response = chain.invoke({'input': str(redetect_message[i : end_idx])})
    except Exception:
        print(f'Error while generating response for batch {i} - {end_idx}')
        
        mask += ['error' for _ in range(i, end_idx)]
        continue

    try:
        parsed_response = parse_llm_output(response)
        mask += [list(r.items())[0][1] for r in parsed_response]
    except Exception as exc:
        print(f'Error while parsing LLM output for batch {i} - {end_idx}: {exc}')
        print(response)
        mask += ['error' for _ in range(i, end_idx)]

# synthesize mask
for i, j in enumerate(redetect_index):
    result[j] = mask[i]

Detecting insightful inquiry:   0%|          | 0/25 [00:00<?, ?it/s]

In [14]:
important_messages = [m for m, l in zip(customer_messages[START : START + N], result) if l >= 0.8]

### Extracting keywords

In [15]:
message_list = [m['message'] for m in important_messages]

In [16]:
# extracting by LLM
keywords = []
batch_size = 30
chain = GroqAICaller(llm_config, keyword_extracting_prompt)
for i in tqdm(range(0, len(message_list), batch_size), desc='Detecting insightful inquiry'):
    end_idx = min(len(message_list), i + batch_size)
    try:
        response = chain.invoke({'input': str(message_list[i : end_idx])})
    except Exception:
        print(f'Error while generating response for batch {i} - {end_idx}')
        
        keywords += ['error' for _ in range(i, end_idx)]
        continue

    try:
        parsed_response = parse_llm_output(response)
        keywords += parsed_response
    except Exception as exc:
        print(f'Error while parsing LLM output for batch {i} - {end_idx}: {exc}')
        print(response)
        keywords += ['error' for _ in range(i, end_idx)]

Detecting insightful inquiry:   0%|          | 0/4 [00:00<?, ?it/s]

In [17]:
keywords

[{'bồi bổ ăn cái nào ạ': {'user': [], 'purpose': ['bồi bổ']}},
 {'kh phải ng lớn tuổi': {'user': ['người lớn tuổi'], 'purpose': []}},
 {'trẻ chớ kh phải lớn thổi ý': {'user': ['trẻ'], 'purpose': []}},
 {'Mẹ Bầu ăn có tốt không?': {'user': ['mẹ bầu'], 'purpose': ['bồi bổ']}},
 {'Đặt Súp Bào Ngư thăm người Ốm!': {'user': ['người ốm'],
   'purpose': ['bồi dưỡng bệnh']}},
 {'Mẹ Bầu ăn có tốt không?': {'user': ['mẹ bầu'], 'purpose': ['bồi bổ']}},
 {'Đặt Súp Bào Ngư tẩm bổ cho Gia Đình / Người Thân / Đối Tác': {'user': ['gia đình',
    'người thân',
    'đối tác'],
   'purpose': ['tẩm bổ']}},
 {'Đặt Súp Bào Ngư thăm người Ốm!': {'user': ['người ốm'],
   'purpose': ['bồi dưỡng bệnh']}},
 {'món bạch yến càn long này người già lớn tuổi ăn hợp không em': {'user': ['người già',
    'người lớn tuổi'],
   'purpose': ['bồi bổ']}},
 {'bà 90 tuổi rồi có bị dai không': {'user': ['người già'],
   'purpose': ['bồi bổ']}},
 {'Đặt dùng cho bầu ăn': {'user': ['mẹ bầu'], 'purpose': ['bồi bổ']}},
 {'Đặt Súp B

In [31]:
tmp = important_messages.copy()

In [33]:
for mess, kw_item in zip(tmp, keywords):
    k, v = list(kw_item.items())[0]
    mess.update(v)

### Convert to dataframe

In [18]:
def create_dataframe(messages: List[dict]) -> DataFrame:
    df = pd.DataFrame(data=messages)

    # convert string to datetime
    df['inserted_at'] = df['inserted_at'].str.replace('T', ' ')
    df['inserted_at'] = df['inserted_at'].str.replace(r'\.\d{6}', '', regex=True)
    df['inserted_at'] = pd.to_datetime(df['inserted_at'], format='%Y-%m-%d %H:%M:%S')
    
    # transform timezone
    df['inserted_at'] = df['inserted_at'].dt.tz_localize('UTC')
    df['inserted_at'] = df['inserted_at'].dt.tz_convert('Asia/Bangkok')
    df['inserted_at'] = df['inserted_at'].dt.tz_localize(None)
    
    return df

In [35]:
df = create_dataframe(tmp)
df.head(5)

Unnamed: 0,message,inserted_at,from,user,purpose
0,bồi bổ ăn cái nào ạ,2024-08-21 16:03:31,customer,[],[bồi bổ]
1,kh phải ng lớn tuổi,2024-08-21 16:03:38,customer,[người lớn tuổi],[]
2,trẻ chớ kh phải lớn thổi ý,2024-08-21 16:06:58,customer,[trẻ],[]
3,Mẹ Bầu ăn có tốt không?,2024-09-02 14:43:27,customer,[mẹ bầu],[bồi bổ]
4,Đặt Súp Bào Ngư thăm người Ốm!,2024-08-26 18:53:24,customer,[người ốm],[bồi dưỡng bệnh]


In [36]:
df.to_excel('full_sample.xlsx', index=False)