In [None]:
%pip install emoji

In [None]:
import emoji
import re

# Example chat log.
# NOTE: this is a regular (non-raw) string literal, so every \uXXXX escape
# below is stored as one UTF-16 surrogate code point, not as the
# six-character text backslash-u-XXXX.
chat_log = "Hello there \ud83d\ude04! How are you \ud83d\ude0a?"

def decode_emojis(text):
    r"""Replace UTF-16 surrogate-pair emoji encodings in ``text`` with real emoji.

    Two encodings are handled:

    * literal escape text, i.e. the twelve characters ``\ud83d\ude04`` as they
      appear in escaped dumps;
    * real surrogate code points, as produced by a Python literal such as
      ``"\ud83d\ude04"``.

    Args:
        text: The string to clean up.

    Returns:
        ``text`` with every high/low surrogate pair replaced by the single
        character it encodes. Everything else is returned unchanged, including
        ``\uXXXX`` escape text that does not form a surrogate pair.
    """
    # A high surrogate (D800-DBFF) immediately followed by a low one (DC00-DFFF).
    surrogate_pair_escape = re.compile(
        r'\\u([dD][89abAB][\da-fA-F]{2})\\u([dD][c-fC-F][\da-fA-F]{2})'
    )

    def decode_match(match):
        # Combine the two 16-bit halves into the code point they encode.
        high, low = (int(digits, 16) for digits in match.groups())
        return chr(0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00))

    text = surrogate_pair_escape.sub(decode_match, text)

    # Round-tripping through UTF-16 fuses real surrogate code points into one
    # character. 'surrogatepass' on both legs lets unpaired surrogates through
    # instead of raising; the BOM-less codec avoids touching a leading U+FEFF.
    return text.encode('utf-16-le', 'surrogatepass').decode('utf-16-le', 'surrogatepass')

def demojize_chatlog(chat_log):
    """Return ``chat_log`` with its emojis replaced by text representations.

    The input is first passed through ``decode_emojis``; the result is then
    handed to ``emoji.demojize``.
    """
    return emoji.demojize(decode_emojis(chat_log))

In [None]:
import json
from datetime import datetime

def _is_image_placeholder(text):
    """Return True when ``text`` looks like an image attachment/omission notice."""
    return ('attached' in text and '.jpg' in text) or "image omitted" in text


def format_chat_message(chat_message):
    """Parse one line of an exported chat log into a message dict.

    A line shaped like ``[<timestamp>] <sender>: <message>`` (the bracket must
    open the line and start with a digit) is treated as a new message. Any
    other line is treated as the continuation of a multi-line message, even
    when it happens to contain ``]``.

    Args:
        chat_message: One raw line of the export.

    Returns:
        For a new message: ``{"timestamp", "role", "message"}`` where
        ``timestamp`` is an ISO 8601 string and ``role`` is ``'user'`` (sender
        is a phone number or starts with "cust") or ``'agent'``.
        For a continuation line: ``{"role": None, "message": ...}``.
        Image notices are replaced by ``"<|image|>"``; other text is passed
        through ``demojize_chatlog``.

    Raises:
        ValueError: If a header line carries a timestamp in neither the
            24-hour nor the 12-hour (AM/PM) ``dd/mm/YYYY`` format.
    """
    # Swap the listed Unicode spacing/formatting characters for plain spaces
    # and curly apostrophes for straight ones before parsing.
    chat_message = re.sub(r'[\u202F\u2011\u202a\u202c\u200e]', ' ', chat_message)
    chat_message = re.sub(r'[\u2018\u2019]', "'", chat_message)

    # Non-greedy sender so the split happens on the first ': ' after the header.
    header = re.match(r'\[(\d[^\]]*)\] (.*?): (.*)', chat_message.strip(), re.DOTALL)
    if header is None:
        if _is_image_placeholder(chat_message):
            return {"role": None, "message": "<|image|>"}
        return {"role": None, "message": demojize_chatlog(chat_message.strip())}

    timestamp_str, sender, message = header.groups()

    # Check if sender is a phone number or has the prefix "cust"
    sender = sender.strip()
    if re.match(r'\+\d{1,4}(?:\s\d+)+', sender) or sender.lower().startswith('cust'):
        role = 'user'
    else:
        role = 'agent'

    # Convert the timestamp to ISO 8601: try 24-hour first, then 12-hour AM/PM.
    for timestamp_format in ('%d/%m/%Y, %H:%M:%S', '%d/%m/%Y, %I:%M:%S %p'):
        try:
            timestamp = datetime.strptime(timestamp_str, timestamp_format).isoformat()
            break
        except ValueError:
            continue
    else:
        raise ValueError(f"Unrecognised timestamp {timestamp_str!r} in chat line")

    if _is_image_placeholder(message):
        return {"timestamp": timestamp, "role": role, "message": "<|image|>"}
    return {"timestamp": timestamp, "role": role, "message": demojize_chatlog(message.strip())}

# Example usage
# chat_message = "[28/10/2024, 5:01:54 PM] ‪+60 12‑3456 7890‬: ‎Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them."
# formatted_message = format_chat_message(chat_message)
# print(formatted_message)

In [None]:
# Read the exported chats in ../data/chatlogs and merge their lines into turns:
# continuation lines and consecutive messages from the same role are folded
# into a single message dict.
files = ["_chat.txt", "_chat 2.txt", "_chat 3.txt"]
chatlogs = []
for file in files:
    previous_message = None
    chatlog = []
    # The logs contain emoji and Unicode punctuation, so do not rely on the
    # platform default encoding (assumes the exports are UTF-8 - confirm).
    with open("../data/chatlogs/" + file, "r", encoding="utf-8") as f:
        file_messages = f.readlines()

    # The first line of each export is skipped (presumably the export banner).
    for line in file_messages[1:]:
        if "uses a default timer for disappearing messages" in line:  # remove system message
            continue
        if line.strip() == "" and previous_message is not None:
            # Empty line inside a multi-line message.
            previous_message["message"] += " \n "
            continue

        message = format_chat_message(line)

        if previous_message is None:
            previous_message = message
        elif message["role"] is None:
            # Next line of a multi-line message.
            previous_message["message"] += message["message"]
        elif previous_message["role"] == message["role"]:
            # Consecutive messages from the same role become one turn.
            previous_message["message"] += "<|USER_MSG|>" + message["message"]
        else:
            # Role switched: the accumulated turn is complete.
            chatlog.append(previous_message)
            previous_message = message

    # Flush the final turn exactly once. Detecting it by searching for the last
    # line's text could append the same turn several times, or never.
    if previous_message is not None:
        chatlog.append(previous_message)

    chatlogs.append(chatlog)

for chatlog in chatlogs:
    for message in chatlog:
        print(message['message'])


In [None]:
# Persist every cleaned chat log as a JSON array with one message per line.
# NOTE(review): the target directory is chatlogs_cleaned/ because that is the
# directory the translation step lists and reads; the previous name
# cleaned_chatlogs/ was never read back by this notebook.
for i, chatlog in enumerate(chatlogs):
    # Joining avoids the trailing comma that an equality-based "is this the
    # last element?" check produces when two messages compare equal.
    serialized = ',\n'.join(json.dumps(message) for message in chatlog)
    with open(f'../data/chatlogs_cleaned/chatlog{i}.json', 'w', encoding='utf-8') as f:
        f.write('[\n' + (serialized + '\n' if serialized else '') + ']')

In [None]:
# %pip install -qU ollama

import ollama
from concurrent.futures import ThreadPoolExecutor

# Function to send prompt to Ollama and get a response
def mask_data(prompt_text: str, model="llama3.2"):
    """Ask an Ollama model to mask personal information in ``prompt_text``.

    Args:
        prompt_text: The text to mask; sent as the ``prompt`` of the request.
        model: Name of the Ollama model to call.

    Returns:
        The ``'response'`` field of the (non-streaming) generate result.
    """
    response = ollama.generate(
        model=model,
        prompt=prompt_text,
        stream=False,
        # The instructions only. The text to mask travels in ``prompt``, so the
        # system prompt must not carry an unfilled "{{text}}" placeholder or
        # hand-written chat-template tokens.
        system=(
            "You are a function that masks all personal information to protect customer privacy. Use:\n"
            "[NAME] for names, [LOCATION] for places, [DATE] for dates, [NUMBER] for numeric values.\n"
            "Return only the masked text."
        ),
    )
    return response['response']

# Function to anonymize personal info
def anonymize_personal_info(data):
    """Run ``mask_data`` over every item of ``data`` on up to five threads.

    Returns the results as a list, in the order ``executor.map`` yields them.
    """
    pool = ThreadPoolExecutor(max_workers=5)
    with pool:
        return list(pool.map(mask_data, data))


In [None]:
# %pip install -q "presidio_analyzer[transformers]"
# %pip install -q presidio_anonymizer
# !python -m spacy download en_core_web_sm

from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import TransformersNlpEngine
from presidio_anonymizer import AnonymizerEngine

# Sample conversation for trying the anonymizer by hand.
text = "<|AI_MSG|>Hi Mana , sorry for interrupting, I am Izzy Khan from Great North. I would like to ask if you are interested in subscribing to our life insurance, the monthly premium is only 45 ringgit.<|AI_MSG|>We also have a promotion for anyone who confirms subscribing today, we will give you a RM50 e-wallet. <|USER_MSG|>Sorry, I am not interested. I have subscribed to insurance. Thank you. <|AI_MSG|>Okay, thank you for responding. Can you also promote your friends or family members? <|USER_MSG|>My friend is an insurance agent 😅"

# Models behind the NLP engine: a small spaCy model for lemmas, tokens etc.,
# and a transformers model named "dslim/bert-base-NER".
model_config = [
    dict(
        lang_code="en",
        model_name=dict(
            spacy="en_core_web_sm",
            transformers="dslim/bert-base-NER",
        ),
    )
]
nlp_engine = TransformersNlpEngine(models=model_config)

# Analyzer (built on the NLP engine above) and the anonymizer that consumes
# its results.
analyzer = AnalyzerEngine(nlp_engine=nlp_engine)
anonymizer = AnonymizerEngine()

def anoynmize(text):
    """Analyze ``text`` with ``analyzer`` and anonymize it with ``anonymizer``.

    The analysis runs with ``language='en'``. Whatever
    ``anonymizer.anonymize`` returns is handed back unchanged.
    """
    return anonymizer.anonymize(
        text=text,
        analyzer_results=analyzer.analyze(text=text, language='en'),
    )


In [None]:
# %pip install transformers torch
from transformers import AutoTokenizer, T5ForConditionalGeneration
import os
# Where the cleaned chat logs live, and how many regular files are in there.
directory_path = '../data/chatlogs_cleaned/'
num_files = sum(
    1
    for entry in os.listdir(directory_path)
    if os.path.isfile(os.path.join(directory_path, entry))
)

all_special_ids = [0, 1, 2]
# Default task prefix prepended to every text handed to the model.
prefix = 'terjemah ke Melayu: '

# Tokenizer and model are loaded from the same checkpoint.
checkpoint = 'mesolitica/nanot5-base-malaysian-translation-v2'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

def translate_text(text, max_length=512, prefix=prefix):
    """Generate a translation of ``text`` with the loaded T5 model.

    The model input is ``prefix`` + ``text`` + the tokenizer's EOS token.
    ``max_length`` is forwarded to ``model.generate``. The first generated
    sequence is decoded with special tokens skipped and returned.
    """
    prompt = f'{prefix}{text}{tokenizer.eos_token}'
    generated = model.generate(
        tokenizer.encode(prompt, return_tensors='pt'),
        max_length=max_length,
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Round-trip every cleaned chat log through the translator (to Malay and back
# to English), anonymize the result, and store it next to the original text
# under the 'translated_message' key.
#
# Only files named chatlog<N>.json are processed. Deriving the range from a raw
# file count skipped the last log and broke whenever the directory held any
# other file.
chatlog_names = sorted(
    name for name in os.listdir(directory_path)
    if re.fullmatch(r'chatlog\d+\.json', name)
)
for chatlog_name in chatlog_names:
    with open(os.path.join(directory_path, chatlog_name), 'r', encoding='utf-8') as freader:
        chatlog = freader.read()

    try:
        # Load the JSON array
        messages_json = json.loads(chatlog)
        messages = [message['message'].replace("\n", "") for message in messages_json]

        malay_translated_messages = [translate_text(chunk) for chunk in messages]
        english_translated_messages = [
            translate_text(message, prefix="terjemah ke Inggeris: ")
            for message in malay_translated_messages
        ]
        anonymized_messages = [anoynmize(message).text for message in english_translated_messages]
        print(f"anonymized\n{anonymized_messages}")

        # Pair each source message with its own result directly. Joining and
        # re-splitting on a marker shifts every index as soon as one message
        # contains that marker.
        for message, anonymized in zip(messages_json, anonymized_messages):
            message['translated_message'] = anonymized.strip()
            print(anonymized.strip())
    except Exception as e:
        # Best effort per file: report and move on without writing a broken or
        # half-translated output file.
        print(f"Error processing {chatlog_name}: {type(e).__name__}: {e}")
        continue

    serialized = ',\n'.join(json.dumps(message) for message in messages_json)
    with open(f'../data/chatlogs_translated/{chatlog_name}', 'w', encoding='utf-8') as fwriter:
        fwriter.write('[\n' + (serialized + '\n' if serialized else '') + ']')

In [None]:
import ollama
# Load one translated chat log.
with open(f'../data/chatlogs_translated/chatlog1.json', "r") as reader:
    chatlog = reader.read()
chatson = json.loads(chatlog)

# Show what the analyzer finds in the first message.
results = analyzer.analyze(text=chatson[0]['translated_message'], language='en')
print(results)

# Ask the LLM to mask names in every message and print each answer.
for msgs in chatson:
    anonymize_prompt = f"use <PERSON> to anonymize the names from the text: '{msgs['translated_message']}'"
    res = ollama.generate(model="llama3.2", prompt=anonymize_prompt, stream=False)
    print(res['response'])