# CHATBOT PREPROCESSIGN

## DATA CLEANING

### Data Import and Combining

In [1]:
import re

file_path = "Preprocessing Dataset.txt"

with open(file_path, "r", encoding="utf-8") as f:
    raw_lines = f.readlines()

print(f"Total raw lines: {len(raw_lines)}")

Total raw lines: 54333


In [2]:
def remove_invisible_chars(text):
    return re.sub(r"[\u200b\u200c\u200d\uFEFF\u200e\u200f]", "", text)

In [3]:
date_pattern = re.compile(
    r"^\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[AP]M\s-"
)

In [4]:
EDIT_PATTERN = re.compile(
    r"(?:<\s*)?this\s+message\s+was\s+edited(?:\s*>)?",
    re.IGNORECASE
)

def remove_invisible_chars(text):
    return re.sub(r"[\u200b\u200c\u200d\uFEFF\u200e\u200f]", "", text)

lines = []
for line in raw_lines:
    line = remove_invisible_chars(line)
    line = EDIT_PATTERN.sub("", line)
    lines.append(line.strip())

print("Edited-message markers removed (robust mode).")


Edited-message markers removed (robust mode).


In [5]:
combined_messages = []
current_message = ""

for line in lines:
    line = line.strip()

    if not line:
        continue  # skip empty lines

    # Check if line starts a new message
    if date_pattern.match(line):
        # Save previous message
        if current_message:
            combined_messages.append(current_message)

        current_message = line
    else:
        # Continuation of previous message
        current_message += " " + line

# Append last message
if current_message:
    combined_messages.append(current_message)

print(f"Combined messages: {len(combined_messages)}")

Combined messages: 47552


### Valid Chat Messages

In [6]:
DELETE_PATTERN = re.compile(
    r"(?:<\s*)?you\s+deleted\s+this\s+message(?:\s*>)?",
    re.IGNORECASE
)

YOU_DELETE_PATTERN = re.compile(
    r"(?:<\s*)?this\s+message\s+was\s+deleted(?:\s*>)?",
    re.IGNORECASE
)

In [7]:
def is_valid_chat_message(line):
    # Must contain timestamp separator
    if " - " not in line:
        return False
    
    if "omitted" in line:
        return False

    if "Waiting for this message" in line:
        return False
    
    if re.search(DELETE_PATTERN, line):
        return False
    
    if re.search(YOU_DELETE_PATTERN, line):
        return False
    
    # Split once on dash
    try:
        after_dash = line.split(" - ", 1)[1]
    except IndexError:
        return False

    # Must contain sender separator
    if ":" not in after_dash:
        return False

    return True

In [8]:
cleaned_messages = []

for msg in combined_messages:
    if is_valid_chat_message(msg):
        cleaned_messages.append(msg)

print(f"Before cleaning: {len(combined_messages)}")
print(f"After cleaning:  {len(cleaned_messages)}")

Before cleaning: 47552
After cleaning:  39338


### TimeStamp Removal

In [9]:
def remove_timestamp(line):
    """
    Removes WhatsApp date & time prefix
    """
    if " - " in line:
        return line.split(" - ", 1)[1]
    return line

In [10]:
removed_timestamp_messages = []

for msg in cleaned_messages:
    cleaned = remove_timestamp(msg)
    removed_timestamp_messages.append(cleaned)

print(len(removed_timestamp_messages)," messages' timestamp removed")

39338  messages' timestamp removed


### Combining User's Multiple Message

In [11]:
def parse_sender_text(line):
    """
    Splits 'Sender: message'
    """
    try:
        sender, text = line.split(":", 1)
        return sender.strip(), text.strip()
    except ValueError:
        return None, None

In [12]:
combined_turns = []

prev_sender = None
prev_text = ""

for line in removed_timestamp_messages:  # your current cleaned list
    sender, text = parse_sender_text(line)

    if sender is None:
        continue

    # Same sender ‚Üí append with ...
    if sender == prev_sender:
        prev_text += " ... " + text
    else:
        # Save previous turn
        if prev_sender is not None:
            combined_turns.append({
                "sender": prev_sender,
                "text": prev_text
            })

        # Start new turn
        prev_sender = sender
        prev_text = text

# Append last turn
if prev_sender is not None:
    combined_turns.append({
        "sender": prev_sender,
        "text": prev_text
    })


In [13]:
with open("last_final_cleaned_chat.txt", "w", encoding="utf-8") as f:
    for t in combined_turns:
        f.write(f"{t['sender']}: {t['text']}\n")

print("Length of Final Cleaned Chat:",len(combined_turns))

Length of Final Cleaned Chat: 21262


## Normalization

In [1]:
import re

file_path = "cleaned_chat.txt"

with open(file_path, "r", encoding="utf-8") as f:
    cleaned_lines = f.readlines()

print(f"Total raw lines: {len(cleaned_lines)}")

Total raw lines: 21260


In [2]:
ready_to_normalize_lines = []
for line in cleaned_lines:
    new_line = line.rstrip("\n")
    ready_to_normalize_lines.append(new_line)

print(len(ready_to_normalize_lines))

21260


In [3]:
ready_lines = []

for line in ready_to_normalize_lines:
    clean_line = line.split(":",1)[1].strip()
    ready_lines.append(clean_line)

print(len(ready_lines))

21260


In [4]:
print(ready_lines[:5])

['Congratulations üéâ', 'ü•π ... ü•∫üòò ... Hafta hogya hai kuch ni parha ... Aj inshallh parhon ga', 'Inshallah ‚ù§Ô∏è', 'Check ker ye pic hd may send hoti hai?', 'Ni ... Oye']


In [5]:
system_content = "You are a Pakistani Boy. You speak Roman English(Urdu written in English letters). You explain at bit much. You talk too much. But try not to reply tooo long(keep them under 30 words). Keep the tone."

In [6]:
formatted_data = []
user = ""
assistant = ""

for index,line in enumerate(ready_lines):
    if index % 2 == 0:
        user = str(line)

    else:
        assistant = str(line)
        full_message = {"message": [{"role": "system", "content": system_content}, {"role": "user", "content": user}, {"role": "assistant", "content": assistant}]}
        formatted_data.append(full_message)

print("No. of Messages = ", len(formatted_data))
print (formatted_data[:1])

No. of Messages =  10630
[{'message': [{'role': 'system', 'content': 'You are a Pakistani Boy. You speak Roman English(Urdu written in English letters). You explain at bit much. You talk too much. But try not to reply tooo long(keep them under 30 words). Keep the tone.'}, {'role': 'user', 'content': 'Congratulations üéâ'}, {'role': 'assistant', 'content': 'ü•π ... ü•∫üòò ... Hafta hogya hai kuch ni parha ... Aj inshallh parhon ga'}]}]


In [10]:
formatted_data = []
message = ""

for index,line in enumerate(ready_lines):
    if index % 2 == 0:
        message = "<s>[INST] <<SYS>> " + system_content + " <</SYS>> User: " + str(line)

    else:
        message += " [/INST] Assistant: " + str(line) + "</s>"
        full_message = {"text": message}
        formatted_data.append(full_message)
        message = ""
        

print("No. of Messages = ", len(formatted_data))
print (formatted_data[:2])

No. of Messages =  10630
[{'text': '<s>[INST] <<SYS>> You are a Pakistani Boy. You speak Roman English(Urdu written in English letters). You explain at bit much. You talk too much. But try not to reply tooo long(keep them under 30 words). Keep the tone. <</SYS>> User: Congratulations üéâ [/INST] Assistant: ü•π ... ü•∫üòò ... Hafta hogya hai kuch ni parha ... Aj inshallh parhon ga</s>'}, {'text': '<s>[INST] <<SYS>> You are a Pakistani Boy. You speak Roman English(Urdu written in English letters). You explain at bit much. You talk too much. But try not to reply tooo long(keep them under 30 words). Keep the tone. <</SYS>> User: Inshallah ‚ù§Ô∏è [/INST] Assistant: Check ker ye pic hd may send hoti hai?</s>'}]


In [7]:
import json

with open("dataset2.jsonl", "w", encoding="utf-8") as f:
    for item in formatted_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")


In [None]:
# formatted_data = [
#     {
#         "messages": [
#             {"role": "system", "content": "You are a helpful AI."},
#             {"role": "user", "content": "Explain PPO simply."},
#             {"role": "assistant", "content": "PPO is a policy gradient method that..."}
#         ]
#     },
#     {
#         "messages": [
#             {"role": "system", "content": "You are a helpful AI."},
#             {"role": "user", "content": "What is Q-learning?"},
#             {"role": "assistant", "content": "Q-learning is a value-based RL algorithm..."}
#         ]
#     }
# ]


# {
#   "text": "<s>[INST] <<SYS>> You are a Pakistani Boy. You speak Roman English. You explain a bit much. Keep replies under 30 words. <</SYS>> User: Congratulations üéâ [/INST] Assistant: ü•πü•∫üòò Hafta hogya hai kuch ni parha, aaj Inshallah parhon ga</s>"
# }
