## Reading files


We will read the `.txt` files line by line and apply these filters:

1. **Remove lines containing a WhatsApp encryption notice**

    - ❌ **Before:** `dd/mm/yyyy, hh:mm - Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them. Tap to learn more.`
    - ✅ **After:** (Removed)

2. **Remove lines with `<Media omitted>`**

    - ❌ **Before:** `dd/mm/yyyy, hh:mm - Person: <Media omitted>`
    - ✅ **After:** (Removed)

3. **Remove lines containing email addresses**

    - ❌ **Before:** `dd/mm/yyyy, hh:mm - Person: example@gmail.com`
    - ✅ **After:** (Removed)

4. **Remove lines containing links**

    - ❌ **Before:** `dd/mm/yyyy, hh:mm - Person: https://www.example.com/`
    - ✅ **After:** (Removed)

5. **Replace `<This message was edited>` with an empty string**

    - ❌ **Before:** `dd/mm/yyyy, hh:mm - Person: hey, how are you? <This message was edited>`
    - ✅ **After:** `dd/mm/yyyy, hh:mm - Person: hey, how are you?`

6. **Remove lines with the text `You deleted this message`**

    - ❌ **Before:** `dd/mm/yyyy, hh:mm - Person: You deleted this message`
    - ✅ **After:** (Removed)

7. **Remove lines with the text `null`**

    - ❌ **Before:** `dd/mm/yyyy, hh:mm - Person: null`
    - ✅ **After:** (Removed)

8. **Remove lines with the text `created group`**

    - ❌ **Before:** `dd/mm/yyyy, hh:mm - Person created group "group name"`
    - ✅ **After:** (Removed)

9. **Remove lines with the text `added you`**

    - ❌ **Before:** `dd/mm/yyyy, hh:mm - Person added you`
    - ✅ **After:** (Removed)

10. **Replace tagging (`@person`) with an empty string**

-   ❌ **Before:** `dd/mm/yyyy, hh:mm - Person: @person are you coming?`
-   ✅ **After:** `dd/mm/yyyy, hh:mm - Person: are you coming?`


In [None]:
import re
import pandas as pd


def read_whatsapp_chat(file_path: str) -> pd.DataFrame:
    # Define filtering patterns
    encryption_message = "Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them. Tap to learn more."
    media_pattern = "<Media omitted>"
    email_pattern = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}'
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    edited_message = "<This message was edited>"
    deleted_message = "You deleted this message"
    null_message = "null"
    created_group_message = "created group"
    added_you_to_group_message = "added you"
    tagging_pattern = r'@[\w]+'

    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Apply filters to remove unwanted lines
    filtered_lines = []
    for line in lines:
        if (
            encryption_message not in line and
            deleted_message not in line and
            null_message != line.split(" ")[-1] and
            media_pattern not in line and
            created_group_message not in line and
            added_you_to_group_message not in line and
            not re.search(email_pattern, line) and
            not re.search(url_pattern, line)
        ):
            line = line.replace(edited_message, "").strip()
            line = re.sub(tagging_pattern, "", line).strip()
            filtered_lines.append(line)

    # Regular expression to match WhatsApp message format
    pattern = r'(\d{2}/\d{2}/\d{4}, \d{2}:\d{2}) - (.*?): (.*?)(?=\n\d{2}/\d{2}/\d{4}, \d{2}:\d{2} -|$)'
    content = '\n'.join(filtered_lines)
    messages = re.findall(pattern, content, re.DOTALL)

    df = pd.DataFrame(messages, columns=['timestamp', 'sender', 'message'])
    df['timestamp'] = pd.to_datetime(df['timestamp'], format='%d/%m/%Y, %H:%M')
    return df

In [None]:
import os
import re
import json
import pandas as pd

def read_whatsapp_chat(file_path: str) -> pd.DataFrame:
    """
    Read and process a WhatsApp chat text file, filtering out unwanted messages.

    Args:
        file_path (str): Path to the WhatsApp chat text file

    Returns:
        pd.DataFrame: Processed DataFrame with filtered messages
    """
    # Define filtering patterns
    encryption_message = "Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them. Tap to learn more."
    media_pattern = "<Media omitted>"
    email_pattern = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}'
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    edited_message = "<This message was edited>"
    deleted_message = "You deleted this message"
    null_message = "null"
    created_group_message = "created group"
    added_you_to_group_message = "added you"
    tagging_pattern = r'@[\w]+'

    # Read the file
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
        return pd.DataFrame()

    # Apply filters to remove unwanted lines
    filtered_lines = []
    for line in lines:
        if (
            encryption_message not in line and
            deleted_message not in line and
            null_message != line.strip().split(" ")[-1] and
            media_pattern not in line and
            created_group_message not in line and
            added_you_to_group_message not in line and
            not re.search(email_pattern, line) and
            not re.search(url_pattern, line)
        ):
            line = line.replace(edited_message, "").strip()
            line = re.sub(tagging_pattern, "", line).strip()
            filtered_lines.append(line)

    # Regular expression to match WhatsApp message format
    pattern = r'(\d{2}/\d{2}/\d{4}, \d{2}:\d{2}) - (.*?): (.*?)(?=\n\d{2}/\d{2}/\d{4}, \d{2}:\d{2} -|$)'
    content = '\n'.join(filtered_lines)
    messages = re.findall(pattern, content, re.DOTALL)

    # Create DataFrame
    if messages:
        df = pd.DataFrame(messages, columns=['timestamp', 'sender', 'message'])
        df['timestamp'] = pd.to_datetime(df['timestamp'], format='%d/%m/%Y, %H:%M')

        # Remove empty messages
        df = df[df['message'].str.strip() != '']

        return df

    return pd.DataFrame()

def process_chat_files(folder_path: str, answer_sender: str) -> list:
    """
    Process multiple WhatsApp chat files and extract Q&A pairs.

    Args:
        folder_path (str): Path to the folder containing chat text files
        answer_sender (str): Name of the sender who provides answers

    Returns:
        list: List of dictionaries containing questions and answers
    """
    # List all text files in the folder
    chat_files = [f for f in os.listdir(folder_path) if f.endswith('.txt')]

    # Collect all processed DataFrames
    all_chats = []

    for file in chat_files:
        file_path = os.path.join(folder_path, file)
        chat_df = read_whatsapp_chat(file_path)

        if not chat_df.empty:
            all_chats.append(chat_df)

    # Combine all chat DataFrames
    if all_chats:
        combined_df = pd.concat(all_chats, ignore_index=True)

        # Sort by timestamp
        combined_df = combined_df.sort_values('timestamp')

        # Separate potential questions (all non-answer senders) and answers
        answers_df = combined_df[combined_df['sender'] == answer_sender]

        # Prepare result list
        qa_pairs = []

        # Find Q&A pairs
        for answer in answers_df.itertuples():
            # Find the most recent message from a different sender before this answer
            question_candidates = combined_df[
                (combined_df['timestamp'] < answer.timestamp) &
                (combined_df['sender'] != answer_sender)
            ]

            if not question_candidates.empty:
                # Take the most recent question
                question = question_candidates.iloc[-1]

                # Only add if both question and answer are non-empty
                if (str(question.message).strip() and
                    str(answer.message).strip()):
                    qa_pairs.append({
                        "question": question.message,
                        "question_sender": question.sender,
                        "answer": answer.message,
                        "timestamp": answer.timestamp.strftime('%Y-%m-%d %H:%M:%S')
                    })

        return qa_pairs

    return []

def main(folder_path: str, answer_sender: str, output_file: str = 'qa_pairs.json'):
    """
    Main function to process chat files and save results to JSON.

    Args:
        folder_path (str): Path to the folder containing chat text files
        answer_sender (str): Name of the sender who provides answers
        output_file (str, optional): Path to save the output JSON file
    """
    # Process chat files
    qa_pairs = process_chat_files(folder_path, answer_sender)

    # Save to JSON
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(qa_pairs, f, ensure_ascii=False, indent=2)

    print(f"Processed {len(qa_pairs)} Q&A pairs. Saved to {output_file}")
    print("First few pairs:")
    print(json.dumps(qa_pairs[:3], ensure_ascii=False, indent=2))

# Example usage
if __name__ == "__main__":
    folder_path = './WhatsApp/'
    answer_sender = 'Umyal Dixit'

    # Process and save to JSON
    main(folder_path, answer_sender)

The `all_chats` dictionary holds the content of each file as a dataframe with three columns: `timestamp`, `sender`, and `message`.


In [None]:
from pathlib import Path

all_chats = {}
data_directory = Path("WhatsApp")
for file in data_directory.glob('*.txt'):
    file_name = file.stem
    all_chats[file_name] = read_whatsapp_chat(file)

## Text sequence


The text should be merged into a single sequence to prepare it for the next step, where the BPE algorithm will be applied and the text will be encoded.


In [None]:
text_sequence = ""
for file_name in all_chats.keys():
    text_sequence += " ".join(all_chats[file_name]['message'].values)

len(text_sequence)

In [None]:
with open("./output/combined_text.txt", "w") as f:
    f.write(text_sequence)

In [None]:
import re
import pandas as pd
from pathlib import Path

def read_whatsapp_chat(file_path: str) -> pd.DataFrame:
    with open(str(file_path), 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Remove system messages (media, encryption, deleted messages)
    filtered_lines = []
    for line in lines:
        if not any(exclude in line for exclude in [
            "Messages and calls are end-to-end encrypted",
            "<Media omitted>",
            "This message was deleted"
        ]):
            filtered_lines.append(line)

    # Join messages as a single text
    content = '\n'.join(filtered_lines)

    # Updated regex pattern to correctly extract messages
    pattern = r'\[(\d{2}/\d{2}/\d{2,4}), (\d{1,2}:\d{2}:\d{2}\s?[APM\u202F]*)\] (.*?): (.*?)(?=\n\[\d{2}/\d{2}/\d{2,4}, \d{1,2}:\d{2}:\d{2}\s?[APM\u202F]*\]|\Z)'

    messages = re.findall(pattern, content, re.DOTALL)

    # Convert to DataFrame
    df = pd.DataFrame(messages, columns=['date', 'time', 'sender', 'message'])
    df['timestamp'] = pd.to_datetime(df['date'] + " " + df['time'], format='%d/%m/%y %I:%M:%S %p')
    df.drop(columns=['date', 'time'], inplace=True)

    return df

# Read all WhatsApp chats
all_chats = {}
data_directory = Path("WhatsApp")

if not data_directory.exists():
    print("Error: WhatsApp directory does not exist!")
elif not list(data_directory.glob("*.txt")):
    print("Error: No .txt files found in WhatsApp directory!")
else:
    for file in data_directory.glob('*.txt'):
        file_name = file.stem
        all_chats[file_name] = read_whatsapp_chat(file)

# Merge all messages
text_sequence = " ".join(
    msg for chat in all_chats.values() for msg in chat['message'].values
)

# Save output
output_directory = Path("./")
output_directory.mkdir(parents=True, exist_ok=True)

with open(output_directory / "combined_text.txt", "w", encoding="utf-8") as f:
    f.write(text_sequence)

print(f"Combined text saved to {output_directory}/combined_text.txt")


In [None]:
import re
import json
import pandas as pd
from pathlib import Path

# List of names that should be considered as "assistant"
ASSISTANT_NAMES = ["Aditya Pandey", "Umyal Dixit"]

def read_whatsapp_chat(file_path: Path) -> pd.DataFrame:
    """Reads a WhatsApp chat file and converts it into a structured DataFrame."""
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Remove system messages (e.g., media omitted, encryption notices)
    filtered_lines = []
    for line in lines:
        if not any(exclude in line for exclude in [
            "Messages and calls are end-to-end encrypted",
            "<Media omitted>",
            "This message was deleted",
            "<This message was edited>"
        ]):
            filtered_lines.append(line.strip())

    # Combine messages into a single string
    content = '\n'.join(filtered_lines)

    # Updated regex pattern to handle different timestamp formats
    pattern = r'\[(\d{1,2}/\d{1,2}/\d{2,4}), (\d{1,2}:\d{2}:\d{2}[\s\u202F]?(?:AM|PM)?)\] (.*?): (.*?)(?=\n\[\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}:\d{2}[\s\u202F]?(?:AM|PM)?\]|\Z)'

    messages = re.findall(pattern, content, re.DOTALL)

    # Debugging: Print extracted messages
    if not messages:
        print(f"Warning: No messages extracted from {file_path}")

    # Convert to DataFrame
    df = pd.DataFrame(messages, columns=['date', 'time', 'sender', 'message'])
    df['timestamp'] = pd.to_datetime(df['date'] + " " + df['time'], errors='coerce')
    df.drop(columns=['date', 'time'], inplace=True)

    return df

# Read all WhatsApp chat files
data_directory = Path("WhatsApp")
all_chats = {}

if not data_directory.exists():
    print("Error: WhatsApp directory does not exist!")
elif not list(data_directory.glob("*.txt")):
    print("Error: No .txt files found in WhatsApp directory!")
else:
    for file in data_directory.glob('*.txt'):
        file_name = file.stem
        df = read_whatsapp_chat(file)
        if not df.empty:
            all_chats[file_name] = df
        else:
            print(f"Warning: No data extracted from {file_name}")

# Convert to Ollama training JSON format
ollama_data = []

for file_name, df in all_chats.items():
    messages = []

    for _, row in df.iterrows():
        role = "assistant" if row["sender"] in ASSISTANT_NAMES else "user"
        messages.append({"role": role, "content": row["message"]})

    if messages:
        ollama_data.append({
            "system": "You are a helpful AI trained on WhatsApp conversations.",
            "messages": messages
        })

# Ensure output directory exists
output_directory = Path("./output")
output_directory.mkdir(parents=True, exist_ok=True)

# Save JSON file
output_file = output_directory / "ollama_training.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(ollama_data, f, indent=4, ensure_ascii=False)

print(f"✅ Ollama training data saved to {output_file}")


In [None]:
import re
import pandas as pd
import os
import json

# Folder containing WhatsApp chat files
input_folder = "WhatsApp/"
output_folder = "output/"

# Ensure output folder exists
os.makedirs(output_folder, exist_ok=True)

# Define assistant user
assistant_user = "Umyal Dixit"

# WhatsApp chat regex pattern
pattern = r"(\d{2}/\d{2}/\d{2}), (\d{1,2}:\d{2} ?[ap]m) - (.*?): (.*)"

# Store all messages
all_data = []

# Process all .txt files in the folder
for filename in os.listdir(input_folder):
    if filename.endswith(".txt"):
        file_path = os.path.join(input_folder, filename)
        print(f"🔄 Processing {filename}...")

        with open(file_path, "r", encoding="utf-8") as file:
            lines = file.readlines()

        for line in lines:
            match = re.match(pattern, line)
            if match:
                date, time, user, message = match.groups()
                role = "assistant" if user == assistant_user else "user"
                all_data.append({"timestamp": f"{date} {time}", "user": user, "message": message, "role": role})

# Convert to JSONL format
jsonl_output_file = os.path.join(output_folder, "ollama_training.jsonl")

with open(jsonl_output_file, "w", encoding="utf-8") as f:
    for entry in all_data:
        json.dump(entry, f, ensure_ascii=False)
        f.write("\n")

print(f"✅ Training data saved to {jsonl_output_file}")


In [None]:
import os
import re
import pandas as pd
from datasets import Dataset

# Define folder containing WhatsApp chat files
whatsapp_folder = "WhatsApp"

# WhatsApp message regex pattern (handles both AM/PM and 24-hour formats)
whatsapp_pattern = r"(\d{2}/\d{2}/\d{2,4}), (\d{1,2}:\d{2}\s?(?:AM|PM|am|pm)?) - ([^:]+): (.+)"

# Store extracted data
data = []

# Loop through all .txt files in the folder
for filename in os.listdir(whatsapp_folder):
    file_path = os.path.join(whatsapp_folder, filename)

    if filename.endswith(".txt"):  # Process only text files
        with open(file_path, "r", encoding="utf-8") as file:
            lines = file.readlines()

        # Store the last user message to match with the assistant's reply
        last_user_message = None
        last_user_name = None

        for line in lines:
            match = re.match(whatsapp_pattern, line)
            if match:
                date, time, sender, message = match.groups()

                # Skip system messages
                if "Messages and calls are end-to-end encrypted" in message:
                    continue

                # Assign roles
                role = "assistant" if sender.strip() == "Umyal Dixit" else "user"

                if role == "user":
                    last_user_message = message.strip()
                    last_user_name = sender.strip()
                elif role == "assistant" and last_user_message:
                    # Create Question-Answer pair
                    data.append({
                        "question": f"{last_user_name}: {last_user_message}",
                        "answer": f"Umyal Dixit: {message.strip()}"
                    })
                    last_user_message = None  # Reset for next pair

# Convert extracted data to a DataFrame
df = pd.DataFrame(data)

# Convert DataFrame to Hugging Face Dataset format
dataset = Dataset.from_pandas(df)

# Format for training
dataset = dataset.map(lambda x: {"text": f"### Question: {x['question']}\n### Answer: {x['answer']}"})

# Save formatted dataset
df.to_csv("whatsapp_dataset.csv", index=False)
dataset.to_json("whatsapp_dataset.json", orient="records")

print("✅ WhatsApp chat data formatted and saved!")


In [None]:
import os
import torch
import gc
import pandas as pd
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# Step 1: Load and Format WhatsApp Dataset
df = pd.read_csv('whatsapp_dataset.csv')  # Ensure the dataset is in CSV format
dataset = Dataset.from_pandas(df)
dataset = dataset.map(lambda x: {"text": f"### Question: {x['question']}\n### Answer: {x['answer']}"})

# Step 2: Initialize Dolphin 3 Model & Tokenizer
model_name = "cognitivecomputations/dolphin-2.6-mistral"  # Replace with Dolphin-3 when available
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# LoRA Configuration for Efficient Fine-Tuning
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    task_type="CAUSAL_LM"
)

# Load Base Model with Quantization (for lower VRAM usage)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True)
)

# Step 3: Define Training Arguments
training_args = TrainingArguments(
    output_dir="./dolphin3_finetuned",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    save_strategy="epoch",
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none"  # Change to "wandb" if using Weights & Biases
)

# Step 4: Fine-Tune the Model
trainer = SFTTrainer(
    model=base_model,
    train_dataset=dataset,
    peft_config=peft_config,
    tokenizer=tokenizer,
    args=training_args
)

trainer.train()

# Step 5: Save Fine-Tuned Model Locally
trainer.model.save_pretrained('finetuned_dolphin3')

# Step 6: Evaluate & Inference
pipe = pipeline('text-generation', model=trainer.model, tokenizer=tokenizer)
output = pipe("### Question: What is your name?")[0]['generated_text']
print(output)

# Step 7: Free Up Memory
del base_model, trainer, pipe
gc.collect()

# Step 8: Merge LoRA with Base Model & Upload to Hugging Face Hub
base_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
model = PeftModel.from_pretrained(base_model, 'finetuned_dolphin3')
model = model.merge_and_unload()

# Step 9: Push Fine-Tuned Model to Hugging Face Hub
# model.push_to_hub("your_huggingface_username/finetuned_dolphin3")
# tokenizer.push_to_hub("your_huggingface_username/finetuned_dolphin3")

print("✅ Fine-tuned Dolphin 3 model saved & uploaded!")


In [None]:
import re
import unicodedata
import json
import pandas as pd


def clean_text(text):
    """Removes hidden Unicode characters that may affect parsing."""
    return "".join(ch for ch in text if not unicodedata.category(ch).startswith("C"))


def read_whatsapp_chat(file_path: str) -> pd.DataFrame:
    """
    Reads a WhatsApp chat export file, processes it, and returns a structured DataFrame.
    """
    # Filtering patterns to remove unwanted messages
    encryption_message = "Messages and calls are end-to-end encrypted"
    media_pattern = "<Media omitted>"
    system_messages = [
        "created group", "added you", "removed", "left", "changed the subject",
        "changed this group's icon", "changed the group description",
        "deleted this message", "You deleted this message"
    ]
    email_pattern = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}'
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    tagging_pattern = r'@[\w]+'

    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Clean hidden characters
    cleaned_lines = [clean_text(line).strip() for line in lines]

    # Apply filters to remove unnecessary messages
    filtered_lines = []
    for line in cleaned_lines:
        if (
            encryption_message not in line
            and not any(msg in line for msg in system_messages)
            and media_pattern not in line
            and not re.search(email_pattern, line)
            and not re.search(url_pattern, line)
        ):
            line = re.sub(tagging_pattern, "", line).strip()  # Remove @mentions
            filtered_lines.append(line)

    # Reconstruct the content for regex processing
    content = "\n".join(filtered_lines)

    # WhatsApp message format regex
    pattern = r'(\d{1,2}/\d{1,2}/\d{2,4}), (\d{1,2}:\d{2}\s?[APMapm]*) - (.*?): (.*)'

    messages = re.findall(pattern, content)
    if not messages:
        print("❌ No messages found. Check your file format.")
        return pd.DataFrame()

    # Convert extracted data into a DataFrame
    df = pd.DataFrame(messages, columns=['date', 'time', 'sender', 'message'])

    # Convert date and time to proper datetime format
    df['timestamp'] = pd.to_datetime(df['date'] + " " + df['time'], errors='coerce')

    # Drop old columns and reorder
    df = df[['timestamp', 'sender', 'message']].dropna().sort_values(by='timestamp')

    return df


def convert_to_json_format(df: pd.DataFrame, output_file: str):
    """
    Converts the parsed DataFrame into the requested JSON format.
    """
    messages = []

    for i, row in df.iterrows():
        if row['sender'].lower() == "umyal dixit":
            # If Umyal Dixit is the sender, make it an output message
            messages.append({
                "instruction": df.iloc[i-1]['message'] if i > 0 else "",
                "input": "",
                "output": row['message']
            })
        else:
            # Otherwise, it's an instruction
            messages.append({
                "instruction": row['message'],
                "input": "",
                "output": ""
            })

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(messages, f, indent=4, ensure_ascii=False)

    print(f"✅ JSON file saved at: {output_file}")


# === Run the script ===
input_file = "./WhatsApp/ad.txt"  # Replace with your actual file
output_file = "formatted_chat.json"

df = read_whatsapp_chat(input_file)

if not df.empty:
    convert_to_json_format(df, output_file)


In [None]:
import re
import unicodedata
import json
import pandas as pd


def clean_text(text):
    """Removes hidden Unicode characters that may affect parsing."""
    return "".join(ch for ch in text if not unicodedata.category(ch).startswith("C"))


def read_whatsapp_chat(file_path: str) -> pd.DataFrame:
    """
    Reads a WhatsApp chat export file, processes it, and returns a structured DataFrame.
    """
    encryption_message = "Messages and calls are end-to-end encrypted"
    media_pattern = "<Media omitted>"
    system_messages = [
        "created group", "added you", "removed", "left", "changed the subject",
        "changed this group's icon", "changed the group description",
        "deleted this message", "You deleted this message"
    ]
    email_pattern = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}'
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    tagging_pattern = r'@[\w]+'

    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    cleaned_lines = [clean_text(line).strip() for line in lines]

    filtered_lines = []
    for line in cleaned_lines:
        if (
            encryption_message not in line
            and not any(msg in line for msg in system_messages)
            and media_pattern not in line
            and not re.search(email_pattern, line)
            and not re.search(url_pattern, line)
        ):
            line = re.sub(tagging_pattern, "", line).strip()  # Remove @mentions
            filtered_lines.append(line)

    content = "\n".join(filtered_lines)

    pattern = r'(\d{1,2}/\d{1,2}/\d{2,4}), (\d{1,2}:\d{2}\s?[APMapm]*) - (.*?): (.*)'

    messages = re.findall(pattern, content)
    if not messages:
        print("❌ No messages found. Check your file format.")
        return pd.DataFrame()

    df = pd.DataFrame(messages, columns=['date', 'time', 'sender', 'message'])
    df['timestamp'] = pd.to_datetime(df['date'] + " " + df['time'], errors='coerce')
    df = df[['timestamp', 'sender', 'message']].dropna().sort_values(by='timestamp')

    return df


def convert_to_json_format(df: pd.DataFrame, output_file: str):
    """
    Converts the parsed DataFrame into a structured JSON format.
    """
    messages = []
    current_instruction = []
    current_response = []

    for i, row in df.iterrows():
        sender = row['sender'].strip()
        message = row['message'].strip()

        if sender.lower() == "umyal dixit":
            if current_instruction:
                messages.append({
                    "instruction": " ".join(current_instruction),
                    "input": "",
                    "output": ""
                })
                current_instruction = []

            current_response.append(message)
        else:
            if current_response:
                messages.append({
                    "instruction": "",
                    "input": " ".join(current_instruction) if current_instruction else "No input",
                    "output": " ".join(current_response)
                })
                current_instruction = []
                current_response = []

            current_instruction.append(message)

    if current_response:
        messages.append({
            "instruction": "",
            "input": " ".join(current_instruction) if current_instruction else "No input",
            "output": " ".join(current_response)
        })
    elif current_instruction:
        messages.append({
            "instruction": " ".join(current_instruction),
            "input": "",
            "output": ""
        })

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(messages, f, indent=4, ensure_ascii=False)

    print(f"✅ JSON file saved at: {output_file}")


# === Run the script ===
input_file = "./WhatsApp/ad.txt"  # Replace with your actual file
output_file = "formatted_chat.json"

df = read_whatsapp_chat(input_file)

if not df.empty:
    convert_to_json_format(df, output_file)


In [8]:
import os
import re
import json
import pandas as pd
import chardet
from datetime import datetime

def detect_and_read_file(file_path):
    """
    Robustly read a file by trying multiple encoding strategies.
    
    Args:
        file_path (str): Path to the file
    
    Returns:
        str: Decoded file content
    """
    # List of encodings to try
    encodings_to_try = [
        'utf-8', 'latin-1', 'cp1252', 'iso-8859-1', 
        'utf-16', 'ascii', 'big5', 'shift_jis'
    ]
    
    # First, try chardet to detect encoding
    try:
        with open(file_path, 'rb') as file:
            raw_data = file.read()
            detected_encoding = chardet.detect(raw_data)['encoding']
        
        # Add detected encoding to the top of our try list
        if detected_encoding:
            encodings_to_try.insert(0, detected_encoding)
    except Exception as e:
        print(f"Chardet detection failed for {file_path}: {e}")
    
    # Try different encodings
    for encoding in encodings_to_try:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except UnicodeDecodeError:
            continue
        except Exception as e:
            print(f"Error reading {file_path} with {encoding} encoding: {e}")
    
    # If all else fails, try reading with error handling
    try:
        with open(file_path, 'r', encoding='latin-1') as file:
            return file.read()
    except Exception as e:
        print(f"Completely failed to read {file_path}: {e}")
        return ""

def extract_whatsapp_messages(content):
    """
    Extract WhatsApp messages from text content with enhanced robustness.
    
    Args:
        content (str): Text content to parse
    
    Returns:
        list: List of tuples (timestamp, sender, message)
    """
    # Multiple regex patterns to catch different WhatsApp message formats
    patterns = [
        r'(\d{2}/\d{2}/\d{4}, \d{2}:\d{2}) - (.*?): (.*?)(?=\n\d{2}/\d{2}/\d{4}, \d{2}:\d{2} -|$)',
        r'(\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}\s*[AP]M) - (.*?): (.*?)(?=\n\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}\s*[AP]M -|$)'
    ]
    
    # Find messages using all patterns
    all_messages = []
    for pattern in patterns:
        messages = re.findall(pattern, content, re.DOTALL | re.MULTILINE | re.IGNORECASE)
        all_messages.extend(messages)
    
    # Filter and clean messages
    filtered_messages = []
    for msg in all_messages:
        timestamp, sender, message = msg
        
        # Skip unwanted messages
        unwanted_patterns = [
            "Messages and calls are end-to-end encrypted",
            "<Media omitted>",
            "created group",
            "added you",
            "@",
            re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}'),
            re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        ]
        
        # Check if message should be skipped
        if not any(
            (isinstance(pattern, str) and pattern in message) or 
            (isinstance(pattern, re.Pattern) and pattern.search(message))
            for pattern in unwanted_patterns
        ):
            # Clean up the message
            message = re.sub(r'<.*?>', '', message).strip()
            
            if message:  # Only add non-empty messages
                filtered_messages.append((timestamp, sender, message))
    
    return filtered_messages

def parse_timestamp(timestamp_str):
    """
    Parse timestamp string with multiple format support.
    
    Args:
        timestamp_str (str): Timestamp string to parse
    
    Returns:
        datetime: Parsed datetime object
    """
    timestamp_formats = [
        '%d/%m/%Y, %H:%M',
        '%m/%d/%Y, %H:%M',
        '%d/%m/%y, %H:%M',
        '%m/%d/%y, %H:%M',
        '%d/%m/%Y, %I:%M %p',
        '%m/%d/%Y, %I:%M %p',
        '%d/%m/%y, %I:%M %p',
        '%m/%d/%y, %I:%M %p'
    ]
    
    for fmt in timestamp_formats:
        try:
            return datetime.strptime(timestamp_str, fmt)
        except ValueError:
            continue
    
    # If no format matches, return current time
    print(f"Could not parse timestamp: {timestamp_str}")
    return datetime.now()

def process_folder_files(folder_path, answer_sender):
    """
    Process all text files in a folder and extract Q&A pairs.
    
    Args:
        folder_path (str): Path to the folder containing text files
        answer_sender (str): Name of the sender who provides answers
    
    Returns:
        list: List of dictionaries with Q&A pairs
    """
    # List all text files in the folder
    text_files = [
        f for f in os.listdir(folder_path) 
        if f.lower().endswith(('.txt', '.log', '.csv', '.md'))
    ]
    
    # Collect all messages from all files
    all_messages = []
    
    for file in text_files:
        file_path = os.path.join(folder_path, file)
        
        # Read file content with robust encoding detection
        content = detect_and_read_file(file_path)
        
        # Extract WhatsApp messages
        file_messages = extract_whatsapp_messages(content)
        
        # Add file name as metadata and parse timestamp
        for msg in file_messages:
            timestamp, sender, message = msg
            parsed_timestamp = parse_timestamp(timestamp)
            all_messages.append((timestamp, parsed_timestamp, sender, message, file))
    
    # Convert to DataFrame for easier processing
    df = pd.DataFrame(
        all_messages, 
        columns=['original_timestamp', 'timestamp', 'sender', 'message', 'source_file']
    )
    
    # Sort by timestamp
    df = df.sort_values('timestamp')
    
    # Prepare Q&A pairs
    qa_pairs = []
    
    # Get answers from specified sender
    answers_df = df[df['sender'] == answer_sender]
    
    for answer in answers_df.itertuples():
        # Find the most recent message from a different sender before this answer
        question_candidates = df[
            (df['timestamp'] < answer.timestamp) & 
            (df['sender'] != answer_sender)
        ]
        
        if not question_candidates.empty:
            # Take the most recent question
            question = question_candidates.iloc[-1]
            
            # Only add if both question and answer are non-empty
            if (str(question.message).strip() and 
                str(answer.message).strip()):
                qa_pairs.append({
                    "question": question.message,
                    "answer": answer.message,
                })
    
    return qa_pairs

def main(folder_path, answer_sender, output_file='qa_pairs.json'):
    """
    Main function to process files and save Q&A pairs to JSON.
    
    Args:
        folder_path (str): Path to the folder containing text files
        answer_sender (str): Name of the sender who provides answers
        output_file (str, optional): Path to save the output JSON file
    """
    # Process chat files
    qa_pairs = process_folder_files(folder_path, answer_sender)
    
    # Save to JSON
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(qa_pairs, f, ensure_ascii=False, indent=2)
    
    print(f"Processed {len(qa_pairs)} Q&A pairs. Saved to {output_file}")
    print("First few pairs:")
    print(json.dumps(qa_pairs[:3], ensure_ascii=False, indent=2))

# Example usage
if __name__ == "__main__":
    folder_path = './WhatsApp'
    answer_sender = 'Umyal Dixit'
    
    # Process and save to JSON
    main(folder_path, answer_sender)

Processed 25965 Q&A pairs. Saved to qa_pairs.json
First few pairs:
[
  {
    "question": "Kya haal h",
    "answer": "Tu suna"
  },
  {
    "question": "Kya haal h",
    "answer": "Badhiya"
  },
  {
    "question": "Kkrh?",
    "answer": "Kuch nhi"
  }
]
