## Read the file

In [4]:
file_path = "./DATA/WhatsApp Chat with Ma-Tabot 💞💗/WhatsApp Chat with Ma-Tabot 💞💗.txt"
with open(file_path, 'r', encoding='utf-8') as f:
    lines = f.readlines()

len(lines)

5048

## Clean the conversation

In [5]:
import re

encryption_message = "Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them. Tap to learn more."
media_pattern = "<Media omitted>"
email_pattern = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}'
url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
edited_message = "<This message was edited>"
deleted_message = "You deleted this message"
null_message = "null"
created_group_message = "created group"
added_you_to_group_message = "added you"
tagging_pattern = r'@[\w]+'


filtered_lines = []
for line in lines:
    if (
            encryption_message not in line and
            deleted_message not in line and
            null_message != line.split(" ")[-1] and
            media_pattern not in line and
            created_group_message not in line and
            added_you_to_group_message not in line and
            not re.search(email_pattern, line) and
            not re.search(url_pattern, line)
    ):
        line = line.replace(edited_message, "").strip()
        line = re.sub(tagging_pattern, "", line).strip()
        filtered_lines.append(line)

# More flexible regex: handles 12/24-hour time and optional AM/PM
pattern = r'(\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}(?:\u202f?[APMapm]{2})?) - (.*?): (.*?)(?=\n\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2})'

content = '\n'.join(filtered_lines)
messages = re.findall(pattern, content, re.DOTALL)

print("Sample cleaned content:")
print(content[:500])  # Show the first 500 characters

print("\nMatches found:")
messages = re.findall(pattern, content, re.DOTALL)
print(len(messages))


lines_removed = len(lines) - len(filtered_lines)
print(f"Lines removed: {lines_removed}")

Sample cleaned content:
12/19/24, 2:16 PM - Ma-Tabot 💞💗: Good afternoon boss, how are you doing
12/19/24, 2:17 PM - Ma-Tabot 💞💗: How's everything been going?
12/19/24, 6:29 PM - Tabot Charles Bessong II💞: Good evening mom am good and you?
12/19/24, 6:30 PM - Tabot Charles Bessong II💞: Better and on your end ?
12/20/24, 8:01 AM - Ma-Tabot 💞💗: I'm good thanks
12/20/24, 8:01 AM - Ma-Tabot 💞💗: Just there but okay
12/20/24, 8:06 AM - Tabot Charles Bessong II💞: Good morning
12/20/24, 8:06 AM - Tabot Charles Bessong II💞: Good

Matches found:
4374
Lines removed: 299


## Create the dataset

### 1. Group messages by sender

If a conversation is structured as follows:  

```
User 1: Hey!  
User 1: How are you?  
User 2: I am fine  
User 2: And you?  
User 1: Good.  
```

We want to transform it into:  

```
User 1: Hey!\nHow are you? 
User 2: I am fine\nAnd you?  
User 1: Good  
```

In [8]:
grouped_messages = []

for _, sender, message in messages:
    if grouped_messages and grouped_messages[-1]["sender"] == sender:
        grouped_messages[-1]["message"] += "\n" + message
    else:
        grouped_messages.append({
            "sender": sender,
            "message": message
        })

print(grouped_messages)
print(len(grouped_messages))

[{'sender': 'Ma-Tabot 💞💗', 'message': "Good afternoon boss, how are you doing\nHow's everything been going?"}, {'sender': 'Tabot Charles Bessong II💞', 'message': 'Good evening mom am good and you?\nBetter and on your end ?'}, {'sender': 'Ma-Tabot 💞💗', 'message': "I'm good thanks\nJust there but okay"}, {'sender': 'Tabot Charles Bessong II💞', 'message': 'Good morning\nGood to know'}, {'sender': 'Ma-Tabot 💞💗', 'message': 'Good morning'}, {'sender': 'Tabot Charles Bessong II💞', 'message': 'Guess you are almost done with your project'}, {'sender': 'Ma-Tabot 💞💗', 'message': 'Well halfway done with everything including the report'}, {'sender': 'Tabot Charles Bessong II💞', 'message': '💪🏻\nWhen will you be given break ?'}, {'sender': 'Ma-Tabot 💞💗', 'message': 'Today 🫠\nDo you have electricity?'}, {'sender': 'Tabot Charles Bessong II💞', 'message': 'Nope'}, {'sender': 'Ma-Tabot 💞💗', 'message': 'Oh no, mine just came\nHow are you doing, I really miss you boss'}, {'sender': 'Tabot Charles Bessong 

### 2. Include special tokens

Each message follows this format:  
```
<|startoftext|>Sender<|separator|>Message<|endoftext|>
```

In [10]:
# Define special tokens
start_of_text_token = "<|startoftext|>"
end_of_text_token = "<|endoftext|>"
separator_token = "<|separator|>"

fine_tuning_data = []

for message in grouped_messages:
    sender = message["sender"]
    message_text = message["message"]
    input_sequence = f"{start_of_text_token}{sender}{separator_token}{message_text}{end_of_text_token}"
    fine_tuning_data.append(input_sequence)

print(len(fine_tuning_data))
print(fine_tuning_data[:4])

2928
["<|startoftext|>Ma-Tabot 💞💗<|separator|>Good afternoon boss, how are you doing\nHow's everything been going?<|endoftext|>", '<|startoftext|>Tabot Charles Bessong II💞<|separator|>Good evening mom am good and you?\nBetter and on your end ?<|endoftext|>', "<|startoftext|>Ma-Tabot 💞💗<|separator|>I'm good thanks\nJust there but okay<|endoftext|>", '<|startoftext|>Tabot Charles Bessong II💞<|separator|>Good morning\nGood to know<|endoftext|>']


### 3. Save the data

In [11]:
import json

save_path = "./DATA/fine_tuning.json"
with open(save_path, 'w', encoding='utf-8') as f:
    json.dump(fine_tuning_data, f, ensure_ascii=False, indent=4)