## Problem Statement

The challenge is to develop a chatbot that accurately replicates an individual’s unique chat style based on their WhatsApp messages.

In [None]:
import re
import os
import random
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

## Data Acquisition

- **Source:** WhatsApp chat logs.
- **Key Features (Input):**
  - Message content
  - Timestamps
  - Sender info
  - Conversation context
- **Output Needed:** Responses that reflect the target person’s communication style.

In [None]:
# strptime patterns for the timestamp layouts handled by this notebook.
# Not referenced by the functions in this cell: get_datetime_format keeps its own copy.
datetime_formats = ["%d/%m/%y, %H:%M", "%m/%d/%y, %H:%M", "%d/%m/%y %H:%M:%S", "%d/%m/%Y %H:%M"]
# Any export line containing one of these substrings (English and Chinese
# phrases) is skipped by load().
stop_words = ['created group', 'business account', 'Messages and calls are end-to-end encrypted', 'is a contact', 'deleted this message', 'voice call', '未接语音通话', '消息和通话都进行端到端加密', '语音通话', '这条消息已被删除']
# Folder holding the exported chats; the trailing slash matters because paths
# are built by plain string concatenation.
DIRECTORY = './data/'
# os.listdir returns entries in arbitrary order, so sort to make the load order
# (and therefore every downstream result) reproducible across runs and machines.
FILES = sorted(f for f in os.listdir(DIRECTORY) if f.endswith('.txt'))
print(FILES)

def get_datapoint(line: str):
    """Split one line of a chat export into its parts.

    Returns a ``(date, author, message)`` tuple of strings when the line
    contains a timestamp followed by ``author: message``. Any other line is
    returned as a one-element tuple holding the stripped line, which lets the
    caller tell the two cases apart by length.
    """
    found = re.search(r'\[?(\d{2}\/\d{2}\/\d{2} \d{2}:\d{2}:\d{2}|\d{1,2}\/\d{1,2}\/\d{2}, \d{2}:\d{2})\]? (?:- )?(.*?): ?(.+)', line)
    if found is not None:
        # The three capture groups are, in order: timestamp, sender, text.
        timestamp, sender, text = found.groups()
        return timestamp, sender, text
    return (line.strip(),)


def _timestamp_matches(text, fmt):
    """Return True when ``text`` parses completely with the strptime pattern ``fmt``."""
    try:
        datetime.strptime(text, fmt)
    except ValueError:
        return False
    return True


def get_datetime_format(filename, directory=None):
    """Detect which strptime pattern the timestamps of a chat export use.

    Every line that starts with something delimited by ``]`` or `` -`` is tried
    against the remaining candidate patterns, and candidates that fail are
    discarded. Lines whose prefix matches no candidate at all are ignored,
    because they are message text rather than timestamps.

    Args:
        filename: Name of the export file inside ``directory``.
        directory: Folder containing the file. Defaults to the module-level
            ``DIRECTORY``.

    Returns:
        The first remaining candidate, in preference order. If the file holds
        no usable timestamp this is the first pattern in the list.
    """
    if directory is None:
        directory = DIRECTORY
    candidates = ["%d/%m/%y, %H:%M", "%m/%d/%y, %H:%M", "%d/%m/%y %H:%M:%S", "%d/%m/%Y %H:%M"]
    with open(os.path.join(directory, filename), encoding='utf-8') as fp:
        for line in fp:
            # Drop left-to-right marks so they cannot hide a leading timestamp.
            prefix = re.search(r'^\[?(.*?)(?:\]| -)', line.replace('\u200e', ''))
            if prefix is None:
                continue
            # Build a new list instead of popping while iterating, which would
            # skip the candidate that follows each removed one.
            surviving = [fmt for fmt in candidates if _timestamp_matches(prefix.group(1), fmt)]
            if not surviving:
                # Not a timestamp (e.g. message text containing " -"): ignore it
                # rather than letting it eliminate every candidate.
                continue
            candidates = surviving
            if len(candidates) == 1:
                break
    return candidates[0]


def load(file):
    """Parse one chat export into a list of ``[datetime, author, message]`` rows.

    A line carrying a parseable timestamp starts a new message; every other
    line is treated as a continuation and joined to the current message with a
    single space. Lines containing any entry of ``stop_words`` are skipped.

    Args:
        file: Name of the export file inside ``DIRECTORY``.

    Returns:
        One ``[datetime, author, message]`` list per message, in file order.
    """
    with open(f"{DIRECTORY}{file}", encoding="utf-8") as fp:
        lines = fp.readlines()
    datetime_format = get_datetime_format(file)

    res = []
    current_date, current_author = None, None
    message_buffer = []

    # The first line of the file is skipped.
    # NOTE(review): presumably the export's banner line - confirm.
    for line in lines[1:]:
        if any(phrase in line for phrase in stop_words):
            continue
        line = line.replace('\u200e', '')
        message_data = get_datapoint(line)

        # A line only opens a new message when its timestamp really parses.
        header_date = None
        if len(message_data) == 3:
            try:
                header_date = datetime.strptime(message_data[0], datetime_format)
            except ValueError:
                # Date-like text inside a message body: treat as continuation.
                header_date = None

        if header_date is None:
            # Continuation text before any dated message has nothing to attach
            # to and is dropped, without blocking the messages that follow.
            if current_date is not None:
                message_buffer.append(line.strip())
            continue

        if current_date is not None:
            res.append([current_date, current_author, ' '.join(message_buffer)])
        current_date, current_author = header_date, message_data[1]
        message_buffer = [message_data[2]]

    # Flush the message still being assembled when the file ends.
    if current_date is not None:
        res.append([current_date, current_author, ' '.join(message_buffer)])

    print(f"Loaded {file} with {len(res)} datapoints")
    return res

In [None]:
# Parse every export file and pool all rows into one flat list.
data = [datapoint for chat_file in FILES for datapoint in load(chat_file)]
print(len(data), "datapoints in total.")

In [None]:
# Order the messages chronologically BEFORE deriving time_delta, so each delta
# is the gap to the row directly above it in the final frame.
# kind="stable" keeps the original file order for messages that share a
# timestamp (exports may only have minute resolution).
df = (
    pd.DataFrame(data, columns=["Date", 'Author', 'Message'])
    .sort_values(by="Date", kind="stable")
    .reset_index(drop=True)
)
# Seconds elapsed since the previous message; NaN for the first row.
df["time_delta"] = df["Date"].diff().dt.total_seconds()

authors = list(df.Author.unique())
authors

In [None]:
# Preview the first rows of the chronologically sorted messages.
df.head(n=5)

## Exploratory Analysis

Here we analyze the number of messages per day

In [None]:
# Number of messages on each calendar day that has at least one message.
grouped = df.groupby(df['Date'].dt.date)['Message'].count().reset_index()

fig, ax = plt.subplots(figsize=(20, 5))
sns.lineplot(data=grouped, x='Date', y='Message', ax=ax)
# Shade the area under the line.
ax.fill_between(grouped['Date'], grouped['Message'], color='skyblue', alpha=0.5)

ax.set_xlabel('Date')
ax.set_ylabel('Number of Messages')
ax.set_title('Number of Messages Sent per Day')
ax.set_ylim(0)
ax.grid(True)
plt.show()

In [None]:
# Per-day counts over the full date range, including days without messages.
resampled = df.set_index('Date').resample('D').count().fillna(0)
daily_messages = resampled['Message']

max_date = daily_messages.idxmax()
max_messages = daily_messages.loc[max_date]
avg_messages = daily_messages.mean()

print(f"Most messages sent on {max_date.strftime('%Y-%m-%d')} at {max_messages} messages")
print(f"On average, {avg_messages} messages are sent per day")

In [None]:
df['Day'] = df['Date'].dt.day_name()

weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# fill_value=0 keeps the counts as integers when a weekday has no messages.
# Without it the reindex inserts NaN, the column becomes float, and the
# integer annotation format fmt='d' below raises a ValueError.
pivot = (
    df.pivot_table(index='Day', values='Message', aggfunc='count')
    .reindex(weekday_order, fill_value=0)
)

plt.figure(figsize=(12, 5))
sns.heatmap(pivot, annot=True, fmt='d')
plt.title('Number of messages sent per day of the week')
plt.ylabel('Day of the week')
plt.xlabel('Number of messages')
plt.show()

In [None]:
df['Hour'] = df["Date"].dt.hour
# Reindex to all 24 hours so quiet hours appear as 0 instead of silently
# disappearing from the axis; fill_value=0 keeps the counts as integers for fmt='d'.
pivot = (
    df.pivot_table(index='Hour', values='Message', aggfunc='count')
    .reindex(pd.RangeIndex(24, name='Hour'), fill_value=0)
)

plt.figure(figsize=(12, 10))
sns.heatmap(pivot, annot=True, fmt='d')
plt.title('Number of messages sent per hour of the day')
plt.ylabel('Hour of the day')
plt.xlabel('Number of messages')
plt.show()

## Data Cleaning

a) Removed specific text patterns (e.g., `<Media omitted>`, URLs)

b) Filtered out excessively long messages (more than 50 words; the cutoff is a fixed value rather than a computed 99th percentile)

c) Merged consecutive messages from the same author (up to 50 words per merged message)

d) Standardized punctuation and removed repetitive characters

In [None]:
# Patterns are joined into one regex alternation, so each entry is a regular
# expression. The URL pattern is a raw string so that \S reaches the regex
# engine instead of being an invalid string escape.
# NOTE(review): '13135550002' is presumably a phone number to scrub - confirm.
texts_to_remove = ['<Media omitted>', '<This message was edited>', r'https?://\S+', '13135550002']
has_unwanted_text = df['Message'].str.contains('|'.join(texts_to_remove), regex=True, na=False)
# Messages that are empty or whitespace-only carry no content.
has_content = df['Message'].str.strip().astype(bool)
# Build a fresh frame instead of mutating a filtered slice in place.
df = df[~has_unwanted_text & has_content].reset_index(drop=True)
df.shape

Remove excessively long messages

In [None]:
# Whitespace-separated token count of every message.
df['Word_Count'] = [len(text.split()) for text in df['Message']]
print("Removing messages with more than 50 words")
# Keep only messages of at most 50 words.
df = df[df['Word_Count'].le(50)]
df.shape

Join consecutive messages sent by the same author into a single message (up to 50 words in total), lengthening the responses. Each merged message keeps the timestamp of its earliest part.

In [None]:
# Merge runs of consecutive messages from the same author into one message,
# as long as the merged text stays within 50 words. No time gap is checked.
merged_messages = []
current_author = None
current_message = []
current_datetime = None
current_word_count = 0

# The loop variable is called `timestamp`, not `datetime`: at notebook scope
# that name would overwrite the imported datetime class and break load() and
# get_datetime_format() if they are re-run after this cell.
for author, message, timestamp, word_count in zip(df['Author'], df['Message'], df['Date'], df['Word_Count']):
    same_run = bool(current_message) and author == current_author
    if same_run and current_word_count + word_count <= 50:
        current_message.append(message)
        current_word_count += word_count
        # A merged message keeps the earliest timestamp of its parts.
        current_datetime = min(current_datetime, timestamp)
    else:
        if current_message:
            merged_messages.append({
                'Author': current_author,
                'Message': ' '.join(current_message),
                'Datetime': current_datetime
            })

        current_author = author
        current_message = [message]
        current_datetime = timestamp
        current_word_count = word_count

# Emit the run still being assembled when the rows are exhausted.
if current_message:
    merged_messages.append({
        'Author': current_author,
        'Message': ' '.join(current_message),
        'Datetime': current_datetime
    })

# Explicit columns keep the frame's schema intact even when nothing was merged.
df = pd.DataFrame(merged_messages, columns=['Author', 'Message', 'Datetime'])

In [None]:
# Preview the first rows of the merged messages.
df.head(n=5)

Here we collapse repeated punctuation and then look at the word count distribution of the merged messages.

In [None]:
# Collapse every run of one repeated non-word, non-space character to a single
# occurrence (e.g. "!!!" -> "!", "..." -> "."). This already covers repeated
# '!', '?' and '.', so a separate pass for those characters would change nothing.
df['Message'] = df['Message'].str.replace(r'([^\w\s])\1+', r'\1', regex=True)

# Recompute the word counts on the merged, normalized messages.
df['Word_Count'] = df['Message'].apply(lambda x: len(x.split()))
print(df['Word_Count'].describe())
sns.catplot(x="Word_Count", data=df, kind="count", aspect=3)

Now convert the messages into conversation format. A new conversation starts whenever the gap between two consecutive messages is longer than 3 minutes.

In [None]:
def to_sharegpt_format(df, target_author='world.soup'):
    """Convert a frame of messages into a list of ShareGPT-style turns.

    Args:
        df: Frame with ``Author`` and ``Message`` columns, one row per turn.
        target_author: Author whose messages are labelled ``'gpt'``; every
            other author is labelled ``'human'``.

    Returns:
        A list of ``{'from': ..., 'value': ...}`` dicts in row order.
    """
    return [
        {'from': 'gpt' if author == target_author else 'human', 'value': message}
        for author, message in zip(df['Author'], df['Message'])
    ]

# A new conversation begins whenever a message arrives more than 3 minutes
# after the previous one; the running count of such gaps is the conversation id.
gap_exceeded = df['Datetime'].diff().gt(pd.Timedelta('3min'))
df['Conversation_ID'] = gap_exceeded.cumsum()
conversations = [group for _, group in df.groupby('Conversation_ID')]
# Keep only conversations that contain more than one turn.
results = [
    turns
    for turns in map(to_sharegpt_format, conversations)
    if len(turns) > 1
]
print(len(results))

In [None]:
# Show one randomly chosen conversation as a spot check.
random.sample(results, k=1)

Save the preprocessed data

In [None]:
import json
# Write UTF-8 explicitly and keep non-ASCII characters unescaped, so the saved
# chat text stays human-readable and does not depend on the platform's default
# encoding. Mode 'w' is enough: the file is only written, never read back here.
with open('conversations.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=4, ensure_ascii=False)