In [1]:
import re
import os
import pandas as pd

In [2]:
# Regex fragments that are interpolated into MSG_RE to recognise the header of
# one exported message line.

# Three slash-separated numeric fields of one or two digits each, e.g. 1/23/21.
DATE_RE = '\\d\\d?/\\d\\d?/\\d\\d?'
# One- or two-digit hour and minute, a single regular space, then AM or PM.
TIME_RE = '\\d\\d?:\\d\\d? [AP]M'
# One or more spaces, ASCII letters or digits; any other character (for
# example '+', '-' or non-ASCII letters) is not accepted in a sender name.
SENDER_RE = '[ a-zA-Z0-9]+'

In [3]:
# Compiled pattern for one message line laid out as
# "<date>, <time> - <sender>: <message>". The four parts are exposed as the
# named groups date, time, sender and message (groups 1-4, in that order).
# No DOTALL flag is set, so the message group stops at the end of the line.
MSG_RE = re.compile(f'(?P<date>{DATE_RE}), (?P<time>{TIME_RE}) - (?P<sender>{SENDER_RE}): (?P<message>.+)')

In [4]:
# Directory that is scanned for exported chat logs (*.txt files).
WHATSAPP_LOG_DIR = '../data/WhatsAppRaw/'
# Directory that receives one CSV per processed chat.
WHATSAPP_RESULTS_DIR = '../data/WhatsAppCleaned/'

In [5]:
def create_str_from_log_file(log_file):
  """Read an exported chat log and return it with one message per line.

  Every line of the file is stripped and has narrow no-break spaces replaced
  by regular spaces. A line that matches ``MSG_RE`` starts a new record; any
  other line is treated as a continuation and is appended to the current
  record after a single space. The records are joined with newlines.

  Args:
    log_file: Path of the exported chat log; it is read as UTF-8.

  Returns:
    The normalised log as one string ('' when the file is empty).
  """
  records = []
  with open(log_file, 'r', encoding='utf-8') as file:
    for line in file:
      line = line.strip()
      # TIME_RE requires a regular space before AM/PM. Because the file is
      # decoded as UTF-8, a narrow no-break space arrives as U+202F, which the
      # original mojibake-only pattern never matched. The mojibake spelling is
      # still replaced so previously handled (double-encoded) logs keep working.
      line = line.replace('\u202f', ' ').replace('â€¯', ' ')
      if MSG_RE.match(line):
        records.append(line)
      elif records:
        # Continuation of a multi-line message: fold it into the current record.
        records[-1] += ' ' + line
      else:
        # Text before the first message; the leading space matches what the
        # string-concatenating implementation produced for this case.
        records.append(' ' + line)
  return '\n'.join(records)

def create_df_from_str(content):
  """Parse a normalised chat log into a DataFrame with one row per message.

  Args:
    content: Text in which ``MSG_RE`` is searched for message lines.

  Returns:
    A DataFrame with the columns date, time, sender, message and raw, where
    raw holds the full text of each match.
  """
  group_names = ('date', 'time', 'sender', 'message')
  columns = {name: [] for name in (*group_names, 'raw')}

  for hit in MSG_RE.finditer(content):
    for name in group_names:
      columns[name].append(hit.group(name))
    columns['raw'].append(hit.group(0))

  return pd.DataFrame(columns)

In [6]:
def add_context(chat_df, col_to_cat='raw', new_col_name='full_context', context_len=3):
  """Add a column holding each row's text together with its neighbouring rows.

  For every row the values of ``col_to_cat`` from up to ``context_len`` rows
  before and after it are joined with newlines in row order (oldest first),
  and the result is stripped of surrounding whitespace. Rows near the start or
  end of the frame simply get a shorter window.

  The frame is modified in place; no temporary columns are added to it.

  Args:
    chat_df: DataFrame to extend. ``col_to_cat`` is expected to hold strings
      (missing values are treated as empty strings).
    col_to_cat: Name of the column whose values are concatenated.
    new_col_name: Name of the column that receives the context text.
    context_len: Number of rows taken on each side of the current row.

  Returns:
    None.
  """
  source = chat_df[col_to_cat]

  # shift(k) with k > 0 brings the value from k rows earlier onto the current
  # row; shift(-k) brings the value from k rows later. Listing the earlier
  # rows first keeps the window in reading order and ensures the empty
  # placeholders only ever sit at the outer edges, where strip() removes them.
  window = [source.shift(k) for k in range(context_len, 0, -1)]
  window.append(source)
  window.extend(source.shift(-k) for k in range(1, context_len + 1))

  filled = [series.fillna('') for series in window]
  joined = ['\n'.join(parts).strip() for parts in zip(*filled)]

  # Building the Series explicitly keeps an object dtype even for an empty frame.
  chat_df[new_col_name] = pd.Series(joined, index=chat_df.index, dtype=object)

In [7]:
# Convert every exported chat log in WHATSAPP_LOG_DIR into a CSV in
# WHATSAPP_RESULTS_DIR, one row per message.

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(WHATSAPP_RESULTS_DIR, exist_ok=True)

for filename in os.listdir(WHATSAPP_LOG_DIR):
  if filename.endswith('.txt'):
    # The chat name is the file name without the '.txt' extension and, when
    # present, without the export prefix. Slicing the prefix off blindly would
    # truncate (or empty) the name of any file that does not carry it.
    chat_name = filename[:-len('.txt')]
    if chat_name.startswith('WhatsApp Chat with '):
      chat_name = chat_name[len('WhatsApp Chat with '):]

    log_file = os.path.join(WHATSAPP_LOG_DIR, filename)
    content = create_str_from_log_file(log_file)
    log_df = create_df_from_str(content)
    log_df['PLATFORM'] = 'WhatsApp'
    # Per-message id "<chat name> <row index>". Series.str.cat returns a new
    # Series, so the original un-assigned call left every row with the same id.
    log_df['MSG_ID'] = [f'{chat_name} {row_label}' for row_label in log_df.index]
    add_context(log_df)

    log_df.to_csv(os.path.join(WHATSAPP_RESULTS_DIR, f'{chat_name}.csv'), index=False)