## Imports

In [None]:
import pandas as pd
from pathlib import Path
import re

## Read corpora

In [None]:
# Project root is taken as the parent of the current working directory
# (assumes the notebook is launched from a direct sub-folder of the project).
root = Path.cwd().parent
raw_dir = root / "data" / "corpora" / "raw"

# Locations of the raw train / validation / test splits.
train_path = raw_dir / "train_raw.csv"
validation_path = raw_dir / "validation_raw.csv"
test_path = raw_dir / "test_raw.csv"

# Load each split into its own DataFrame, in train / validation / test order.
train, validation, test = (
    pd.read_csv(csv_path)
    for csv_path in (train_path, validation_path, test_path)
)

# Preprocessing for LSTM

## Remove message length outliers

Outliers are removed from the training data only; the validation and test sets are left unchanged.

In [None]:
# Character length of every training message.
msg_lengths = train["Message"].str.len()
# First and third quartile of the length distribution.
q1, q3 = (msg_lengths.quantile(p) for p in (0.25, 0.75))

def calculate_scaled_IQR(q1, q3, scaling_factor = 1.5):
    """Return outlier fences derived from the inter-quartile range.

    Args:
        q1: First quartile of the distribution.
        q3: Third quartile of the distribution.
        scaling_factor: Multiple of the inter-quartile range added above
            ``q3`` and subtracted below ``q1``.

    Returns:
        A tuple ``(upper_boundary, lower_boundary)`` -- upper fence first.
        Both values are converted with ``int()``, i.e. truncated toward zero.
    """
    spread = q3 - q1
    margin = scaling_factor * spread
    return int(q3 + margin), int(q1 - margin)

# calculate_scaled_IQR returns the upper bound first, then the lower bound.
upper, lower = calculate_scaled_IQR(q1, q3)
print(upper, lower, sep="\n")

# Own decision: the computed lower bound is discarded and fixed at 20 characters.
lower = 20
    

In [None]:
def remove_message_length_outliers(data, lower, upper):
    """Keep only the rows whose ``Message`` length lies in ``[lower, upper]``.

    Args:
        data: DataFrame with a string column named ``Message``.
        lower: Smallest message length (in characters) that is kept.
        upper: Largest message length (in characters) that is kept.

    Returns:
        A new, independent DataFrame containing the rows that pass the
        filter. The original index labels are preserved and ``data`` itself
        is not modified.
    """
    lengths = data["Message"].str.len()
    # Both bounds are inclusive. Missing messages have a NaN length, which
    # `between` evaluates to False, so those rows are dropped as well.
    mask = lengths.between(lower, upper, inclusive='both')
    # Return an explicit copy: assigning columns on the bare `data[mask]`
    # selection raises SettingWithCopyWarning in pandas without copy-on-write.
    return data[mask].copy()
    
# Filter the training split only; validation and test are not passed through this step.
train = remove_message_length_outliers(train, lower=lower, upper=upper)

## Replace emails, URLs, phone numbers and numbers with placeholder tokens

In [None]:
# TODO: normalize email and url (contains spaces to obfuscate)
# TODO: revise regexes after normalization, repeating the todo in eda-1
regex_email = r'\b[a-zA-Z0-9](?:[a-zA-Z0-9._-]*[a-zA-Z0-9])?@[a-zA-Z0-9](?:[a-zA-Z0-9.-]*[a-zA-Z0-9])?\.[a-zA-Z]{2,}\b'
# No `^`/`$` anchors: URLs must be found anywhere inside a message, not only
# when the whole message is a single URL.
regex_url = r'https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
regex_phone = r'\b(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b'
regex_num = r'\b\d+(?:\.\d+)?\b'


def preserve_email_url_phone_num(message):
    """Replace emails, URLs, phone numbers and numbers with placeholder tokens.

    Args:
        message: Text to process. It is converted with ``str()`` first, so
            non-string values (e.g. ``None``) are processed as their string
            representation.

    Returns:
        The message with every match replaced by ``<MAIL>``, ``<URL>``,
        ``<PHONE>`` or ``<NUM>``.
    """
    message = str(message)
    # Order matters: the most specific patterns run first so that digits
    # inside an address, URL or phone number are not consumed as <NUM> before
    # the enclosing entity has been recognised.
    message = re.sub(regex_email, '<MAIL>', message)
    message = re.sub(regex_url, '<URL>', message)
    message = re.sub(regex_phone, '<PHONE>', message)
    message = re.sub(regex_num, '<NUM>', message)
    return message

# Apply the placeholder substitution to the Message column of every split.
for split_df in (train, validation, test):
    split_df['Message'] = split_df['Message'].apply(preserve_email_url_phone_num)

## Collapse repeated characters and whitespace

In [None]:
_WHITESPACE_RUN = re.compile(r'\s+')
_LONG_CHAR_RUN = re.compile(r'(.)\1{2,}')


def collapse_repeated_chars_and_spaces(message):
    """Normalise whitespace and shorten long runs of a repeated character.

    Args:
        message: Text to process; converted with ``str()`` first.

    Returns:
        The text with every whitespace run replaced by a single space,
        leading/trailing whitespace removed, and any character repeated
        three or more times in a row reduced to exactly two occurrences.
    """
    # Whitespace is normalised first, so runs of spaces never reach the
    # repeated-character pattern.
    text = _WHITESPACE_RUN.sub(' ', str(message)).strip()
    return _LONG_CHAR_RUN.sub(r'\1\1', text)

# Apply the whitespace / repeated-character clean-up to every split.
for split_df in (train, validation, test):
    split_df['Message'] = split_df['Message'].apply(collapse_repeated_chars_and_spaces)
    

## Save LSTM-preprocessed corpora

In [None]:
# Destination folder for the preprocessed splits.
processed_dir = root / "data" / "corpora" / "processed"
# Create the folder when it is missing; `to_csv` does not create parent
# directories and would otherwise fail on a fresh checkout.
processed_dir.mkdir(parents=True, exist_ok=True)

train_path_processed = processed_dir / "train-pp.csv"
validation_path_processed = processed_dir / "validation-pp.csv"
test_path_processed = processed_dir / "test-pp.csv"

# The DataFrame index is not written to the files.
train.to_csv(train_path_processed, index=False)
validation.to_csv(validation_path_processed, index=False)
test.to_csv(test_path_processed, index=False)