# Exploratory data analysis 1

**INPUT**: Stratified train and validation corpora

**OUTPUT**: Preprocessing decisions

| Step | Decision | Status | Comment |
|------|----------|--------|---------|
| Message length distribution | Remove outliers | Done | Use Tukey's 1.5×IQR rule; the computed lower bound is negative, so the lower bound is set manually |
| Vocabulary size estimation | Between 16-32k | Pending | For an LSTM start with a smaller vocabulary; an LLM can work with a larger one. Read up on heuristics |
| OOV rate estimation | Find patterns to replace | Pending | Chosen patterns: URL, email, phone, uppercase. The regex patterns still need fine-tuning, and emails and URLs need to be normalized |
| Repeated chars | Collapse them | Pending | The same character repeated more than 3 times in a row; collapse each run to 2 characters |
| Special chars | Keep some | Pending | `!`, `?`, `%` and `$` may be meaningful; keep them? Ask the consultant |


## Input & Setup

### imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from pathlib import Path

### Read train corpus

In [None]:
# Project root is the parent of the current working directory.
root = Path.cwd().parent

# Locations of the raw train and validation corpora.
train_path = root.joinpath("data", "corpora", "raw", "train_raw.csv")
validation_path = root.joinpath("data", "corpora", "raw", "validation_raw.csv")

# Load both corpora into DataFrames.
train, validation = pd.read_csv(train_path), pd.read_csv(validation_path)

## Steps

### Message length distribution

In [None]:
# Character length of every train message and its summary statistics.
msg_lengths = train["Message"].str.len()
char_length_statistics = msg_lengths.describe()

# Row masks for each class, taken from the 'Spam/Ham' label column.
is_spam = train['Spam/Ham'] == 'spam'
is_ham = train['Spam/Ham'] == 'ham'

# Per-class message lengths and their summary statistics.
spam_msg_lengths = train.loc[is_spam, 'Message'].str.len()
spam_char_lengths_statistics = spam_msg_lengths.describe()

ham_msg_lengths = train.loc[is_ham, 'Message'].str.len()
ham_char_lengths_statistics = ham_msg_lengths.describe()

# Only the overall statistics are printed.
print(char_length_statistics)


In [None]:
# Overlay the ham and spam message-length histograms in one figure.
# Labels plus partial transparency keep both distributions identifiable
# where the bars overlap (opaque, unlabeled bars hide the first histogram).
plt.figure()
plt.hist(ham_msg_lengths, bins=50, alpha=0.6, label="ham")
plt.hist(spam_msg_lengths, bins=50, alpha=0.6, label="spam")
plt.xlabel("Message length (characters)")
plt.ylabel("Count")
plt.title("Message Length Distribution")
plt.legend()
plt.show()

In [None]:
# First and third quartiles of the overall message lengths, read from the
# describe() summary by label.
q1, q3 = char_length_statistics.loc['25%'], char_length_statistics.loc['75%']

def calculate_scaled_IQR(q1, q3, scaling_factor = 1.5):
    """Return the outlier fences derived from the interquartile range.

    The fences lie ``scaling_factor`` times the IQR (``q3 - q1``) above the
    third quartile and below the first quartile. Both are truncated toward
    zero with ``int``.

    Args:
        q1: First quartile.
        q3: Third quartile.
        scaling_factor: Multiplier applied to the IQR (default 1.5).

    Returns:
        A ``(upper_boundary, lower_boundary)`` tuple of ints. Note the order:
        the upper fence comes first.
    """
    margin = scaling_factor * (q3 - q1)
    return int(q3 + margin), int(q1 - margin)

# Outlier fences for message length with the default 1.5 scaling factor.
upper, lower = calculate_scaled_IQR(q1, q3)
print(upper, lower, sep="\n")

# Manual override of the computed lower fence (own decision).
lower = 20
    

### Vocabulary size estimation

In [None]:
# Rough tokenization: cast every train message to str and split on whitespace.
estimated_tokens_train_set = train['Message'].astype(str).str.split()

# Distinct whitespace-delimited tokens seen in the train corpus.
vocab = {token for tokens in estimated_tokens_train_set for token in tokens}
vocab_size = len(vocab)
print(vocab_size)

# Trailing expression: shows the vocabulary for manual inspection when this
# is run as a notebook cell.
list(vocab)


### Out of vocabulary rate (OOV) estimation

In [None]:
# Estimate the out-of-vocabulary (OOV) rate: the share of validation token
# occurrences that never appear in the train vocabulary.
oov_tokens = 0
total_tokens = 0

# Same rough tokenization as used for the train vocabulary
# (cast to str, split on whitespace).
estimated_tokens_validation_set = validation['Message'].astype(str).str.split()

for message in estimated_tokens_validation_set:
    total_tokens += len(message)
    oov_tokens += sum(token not in vocab for token in message)

# Normalize by the number of validation tokens, not by the vocabulary size:
# dividing occurrences by a count of distinct train types is not a rate and
# can exceed 1. An empty validation set yields 0.0 instead of raising.
oov_rate = oov_tokens / total_tokens if total_tokens else 0.0
print(oov_rate)

### Patterns to replace

In [None]:
# TODO: revise regexes once URLs and emails are normalized (they contain spaces)
# The URL pattern is deliberately not anchored with ^...$: an anchored pattern
# only matches a message that is a URL in its entirety, so URLs embedded in
# running text would never be counted.
regex_url = r'\bhttps?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
regex_email = r'\b[a-zA-Z0-9](?:[a-zA-Z0-9._-]*[a-zA-Z0-9])?@[a-zA-Z0-9](?:[a-zA-Z0-9.-]*[a-zA-Z0-9])?\.[a-zA-Z]{2,}\b'
# TODO: revise regex for phone, is it usable? Or leave only num?
# Matches 10-digit numbers in 3-3-4 grouping with an optional leading +1/1.
regex_phone = r'\b(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b'
# Integers or decimals with a dot separator.
regex_num = r'\b\d+(?:\.\d+)?\b'
# ASCII-letter words shaped lower*UPPER+lower*UPPER+lower* (e.g. FREE, WiFi).
regex_uppercase = r'\b[a-z]*[A-Z]+[a-z]*[A-Z]+[a-z]*\b'
# Runs of 3 or more identical characters.
regex_repeated_char = r'(.)\1{2,}'

# Total number of matches of each pattern over all train messages.
messages = train['Message']
num_of_urls = messages.str.count(regex_url).sum()
num_of_emails = messages.str.count(regex_email).sum()
num_of_phone_numbers = messages.str.count(regex_phone).sum()
num_of_numbers = messages.str.count(regex_num).sum()
num_of_uppercase_words = messages.str.count(regex_uppercase).sum()
# Assigned on its own: a chained assignment here would overwrite num_of_urls
# with the repeated-character count.
repeated_char_count = messages.str.count(regex_repeated_char).sum()

print(num_of_urls)
print(num_of_emails)
print(num_of_phone_numbers)
print(num_of_numbers)
print(num_of_uppercase_words)
print(repeated_char_count)

