## imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from pathlib import Path

## Read train corpus

In [None]:
# The CSV files are expected in a "datasets" folder that sits next to the
# current working directory (i.e. inside the parent of the cwd).
root = Path.cwd().parent
train_path = root.joinpath("datasets", "train.csv")
validation_path = root.joinpath("datasets", "validation.csv")

# Load both splits into DataFrames.
train, validation = (
    pd.read_csv(csv_path) for csv_path in (train_path, validation_path)
)

# Exploratory Data Analysis

### Message length distribution

Apply the 1.5×IQR rule after preprocessing. Maximum token number (upper boundary of the 1.5×IQR rule) = 3353

Also drop messages with a character length < 20 (a manually chosen, reasonable minimum email length, used instead of the lower boundary from the 1.5×IQR rule)

The 1.5×IQR rule comes from John Tukey’s exploratory data analysis (EDA)

In [None]:
# Row masks for the two classes, based on the 'Spam/Ham' label column.
is_spam = train['Spam/Ham'] == 'spam'
is_ham = train['Spam/Ham'] == 'ham'

# Character length of every message, overall and per class.
msg_lengths = train["Message"].str.len()
spam_msg_lengths = train.loc[is_spam, 'Message'].str.len()
ham_msg_lengths = train.loc[is_ham, 'Message'].str.len()

# Summary statistics (count, mean, std, min, quartiles, max) of the lengths.
char_length_statistics = msg_lengths.describe()
spam_char_lengths_statistics = spam_msg_lengths.describe()
ham_char_lengths_statistics = ham_msg_lengths.describe()

print(char_length_statistics)


In [None]:
# Overlay the ham and spam length histograms on one axes. Each series gets a
# label and some transparency so both classes stay visible and identifiable;
# missing lengths are dropped because they cannot be binned.
fig, ax = plt.subplots()
ax.hist(ham_msg_lengths.dropna(), bins=50, alpha=0.6, label="ham")
ax.hist(spam_msg_lengths.dropna(), bins=50, alpha=0.6, label="spam")
ax.set_xlabel("Message length (characters)")
ax.set_ylabel("Count")
ax.set_title("Message Length Distribution")
ax.legend()
plt.show()

In [None]:
# First and third quartile of the message character lengths, read from the
# describe() summary computed earlier.
q1, q3 = char_length_statistics.loc['25%'], char_length_statistics.loc['75%']

def calculate_scaled_IQR(q1, q3, scaling_factor = 1.5):
    """Compute outlier boundaries from the interquartile range.

    Args:
        q1: First quartile (25th percentile).
        q3: Third quartile (75th percentile).
        scaling_factor: Multiplier applied to the IQR (default 1.5).

    Returns:
        A tuple ``(upper_boundary, lower_boundary)`` where the upper boundary
        is ``q3 + scaling_factor * IQR`` and the lower boundary is
        ``q1 - scaling_factor * IQR``, each converted with ``int()``
        (i.e. truncated toward zero).
    """
    margin = scaling_factor * (q3 - q1)
    return int(q3 + margin), int(q1 - margin)

# Boundaries for the message character length from the 1.5*IQR rule.
upper, lower = calculate_scaled_IQR(q1, q3)
print(upper, lower, sep="\n")

# The computed lower boundary is deliberately replaced by a hand-picked
# minimum message length.
lower = 20 # Own decision
    

### Vocabulary size estimation
reduce to -> 30k

In [None]:
# Rough vocabulary estimate: split every training message on whitespace and
# collect the distinct tokens (no lowercasing or other normalization yet).
estimated_tokens_train_set = train['Message'].astype(str).str.split()
vocab = {token for msg in estimated_tokens_train_set for token in msg}
vocab_size = len(vocab)
print(vocab_size)

# Display only a small, deterministic sample; dumping the whole vocabulary
# floods the cell output and set iteration order is not stable across runs.
sorted(vocab)[:20]


### Out of vocabulary rate (OOV) estimation
high oov rate -> check for patterns that can be replaced

In [None]:
# Whitespace-tokenize the validation messages the same way as the training set.
estimated_tokens_validation_set = validation['Message'].astype(str).str.split()

# Count all validation tokens and those that never occur in the training
# vocabulary.
total_validation_tokens = 0
oov_tokens = 0
for message in estimated_tokens_validation_set:
    total_validation_tokens += len(message)
    oov_tokens += sum(token not in vocab for token in message)

# OOV rate = share of validation tokens that are out of vocabulary. The
# denominator is the number of validation tokens, not the training vocabulary
# size; guard against an empty validation set.
oov_rate = oov_tokens / total_validation_tokens if total_validation_tokens else 0.0
print(oov_rate)

### Patterns to replace

Phone numbers and numbers are replaceable

uppercase words -> lowercase

URLs and emails are not found. UPDATE: spaces are used inside them....

The same character repeated 3 or more times in a row -> collapse it into 2 occurrences

In [None]:
# TODO: revise regex after normalized URLs and EMAILs (they contain spaces)
# The URL pattern is intentionally NOT anchored with ^...$: Series.str.count
# counts matches inside each message, and an anchored pattern could only match
# a message that consists of a single URL and nothing else.
regex_url = r'https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
regex_email = r'\b[a-zA-Z0-9](?:[a-zA-Z0-9._-]*[a-zA-Z0-9])?@[a-zA-Z0-9](?:[a-zA-Z0-9.-]*[a-zA-Z0-9])?\.[a-zA-Z]{2,}\b'
# TODO: revise regex for phone, is it usable? Or leave only num?
regex_phone = r'\b(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b'
regex_num = r'\b\d+(?:\.\d+)?\b'
# Words written entirely in uppercase letters. The previous [A-Za-z]* also
# matched lowercase words and empty strings, so its count was meaningless.
regex_uppercase = r'\b[A-Z]+\b'
# Any character (except newline) repeated three or more times in a row.
regex_repeated_char = r'(.)\1{2,}'

# Total number of matches of each pattern over all training messages.
num_of_urls = train['Message'].str.count(regex_url).sum()
num_of_emails = train['Message'].str.count(regex_email).sum()
num_of_phone_numbers = train['Message'].str.count(regex_phone).sum()
num_of_numbers = train['Message'].str.count(regex_num).sum()
num_of_uppercase_words = train['Message'].str.count(regex_uppercase).sum()
# Assigned on its own: a chained assignment here used to overwrite
# num_of_urls with the repeated-character count.
repeated_char_count = train['Message'].str.count(regex_repeated_char).sum()

print(num_of_urls)
print(num_of_emails)
print(num_of_phone_numbers)
print(num_of_numbers)
print(num_of_uppercase_words)
print(repeated_char_count)



### Most frequent spam / ham tokens

Stop words and punctuation -> remove them


In [None]:
def tokenize_raw(text):
    """Lowercase ``text`` and split it on whitespace.

    Args:
        text: The raw message. Non-string values (e.g. ``None`` or the float
            NaN that pandas uses for missing CSV cells) are treated as an
            empty message instead of raising ``AttributeError``.

    Returns:
        A list of lowercase tokens; empty for empty or non-string input.
    """
    if not isinstance(text, str):
        return []
    return text.lower().split()

# Tokenize every message once. Missing messages are replaced by an empty
# string first so they contribute no tokens instead of crashing the tokenizer.
tokens_per_message = train['Message'].fillna('').map(tokenize_raw)

# Rows labelled 'spam'; every other row is counted as ham.
is_spam = train['Spam/Ham'] == 'spam'

# Flatten the per-message token lists into one list per class (row order kept).
spam_tokens = [token for tokens in tokens_per_message[is_spam] for token in tokens]
ham_tokens = [token for tokens in tokens_per_message[~is_spam] for token in tokens]

# The 120 most frequent tokens per class as (token, count) pairs.
spam_counter = Counter(spam_tokens).most_common(120)
ham_counter = Counter(ham_tokens).most_common(120)
print(spam_counter)
print(ham_counter)