# Data collection

## imports

In [None]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from urllib.request import urlretrieve
import zipfile
from pathlib import Path

## Build data directory tree

In [None]:
# The project root is taken to be the parent of the current working directory.
root = Path.cwd().parent

# Folder layout, expressed relative to the project root.
directories = [
    "data",
    "data/corpora",
    "data/corpora/raw",
    "data/corpora/processed",
    "data/models",
    # TODO: add other directories if needed
    # (example: data/results - for storing model evaluation results)
]

# parents=True creates missing intermediate folders; exist_ok=True makes
# re-running this cell a no-op for folders that are already there.
for relative_dir in directories:
    (root / relative_dir).mkdir(parents=True, exist_ok=True)

## Download raw corpus and unzip it

In [None]:
# Resolve where the raw corpus lives (relative to the project root).
root = Path.cwd().parent
datasets_dir = root.joinpath("data", "corpora", "raw")

# Source archive and its local destination.
url = "https://github.com/MWiechmann/enron_spam_data/raw/master/enron_spam_data.zip"
zip_path = datasets_dir / "enron_spam_data.zip"

# Fetch the archive; urlretrieve writes the response straight to zip_path.
print("Downloading enron_spam_data.zip...")
urlretrieve(url, zip_path)
print("Download complete!")

# Unpack every archive member next to the zip file.
print("Extracting...")
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path=datasets_dir)
print("Extraction complete!")

# Sanity check: report whether the expected CSV is present after extraction.
csv_path = datasets_dir / "enron_spam_data.csv"
if not csv_path.exists():
    print("\n✗ File not found after extraction")
else:
    print(f"\n✓ File extracted: {csv_path}")

## Read data and make a copy of it

In [None]:
# Load the extracted CSV from the raw corpora folder.
raw_corpus_path = root.joinpath("data", "corpora", "raw", "enron_spam_data.csv")
raw_corpus = pd.read_csv(raw_corpus_path)

# Work on a deep copy so the frame as loaded stays untouched in raw_corpus.
corpus = raw_corpus.copy(deep=True)

# Preview the first five rows.
print(corpus.head(5))

## Columns and shape
drop Message ID and Date columns

In [None]:
# Report the size of the corpus and the names of its columns.
shape = corpus.shape
columns = corpus.columns.tolist()
print(f"Number of entires: {shape[0]}, number of columns: {shape[1]}")
print(f"columns: {columns}")

In [None]:
# Remove the two columns in place, then refresh and show the column list.
corpus.drop(columns=['Message ID', 'Date'], inplace=True)
columns = corpus.columns.tolist()
print(columns)

## Detect empty values
Negligible -> delete rows that contain empty values

In [None]:
# Count missing values per column (isna is the same check as isnull).
empty_values = corpus.isna().sum()
print(empty_values)

In [None]:
# Discard every row that has at least one missing value.
corpus = corpus.dropna(axis=0, how="any")

## Detect duplicates
drop duplicate rows

In [None]:
# Build the duplicate mask once and reuse it for both the count and the rows.
duplicate_mask = corpus.duplicated()

# Number of rows flagged as duplicates.
count_duplicates = duplicate_mask.sum()
print(count_duplicates)

# Show the flagged rows before removing them.
duplicate_messages = corpus[duplicate_mask]
print(duplicate_messages)

# Remove the duplicate rows from the working corpus.
corpus = corpus.drop_duplicates()

## Label balance inspection
almost 50-50 -> no action


In [None]:
# Absolute number of rows per label.
num_of_labels = corpus['Spam/Ham'].value_counts().to_dict()
print(num_of_labels)

# Share of each label as a fraction of all rows (values between 0 and 1).
percentage_of_labels = {
    label: count / len(corpus)
    for label, count in num_of_labels.items()
}
print(percentage_of_labels)

## Split into 3 sets

Sets -> train, validation, test

Use stratification, to preserve spam / ham balance

In [None]:
# Step 1: keep 70 % of the rows for training and hold out the other 30 %.
train, temporary = train_test_split(
    corpus, test_size=0.3, stratify=corpus['Spam/Ham'], random_state=42
)

# Step 2: halve the held-out part into validation and test (15 % each overall).
# Stratifying on the label in both steps keeps the spam/ham ratio in every set.
validation, test = train_test_split(
    temporary, test_size=0.5, stratify=temporary['Spam/Ham'], random_state=42
)


## Saving corpora

In [None]:
# Destination files for the three splits, all under data/corpora/raw.
train_path_raw = root.joinpath("data", "corpora", "raw", "train_raw.csv")
validation_path_raw = root.joinpath("data", "corpora", "raw", "validation_raw.csv")
test_path_raw = root.joinpath("data", "corpora", "raw", "test_raw.csv")

# Write each split without the DataFrame index column.
for split_frame, split_path in (
    (train, train_path_raw),
    (validation, validation_path_raw),
    (test, test_path_raw),
):
    split_frame.to_csv(split_path, index=False)