# Data collection

## imports

In [None]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split


## Read data and make a copy of it

In [None]:
# The project root is taken to be the parent of the current working directory.
root = Path.cwd().parent
raw_corpus_path = root.joinpath("datasets", "enron_spam_data.csv")

# Load the raw CSV, then work on a deep copy so `raw_corpus` itself is not
# modified by the cleaning steps applied to `corpus`.
raw_corpus = pd.read_csv(raw_corpus_path)
corpus = raw_corpus.copy(deep=True)

# Preview the first five rows.
print(corpus.head(n=5))

## Columns and shape
Drop the `Message ID` and `Date` columns.

In [None]:
# Report the size of the corpus and the names of its columns.
shape = corpus.shape  # (number of rows, number of columns)
columns = list(corpus.columns)
# Fixed typo in the printed message: "entires" -> "entries".
print(f"Number of entries: {shape[0]}, number of columns: {shape[1]}")
print(f'columns: {columns}')

In [None]:
# Remove the two listed columns from `corpus` in place, then show which
# columns remain.
corpus.drop(columns=['Message ID', 'Date'], inplace=True)
columns = corpus.columns.tolist()
print(columns)

## Detect empty values
The number of empty values is negligible -> delete the rows that contain them

In [None]:
# Count the missing (NaN / None) entries in each column.
missing_mask = corpus.isna()
empty_values = missing_mask.sum()
print(empty_values)

In [None]:
# Discard every row that has at least one missing value.
corpus = corpus.dropna(axis=0, how="any")

## Detect duplicates
drop duplicate rows

In [None]:
# Flag rows that are exact repeats of an earlier row (the first occurrence is
# not flagged). The mask is computed once and reused for the count, the
# preview and the removal, instead of re-scanning the frame each time.
duplicate_mask = corpus.duplicated()

count_duplicates = duplicate_mask.sum()
print(count_duplicates)

# Show the rows that are about to be removed.
duplicate_messages = corpus[duplicate_mask]
print(duplicate_messages)

# Keep only the rows not flagged as duplicates; this matches
# `corpus.drop_duplicates()` with its default arguments.
corpus = corpus[~duplicate_mask]

## Label balance inspection
almost 50-50 -> no action


In [None]:
# Absolute number of rows for each value of the 'Spam/Ham' column.
label_counts = corpus['Spam/Ham'].value_counts()
num_of_labels = label_counts.to_dict()
print(num_of_labels)

# The same counts expressed as a fraction of all rows in the corpus.
total_rows = len(corpus)
percentage_of_labels = {
    label: count / total_rows for label, count in num_of_labels.items()
}
print(percentage_of_labels)

## Split into 3 sets

Sets -> train, validation, test

Use stratification to preserve the spam/ham balance

In [None]:
# First split: hold out 30 % of the corpus and keep the rest for training.
# Stratifying on 'Spam/Ham' keeps the label proportions alike in both parts;
# the fixed random_state makes the split reproducible.
train, temporary = train_test_split(
    corpus, test_size=0.3, random_state=42, stratify=corpus['Spam/Ham']
)

# Second split: halve the held-out part into validation and test sets,
# again stratified on the label column and with the same seed.
validation, test = train_test_split(
    temporary, test_size=0.5, random_state=42, stratify=temporary['Spam/Ham']
)


## Saving corpora

In [None]:
# Destination files for the three splits, all inside <root>/datasets.
train_path = root.joinpath("datasets", "train.csv")
validation_path = root.joinpath("datasets", "validation.csv")
test_path = root.joinpath("datasets", "test.csv")

In [None]:
# Write each split to its CSV file; the DataFrame index is not saved.
for split_frame, split_path in (
    (train, train_path),
    (validation, validation_path),
    (test, test_path),
):
    split_frame.to_csv(split_path, index=False)