<a href="https://colab.research.google.com/github/OpenCodeEra/ML-X/blob/main/Email%20Spam%20Classification/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Email Spam Classification

## Data collection and preprocessing

In [None]:
import os

In [None]:
# Root folder that the corpora are extracted into.
dataset_dir = 'datasets'
# Downloaded archives are kept in a sub-folder of the dataset root.
tar_dir = os.path.join(dataset_dir, 'tar')
# Presumably where trained model artifacts are stored; not used in the cells
# visible here - confirm against later cells.
model_dir = 'model'

In [None]:
# Archives from the SpamAssassin public corpus (.tar.bz2): one spam set and
# two non-spam ("ham") sets, labelled easy and hard by the corpus itself.
spams = 'https://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2'
nonspams_easy = 'https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham_2.tar.bz2'
nonspams_hard = 'https://spamassassin.apache.org/old/publiccorpus/20030228_hard_ham.tar.bz2'

## Download the dataset

In [None]:
from urllib.request import urlretrieve
import tarfile
import shutil

In [None]:
def download_dataset(url):
  """Download a tar archive (unless already cached) and extract it.

  The archive is stored in ``tar_dir`` under the last path segment of ``url``
  and unpacked into ``dataset_dir``. A previously extracted copy of the
  archive's top-level directory is removed first so the result is always a
  fresh extraction, and a ``cmds`` file inside that directory is deleted if
  present.

  Args:
    url: location of the tar archive to fetch.

  Returns:
    Path of the extracted top-level directory inside ``dataset_dir``.

  Raises:
    ValueError: if the archive has no members or its first member does not
      name a usable top-level entry.

  Errors raised by the download or by reading the archive propagate to the
  caller instead of being silently swallowed.
  """
  os.makedirs(tar_dir, exist_ok=True)

  filename = url.rsplit('/', 1)[-1]
  tarpath = os.path.join(tar_dir, filename)

  # Reuse the cached archive only when it exists and is a readable tar file;
  # is_tarfile() closes the file it probes, unlike a bare tarfile.open().
  if not (os.path.isfile(tarpath) and tarfile.is_tarfile(tarpath)):
    # Download to a temporary name first so an interrupted transfer never
    # leaves a truncated file at the cache location.
    partial_path = tarpath + '.part'
    urlretrieve(url, partial_path)
    os.replace(partial_path, tarpath)

  with tarfile.open(tarpath) as tar:
    names = tar.getnames()
    if not names:
      raise ValueError('archive %r contains no members' % tarpath)

    # Only the first path component is used, and it is validated because the
    # resulting directory is deleted below.
    top_level = os.path.normpath(names[0]).split(os.sep)[0]
    if top_level in ('', os.curdir, os.pardir):
      raise ValueError('archive %r has no usable top-level directory' % tarpath)
    dirname = os.path.join(dataset_dir, top_level)

    if os.path.isdir(dirname):
      shutil.rmtree(dirname)

    # Use the 'data' extraction filter where the running Python supports it,
    # so members cannot escape dataset_dir.
    if hasattr(tarfile, 'data_filter'):
      tar.extractall(path=dataset_dir, filter='data')
    else:
      tar.extractall(path=dataset_dir)

    cmds_path = os.path.join(dirname, 'cmds')
    if os.path.isfile(cmds_path):
      os.remove(cmds_path)

    return dirname

In [None]:
# Fetch and unpack the three corpora in order; each call returns the directory
# its archive was extracted to.
spam_dir, nonspam_easy_dir, nonspam_hard_dir = (
  download_dataset(archive_url)
  for archive_url in (spams, nonspams_easy, nonspams_hard)
)

## Load dataset

In [None]:
import numpy as np
import glob

def load_dataset(dirpath):
  """Read every file directly inside ``dirpath`` and return their text.

  Files are read as bytes and decoded as UTF-8, dropping any bytes that are
  not valid UTF-8. Entries that are not regular files (e.g. sub-directories)
  are skipped.

  Args:
    dirpath: directory containing one message per file.

  Returns:
    List of decoded file contents, ordered by file path.
  """
  # Escape dirpath so glob metacharacters in the directory name are literal.
  pattern = os.path.join(glob.escape(dirpath), '*')

  # glob returns matches in a filesystem-dependent order; sorting keeps the
  # result (and any seeded shuffle or split applied later) reproducible.
  filepaths = sorted(path for path in glob.glob(pattern) if os.path.isfile(path))

  files = []
  for path in filepaths:
    with open(path, 'rb') as f:
      files.append(f.read().decode('utf-8', errors='ignore'))

  return files

In [None]:
# Read each extracted corpus into a list of message strings.
# NOTE(review): `spams` held the spam archive URL until this point and is
# rebound to the list of spam emails here, so re-running the download cell
# after this one fails. Consider distinct names for the URL and the data.
spams, nonspam_easy, nonspam_hard = (
  load_dataset(corpus_dir)
  for corpus_dir in (spam_dir, nonspam_easy_dir, nonspam_hard_dir)
)

In [None]:
import sklearn.utils

# All non-spam ("ham") messages, easy set followed by hard set.
ham_emails = nonspam_easy + nonspam_hard

# Samples: spam messages first, then every ham message.
X = spams + ham_emails

# Labels in the same order as X: 1.0 marks spam, 0.0 marks ham.
Y_one = np.ones(len(spams))
Y_zero = np.zeros(len(ham_emails))
Y = np.concatenate([Y_one, Y_zero])

In [None]:
# Shuffle samples and labels together, using a fixed seed.
# NOTE(review): X and Y are rebound in place, so re-running this cell shuffles
# the already-shuffled data again; use Restart & Run All for a stable order.
X, Y = sklearn.utils.shuffle(X, Y, random_state=42)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing, stratified on the labels and with
# a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)

In [None]:
# Sanity check: each split should have as many samples as labels.
print(f'{len(X_train)} {len(Y_train)}')
print(f'{len(X_test)} {len(Y_test)}')

2436 2436
610 610


## Data preparation pipeline

In [None]:
def remove_header(email):
  """Drop the header block of a raw email and return what follows it.

  The header block ends at the first empty line (two consecutive newline
  characters). The returned text starts at that separator, so it still
  begins with the two newlines.

  Args:
    email: raw message text, headers followed by the body.

  Returns:
    The text from the first empty line onward, or an empty string when the
    message has no empty line (headers only, no body).
  """
  separator_at = email.find('\n\n')
  # str.index would raise ValueError here and abort the whole pipeline on a
  # single message without a body; treat it as an empty body instead.
  if separator_at == -1:
    return ''
  return email[separator_at:]