### building a spam classifier

In [1]:
import pandas as pd
import numpy as np

In [16]:
from pathlib import Path
import tarfile

def fetch_spam_data():
    spam_path = Path("Spam")
    if not spam_path.is_dir():
        raise FileNotFoundError(f"The folder {spam_path} does not exist.")

    easy_ham_path = spam_path / "easy_ham"
    spam_path = spam_path / "spam"
    
    if not easy_ham_path.is_dir() or not spam_path.is_dir():
        raise FileNotFoundError("The required subdirectories (easy_ham, spam) are missing in the Spam folder.")
    
    return [easy_ham_path, spam_path]

# Example usage
spam_data_paths = fetch_spam_data()
print("Easy Ham Path:", spam_data_paths[0])
print("Spam Path:", spam_data_paths[1])

Easy Ham Path: Spam/easy_ham
Spam Path: Spam/spam


In [17]:
ham_dir, spam_dir = fetch_spam_data()

In [18]:
ham_filenames = [f for f in sorted(ham_dir.iterdir()) if len(f.name) > 20]
spam_filenames = [f for f in sorted(spam_dir.iterdir()) if len(f.name) > 20]

In [19]:
len(ham_filenames)


2551

In [20]:
len(spam_filenames)

501

In [23]:
# Let's use python's email module to parse these emails (this handles headers, encoding and so on):
import email.parser
import email, email.policy

def load_email (filepath):
    with open(filepath, "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [24]:
ham_emails = [load_email(filepath) for filepath in ham_filenames]
spam_emails = [load_email(filepath) for filepath in spam_filenames]

In [26]:
# Let's look at one of the examples
print(ham_emails[1].get_content().strip())

Martin A posted:
Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the
 limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the
 Mount Athos monastic community, was ideal for the patriotic sculpture. 
 
 As well as Alexander's granite features, 240 ft high and 170 ft wide, a
 museum, a restored amphitheatre and car park for admiring crowds are
planned
---------------------
So is this mountain limestone or granite?
If it's limestone, it'll weather pretty fast.

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/


In [28]:
print(spam_emails[1].get_content().strip())

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<HTML><HEAD>
<META content="text/html; charset=windows-1252" http-equiv=Content-Type>
<META content="MSHTML 5.00.2314.1000" name=GENERATOR></HEAD>
<BODY><!-- Inserted by Calypso -->
<TABLE border=0 cellPadding=0 cellSpacing=2 id=_CalyPrintHeader_ rules=none 
style="COLOR: black; DISPLAY: none" width="100%">
  <TBODY>
  <TR>
    <TD colSpan=3>
      <HR color=black noShade SIZE=1>
    </TD></TR></TD></TR>
  <TR>
    <TD colSpan=3>
      <HR color=black noShade SIZE=1>
    </TD></TR></TBODY></TABLE><!-- End Calypso --><!-- Inserted by Calypso --><FONT 
color=#000000 face=VERDANA,ARIAL,HELVETICA size=-2><BR></FONT></TD></TR></TABLE><!-- End Calypso --><FONT color=#ff0000 
face="Copperplate Gothic Bold" size=5 PTSIZE="10">
<CENTER>Save up to 70% on Life Insurance.</CENTER></FONT><FONT color=#ff0000 
face="Copperplate Gothic Bold" size=5 PTSIZE="10">
<CENTER>Why Spend More Than You Have To?
<CENTER><FONT color=#ff0000 face="Copp

In [29]:
# Some emails are actually mulipart, with images and attachemtns (which can have their own attachments). Let's look at the various types of structures we have:
def get_email_structure(email):
    if isinstance(email, str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        mulitpart = ", ".join([get_email_structure(sub_email)
                               for sub_email in payload])
        return f"mulipart({mulitpart})"
    else:
        return email.get_content_type()

In [30]:
from collections import Counter

def structures_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1
    return structures

In [31]:
structures_counter(ham_emails).most_common()

[('text/plain', 2453),
 ('mulipart(text/plain, application/pgp-signature)', 72),
 ('mulipart(text/plain, text/html)', 8),
 ('mulipart(text/plain, text/plain)', 4),
 ('mulipart(text/plain)', 3),
 ('mulipart(text/plain, application/octet-stream)', 2),
 ('mulipart(text/plain, text/enriched)', 1),
 ('mulipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('mulipart(mulipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('mulipart(text/plain, video/mng)', 1),
 ('mulipart(text/plain, mulipart(text/plain))', 1),
 ('mulipart(text/plain, application/x-pkcs7-signature)', 1),
 ('mulipart(text/plain, mulipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('mulipart(text/plain, mulipart(text/plain, text/plain), mulipart(mulipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('mulipart(text/plain, application/x-java-applet)', 1)]

In [32]:
structures_counter(spam_emails).most_common()

[('text/plain', 222),
 ('text/html', 181),
 ('mulipart(text/plain, text/html)', 45),
 ('mulipart(text/html)', 19),
 ('mulipart(text/plain)', 19),
 ('mulipart(mulipart(text/html))', 5),
 ('mulipart(text/plain, image/jpeg)', 3),
 ('mulipart(text/html, application/octet-stream)', 2),
 ('mulipart(text/plain, application/octet-stream)', 1),
 ('mulipart(text/html, text/plain)', 1),
 ('mulipart(mulipart(text/html), application/octet-stream, image/jpeg)', 1),
 ('mulipart(mulipart(text/plain, text/html), image/gif)', 1),
 ('multipart/alternative', 1)]

In [38]:
# Now we will take a look at the email headers
for header, value in spam_emails[3].items():
    print(header, ":", value)

Return-Path : <sabrina@mx3.1premio.com>
Delivered-To : zzzz@localhost.example.com
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.example.com (Postfix) with ESMTP id 1E90847C66	for <zzzz@localhost>; Thu, 22 Aug 2002 09:44:02 -0400 (EDT)
Received : from mail.webnote.net [193.120.211.219]	by localhost with POP3 (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 14:44:03 +0100 (IST)
Received : from email.qves.com (email1.qves.net [209.63.151.251] (may be forged))	by webnote.net (8.9.3/8.9.3) with ESMTP id OAA04953	for <zzzz@example.com>; Thu, 22 Aug 2002 14:37:23 +0100
Received : from qvp0086 ([169.254.6.17]) by email.qves.com with Microsoft SMTPSVC(5.0.2195.2966);	 Thu, 22 Aug 2002 07:36:20 -0600
From : Slim Down <sabrina@mx3.1premio.com>
To : zzzz@example.com
Subject : Guaranteed to lose 10-12 lbs in 30 days                          11.150
Date : Thu, 22 Aug 2002 07:36:19 -0600
Message-ID : <9a63c01c249e0$e5a9d610$1106fea9@freeyankeedom.com>
MIME-Versi

In [37]:
spam_emails[3]["Subject"]

'Guaranteed to lose 10-12 lbs in 30 days                          11.150'

In [39]:
# Before we learn to much about the data let's split the data into training and test 
from sklearn.model_selection import train_test_split

X = np.array(ham_emails + spam_emails, dtype=object)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [40]:
import re
from html import unescape

def html_to_plain_text(html):
    text = re.sub('<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    text = re.sub('<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
    text = re.sub('<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)

  text = re.sub('<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)


In [41]:
html_spam_emails = [email for email in X_train[y_train==1]
                    if get_email_structure(email) == "text/html"]
sample_html_spam = html_spam_emails[7]
print(sample_html_spam.get_content().strip()[:1000], "...")

<html>
<head>
</head>
<center>
<h1>
<b><font face="Arial Black"><font color="#0000FF"><font size=+2>&nbsp;
Free Personal and Business Grants</font></font></font></b></h1></center>

<p>&nbsp;
<center><table BORDER=0 CELLSPACING=0 CELLPADDING=10 WIDTH="419" BGCOLOR="#0000FF" >
<tr>
<td WIDTH="397" BGCOLOR="#FFFF00">
<center>
<h2>
<font face="Arial Narrow">" Qualify for <u>at least</u> $25,000 in free
grants money - Guaranteed! "</font></h2></center>
</td>
</tr>
</table></center>

<center>
<h3>
<font face="Arial"><font size=+0>Each day over One Million Dollars in Free
Government<br>
Grants&nbsp; is given away to people just like you for a wide<br>
variety of Business And Personal Needs</font></font></h3></center>
<font face="Verdana"><font size=-1>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
Dear Grant Seeker,</font></font>
<blockquote><font face="Verdana"><font size=-1>In a moment, I'll tell you
exactly <b>HOW &amp; WHERE</b> to get Grants. This <b>MONEY</b> has to
be given away, <b>WHY</b

In [42]:
print(html_to_plain_text(sample_html_spam.get_content())[:1000], "...")

TypeError: 'NoneType' object is not subscriptable

In [43]:
import numpy as np
import pandas as pd

# Check if NumPy is using the correct BLAS library
np.__config__.show()

# Perform a simple operation to see performance
arr = np.random.rand(1000, 1000)
%timeit np.dot(arr, arr)

Build Dependencies:
  blas:
    detection method: pkgconfig
    found: true
    include directory: /opt/arm64-builds/include
    lib directory: /opt/arm64-builds/lib
    name: openblas64
    openblas configuration: USE_64BITINT=1 DYNAMIC_ARCH=1 DYNAMIC_OLDER= NO_CBLAS=
      NO_LAPACK= NO_LAPACKE= NO_AFFINITY=1 USE_OPENMP= SANDYBRIDGE MAX_THREADS=3
    pc file directory: /usr/local/lib/pkgconfig
    version: 0.3.23.dev
  lapack:
    detection method: internal
    found: true
    include directory: unknown
    lib directory: unknown
    name: dep4335021056
    openblas configuration: unknown
    pc file directory: unknown
    version: 1.26.4
Compilers:
  c:
    args: -fno-strict-aliasing, -DBLAS_SYMBOL_SUFFIX=64_, -DHAVE_BLAS_ILP64
    commands: cc
    linker: ld64
    linker args: -fno-strict-aliasing, -DBLAS_SYMBOL_SUFFIX=64_, -DHAVE_BLAS_ILP64
    name: clang
    version: 14.0.0
  c++:
    args: -DBLAS_SYMBOL_SUFFIX=64_, -DHAVE_BLAS_ILP64
    commands: c++
    linker: ld64
    linker