<a href="https://colab.research.google.com/github/S4vyss/machine-learning/blob/main/SpamDetector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import tarfile
import urllib

DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("zestawy danych", "spam")

def fetch_spam_data(spam_url=SPAM_URL, spam_path=SPAM_PATH):
    if not os.path.isdir(spam_path):
        os.makedirs(spam_path)
    for filename, url in (("ham.tar.bz2", HAM_URL), ("spam.tar.bz2", SPAM_URL)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        tar_bz2_file = tarfile.open(path)
        tar_bz2_file.extractall(path=SPAM_PATH)
        tar_bz2_file.close()

In [2]:
fetch_spam_data()

In [3]:
HAM_DIR = os.path.join(SPAM_PATH, "easy_ham")
SPAM_DIR = os.path.join(SPAM_PATH, "spam")
ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 20]

In [4]:
import email
import email.policy

def load_email(is_spam, filename, spam_path=SPAM_PATH):
    directory = "spam" if is_spam else "easy_ham"
    with open(os.path.join(spam_path, directory, filename), "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [5]:
ham_emails = [load_email(is_spam=False, filename=name) for name in ham_filenames]
spam_emails = [load_email(is_spam=True, filename=name) for name in spam_filenames]

In [6]:
print(ham_emails[0].get_content().strip())

Date:        Wed, 21 Aug 2002 10:54:46 -0500
    From:        Chris Garrigues <cwg-dated-1030377287.06fa6d@DeepEddy.Com>
    Message-ID:  <1029945287.4797.TMDA@deepeddy.vircio.com>


  | I can't reproduce this error.

For me it is very repeatable... (like every time, without fail).

This is the debug log of the pick happening ...

18:19:03 Pick_It {exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace} {4852-4852 -sequence mercury}
18:19:03 exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace 4852-4852 -sequence mercury
18:19:04 Ftoc_PickMsgs {{1 hit}}
18:19:04 Marking 1 hits
18:19:04 tkerror: syntax error in expression "int ...

Note, if I run the pick command by hand ...

delta$ pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace  4852-4852 -sequence mercury
1 hit

That's where the "1 hit" comes from (obviously).  The version of nmh I'm
using is ...

delta$ pick -version
pick -- nmh-1.0.4 [compiled on fuchsia.cs.mu.OZ.AU at Sun Mar 17 14:55:56 

In [17]:
import pandas as pd

ham = pd.DataFrame(columns=['subject', 'from', 'to', 'date', 'body', 'label'])
spam = pd.DataFrame(columns=['subject', 'from', 'to', 'date', 'body', 'label'])

def convert_to_dataframe(emails, df, label):

  dfs = []
  for email_obj in emails:
      subject = email_obj['Subject']
      sender = email_obj['From']
      recipient = email_obj['To']
      date = email_obj['Date']
      body = email_obj.get_payload()  # Get the email body content

      # Assign labels (1 for spam, 0 for non-spam) based on your criteria
      label = label  # Define the is_spam() function

      # Append data to DataFrame
      df = pd.DataFrame({'subject': [subject],
                       'from': [sender],
                       'to': [recipient],
                       'date': [date],
                       'body': [body],
                       'label': [label]})
      dfs.append(df)
  return pd.concat(dfs, ignore_index=True)

In [19]:
ham_df = convert_to_dataframe(ham_emails, ham, 0)
spam_df = convert_to_dataframe(spam_emails, spam, 1)

ham_df.head(10)

Unnamed: 0,subject,from,to,date,body,label
0,Re: New Sequences Window,Robert Elz <kre@munnari.OZ.AU>,Chris Garrigues <cwg-dated-1030377287.06fa6d@D...,"Thu, 22 Aug 2002 18:26:25 +0700","Date: Wed, 21 Aug 2002 10:54:46 -05...",0
1,[zzzzteana] RE: Alexander,Steve Burt <Steve_Burt@cursor-system.com>,"""'zzzzteana@yahoogroups.com'"" <zzzzteana@yahoo...","Thu, 22 Aug 2002 12:46:18 +0100","Martin A posted:\nTassos Papadopoulos, the Gre...",0
2,[zzzzteana] Moscow bomber,Tim Chapman <timc@2ubh.com>,zzzzteana <zzzzteana@yahoogroups.com>,"Thu, 22 Aug 2002 13:52:38 +0100",Man Threatens Explosion In Moscow \n\nThursday...,0
3,[IRR] Klez: The Virus That Won't Die,Monty Solomon <monty@roscom.com>,undisclosed-recipient:;,"Thu, 22 Aug 2002 09:15:25 -0400",Klez: The Virus That Won't Die\n \nAlready the...,0
4,Re: [zzzzteana] Nothing like mama used to make,Stewart Smith <Stewart.Smith@ee.ed.ac.uk>,zzzzteana@yahoogroups.com,"Thu, 22 Aug 2002 14:38:22 +0100","> in adding cream to spaghetti carbonara, whi...",0
5,Re: [zzzzteana] Nothing like mama used to make,Martin Adamson <martin@srv0.ems.ed.ac.uk>,zzzzteana@yahoogroups.com,"Thu, 22 Aug 2002 14:50:31 +0100",\n> I just had to jump in here as Carbonara is...,0
6,[zzzzteana] Playboy wants to go out with a bang,Martin Adamson <martin@srv0.ems.ed.ac.uk>,zzzzteana@yahoogroups.com,"Thu, 22 Aug 2002 14:54:25 +0100",The Scotsman - 22 August 2002\n\n Playboy want...,0
7,Re: [zzzzteana] Nothing like mama used to make,Stewart Smith <Stewart.Smith@ee.ed.ac.uk>,zzzzteana@yahoogroups.com,"Thu, 22 Aug 2002 15:01:20 +0100",Martin Adamson wrote:\n> \n> Isn't it just bas...,0
8,[zzzzteana] Meaningful sentences,Martin Adamson <martin@srv0.ems.ed.ac.uk>,zzzzteana@yahoogroups.com,"Thu, 22 Aug 2002 15:01:33 +0100",The Scotsman\n\n Thu 22 Aug 2002 \n\n Meaningf...,0
9,[SAtalk] SA CGI Configurator Scripts,NOI Administrator <admin@networksonline.com>,spamassassin-talk@example.sourceforge.net,"Thu, 22 Aug 2002 10:16:36 -0400",I have been trying to research via SA mirrors ...,0


In [20]:
ham_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   subject  2500 non-null   object
 1   from     2500 non-null   object
 2   to       2348 non-null   object
 3   date     2500 non-null   object
 4   body     2500 non-null   object
 5   label    2500 non-null   int64 
dtypes: int64(1), object(5)
memory usage: 117.3+ KB


In [21]:
spam_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   subject  500 non-null    object
 1   from     500 non-null    object
 2   to       500 non-null    object
 3   date     500 non-null    object
 4   body     500 non-null    object
 5   label    500 non-null    int64 
dtypes: int64(1), object(5)
memory usage: 23.6+ KB


In [23]:
dataset = pd.concat([ham_df, spam_df])

In [24]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3000 entries, 0 to 499
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   subject  3000 non-null   object
 1   from     3000 non-null   object
 2   to       2848 non-null   object
 3   date     3000 non-null   object
 4   body     3000 non-null   object
 5   label    3000 non-null   int64 
dtypes: int64(1), object(5)
memory usage: 164.1+ KB


In [26]:
dataset.describe(include='object')

Unnamed: 0,subject,from,to,date,body
count,3000,3000,2848,3000,3000
unique,2097,919,607,2742,2946
top,[Spambayes] test sets?,boingboing <rssfeeds@spamassassin.taint.org>,yyyy@spamassassin.taint.org,"Tue, 24 Sep 2002 08:00:09 -0000",Lowest rates available for term life insurance...
freq,27,109,626,5,4
