## SPAM FILTER

In [None]:
import tarfile
import os
import numpy as np

In [None]:
# Archives holding legitimate ("ham") messages; the names are relative paths.
ham_list = ['20030228_easy_ham.tar.bz2', '20030228_easy_ham_2.tar.bz2', '20030228_hard_ham.tar.bz2']

# Archives holding spam messages.
spam_list = ['20030228_spam.tar.bz2', '20050311_spam_2.tar.bz2']

# Extraction targets: one sub-directory per class under the data root.
data_file = 'data'
ham_file, spam_file = (os.path.join(data_file, label) for label in ('ham', 'spam'))


In [None]:
def extract_file(archive, path_file):
    """Unpack a bzip2-compressed tar archive into ``path_file``, then delete it.

    Extraction is restricted to the destination directory: a member whose
    path would resolve outside ``path_file`` (for example ``../x``) aborts
    the extraction with an exception instead of being written.

    Args:
        archive: Path to a ``.tar.bz2`` file.
        path_file: Directory to extract into.

    Raises:
        FileNotFoundError: If ``archive`` does not exist.
        tarfile.TarError: If the archive is unreadable or contains a member
            that would be written outside ``path_file``.

    Note:
        The archive is removed after a successful extraction, so calling this
        function a second time with the same ``archive`` raises
        ``FileNotFoundError``.
    """
    with tarfile.open(archive, "r:bz2") as tar:
        if hasattr(tarfile, "data_filter"):
            # The stdlib "data" filter rejects members that would escape the
            # destination directory.
            tar.extractall(path=path_file, filter="data")
        else:
            # Older interpreters without extraction filters: validate member
            # paths by hand before extracting. This only checks member names,
            # not link targets.
            dest_root = os.path.realpath(path_file)
            for member in tar.getmembers():
                target = os.path.realpath(os.path.join(dest_root, member.name))
                if os.path.commonpath([dest_root, target]) != dest_root:
                    raise tarfile.TarError(
                        f"refusing to extract {member.name!r} outside {path_file!r}"
                    )
            tar.extractall(path=path_file)

    os.remove(archive)

In [None]:
def get_data(path_file):
    """Read every regular file under ``path_file`` and return the contents.

    The tree is walked recursively with ``os.walk``. Sub-directories and file
    names are visited in sorted order so the returned list has the same order
    on every run; ``os.walk`` on its own follows whatever order the directory
    listing happens to return.

    Args:
        path_file: Root directory to scan.

    Returns:
        list[str]: One string per file, decoded as ISO-8859-1. The list is
        empty when the directory is missing or contains no files.
    """
    data = []
    for root, dirs, files in os.walk(path_file):
        # Sorting in place steers the order in which os.walk descends.
        dirs.sort()
        for file_name in sorted(files):
            file_path = os.path.join(root, file_name)
            if os.path.isfile(file_path):
                with open(file_path, encoding="ISO-8859-1") as f:
                    data.append(f.read())
    return data

In [None]:
# extract_file deletes each archive once it has been unpacked, so on a re-run
# of this cell the archives no longer exist. Unpack only those still on disk
# and read whatever is already present under ham_file.
for archive in ham_list:
    if os.path.exists(archive):
        extract_file(archive, ham_file)

ham = get_data(ham_file)

# Fail loudly when neither archives nor previously extracted files were found.
if not ham:
    raise FileNotFoundError(
        f"No messages found under {ham_file!r}; "
        "make sure the archives listed in ham_list are in the working directory."
    )

In [None]:
# extract_file deletes each archive once it has been unpacked, so on a re-run
# of this cell the archives no longer exist. Unpack only those still on disk
# and read whatever is already present under spam_file.
for archive in spam_list:
    if os.path.exists(archive):
        extract_file(archive, spam_file)

spam = get_data(spam_file)

# Fail loudly when neither archives nor previously extracted files were found.
if not spam:
    raise FileNotFoundError(
        f"No messages found under {spam_file!r}; "
        "make sure the archives listed in spam_list are in the working directory."
    )

In [None]:
# Report how many messages were loaded per class and in total.
print(f'Количество HAM: {len(ham)}\nКоличество SPAM: {len(spam)}\nВсего: {len(ham)+len(spam)}')

Количество HAM: 4153
Количество SPAM: 1898
Всего: 6051


In [None]:
# Shuffle both corpora in place using a seeded generator, so that a fresh
# kernel run yields the same order whenever the lists were loaded in the
# same order. Re-running only this cell shuffles the lists again.
rng = np.random.default_rng(42)
rng.shuffle(ham)
rng.shuffle(spam)


In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
import re
from bs4 import BeautifulSoup


In [None]:

# Resources needed by word_tokenize, the stop-word list and WordNetLemmatizer.
# Recent NLTK releases (3.9 and later) load the tokenizer model from
# 'punkt_tab' rather than 'punkt'; fetching both keeps the notebook working
# on old and new installs.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vlade\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vlade\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vlade\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Matches selected header lines so they can be dropped from the text.
#  * ``X-[\w-]+`` also covers hyphenated extension headers such as
#    ``X-Spam-Status``; plain ``\w+`` stops at the first hyphen and never
#    reaches the colon.
#  * Header field names are case-insensitive (``Message-ID`` vs ``Message-Id``),
#    hence re.IGNORECASE.
#  * ``(?:\n[ \t]+.*)*`` consumes folded continuation lines, i.e. following
#    lines that start with a space or tab.
_HEADER_LINE_RE = re.compile(
    r'^(?:From|To|Subject|Date|Return-Path|Received|Message-Id|X-[\w-]+):.*(?:\n[ \t]+.*)*',
    flags=re.MULTILINE | re.IGNORECASE,
)


def preprocess_email(email):
    """Turn a raw e-mail into a space-separated string of lemmatized tokens.

    Steps, in order: strip HTML markup, drop the header lines matched by
    ``_HEADER_LINE_RE``, lowercase, remove URLs and e-mail addresses, delete
    every character that is not ``a-z`` or whitespace, tokenize, discard
    English stop words and lemmatize what remains.

    Args:
        email: Full text of one message.

    Returns:
        str: The remaining lemmas joined by single spaces.
    """
    email = BeautifulSoup(email, "html.parser").get_text()

    # Headers are removed before lowercasing and before punctuation is
    # stripped, while the "Name:" prefix is still intact.
    email = _HEADER_LINE_RE.sub('', email)

    email = email.lower()

    email = re.sub(r'http\S+|www\S+|https\S+', '', email)
    email = re.sub(r'\S+@\S+', '', email)
    email = re.sub(r'[^a-z\s]', '', email)

    words = word_tokenize(email)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    return ' '.join(words)

In [None]:
# Run every raw message through preprocess_email; output order follows the input lists.
processed_ham_data = list(map(preprocess_email, ham))
processed_spam_data = list(map(preprocess_email, spam))

In [None]:
# Features are the ham texts followed by the spam texts; labels line up
# index-for-index with them (0 = ham, 1 = spam).
X = [*processed_ham_data, *processed_spam_data]
y = [0] * len(processed_ham_data) + [1] * len(processed_spam_data)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Hold out 20% of the messages for evaluation. Stratifying on y keeps the
# ham/spam ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
# Learn the vocabulary from the training split only, then encode both splits
# as dense bag-of-words count matrices.
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

# Keep the labels one-dimensional, shape (n_samples,). Passing a column
# vector of shape (n_samples, 1) to a scikit-learn classifier's fit() triggers
# a DataConversionWarning asking for a 1-D array.
y_train = np.array(y_train)
y_test = np.array(y_test)

In [None]:
# Random forests are stochastic (bootstrap samples, random feature subsets);
# a fixed random_state makes the trained model repeatable across runs.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [None]:
# Score the held-out split.
y_pred = rfc.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")

# Per-class precision/recall/F1 first, then the raw confusion counts,
# each printed on its own line.
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep="\n",
)

Accuracy: 0.98
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       831
           1       0.97      0.97      0.97       380

    accuracy                           0.98      1211
   macro avg       0.98      0.98      0.98      1211
weighted avg       0.98      0.98      0.98      1211

[[819  12]
 [ 12 368]]
