In [1]:
import pandas as pd

raw_data = pd.read_csv("data/spam_NLP.csv")

In [2]:
all_messages = raw_data["MESSAGE"]
all_category = raw_data["CATEGORY"]

In [4]:
sample_mail_content = all_messages.iloc[0]

with open("data/sample_spam_mail.txt", "w") as sample_mail_data:
    sample_mail_data.write(sample_mail_content)

# remove_punctuation

In [5]:
import string

all_messages_with_remove_punctuation = [
    el.translate(str.maketrans("", "", string.punctuation)) for el in all_messages
]

# tokenization

In [6]:
from nltk.tokenize import word_tokenize

try:
    all_messages_in_tokens = [
        word_tokenize(el) for el in all_messages_with_remove_punctuation
    ]
except LookupError:
    import nltk

    nltk.download("punkt")
    all_messages_in_tokens = [
        word_tokenize(el) for el in all_messages_with_remove_punctuation
    ]

# stemmer

In [7]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()

In [8]:
all_messages_stemmer_list = [
    [ps.stem(el) for el in message] for message in all_messages_in_tokens
]

# remove_prepositions

In [9]:
from nltk.corpus import stopwords

try:
    stop_words = stopwords.words("english")
except:
    import nltk

    nltk.download("stopwords")
    stop_words = stopwords.words("english")

In [10]:
all_messages_filtered_text = [
    [el for el in message if el not in stop_words]
    for message in all_messages_stemmer_list
]

In [11]:
all_messages_as_text = [" ".join(message) for message in all_messages_filtered_text]

# Vectorizer

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

In [13]:
all_messages_vectorized = vectorizer.fit_transform(all_messages_as_text)

# Classifying model

##  train_test_split

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    all_messages_vectorized, all_category, test_size=0.2, random_state=42
)

# algorithm

In [15]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

svm_classifier = SVC(kernel="linear")
rf_classifier = RandomForestClassifier()

algorithms = {"SVC": svm_classifier, "RandomForestClassifier": rf_classifier}

In [16]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

best_algorithm = {"algorithm": None, "score": 0}

for key, value in algorithms.items():
    print(f"Algorithm: {key}")
    value.fit(X_train, y_train)

    accuracy = value.score(X_test, y_test)
    print("Classification accuracy: {:.10f}".format(accuracy))

    y_pred = value.predict(X_test)

    accuracy_sc = accuracy_score(y_test, y_pred)
    precision_sc = precision_score(y_test, y_pred)
    recall_sc = recall_score(y_test, y_pred)
    f1_sc = f1_score(y_test, y_pred)

    print("Accuracy:  ", accuracy_sc)
    print("Precision: ", precision_sc)
    print("Recall:    ", recall_sc)
    print("F1-score:  ", f1_sc)

    conf_matrix = confusion_matrix(y_test, y_pred)
    print("Macierz pomyłek:")
    print(conf_matrix)

    if accuracy > best_algorithm["score"]:
        best_algorithm["algorithm"] = value
        best_algorithm["score"] = accuracy
    print("-------------------------------------")

print(f"The best algorithm is: {best_algorithm}")

Algorithm: SVC
Classification accuracy: 0.9896551724
Accuracy:   0.9896551724137931
Precision:  0.9873737373737373
Recall:     0.9824120603015075
F1-score:   0.9848866498740554
Macierz pomyłek:
[[757   5]
 [  7 391]]
-------------------------------------
Algorithm: RandomForestClassifier
Classification accuracy: 0.9827586207
Accuracy:   0.9827586206896551
Precision:  0.9846153846153847
Recall:     0.964824120603015
F1-score:   0.9746192893401016
Macierz pomyłek:
[[756   6]
 [ 14 384]]
-------------------------------------
The best algorithm is: {'algorithm': SVC(kernel='linear'), 'score': 0.9896551724137931}


# save the model

In [34]:
import pickle

# Save the trained model
with open("best_model.pkl", "wb") as f:
    pickle.dump(best_algorithm["algorithm"], f)

# save the vectorizer

In [35]:
with open("vectorizer_model.pkl", "wb") as f:
    pickle.dump(vectorizer, f)