# Spam Detection Algorithm

In [59]:
# Import database files

import sys
import os
from dotenv import load_dotenv

# Read variables from a .env file (if any) into the process environment.
load_dotenv()

base_dir = os.getenv('BASE_DIR')
if base_dir is None:
    # Without this guard os.path.join(None, ...) fails with an opaque
    # "expected str, bytes or os.PathLike object" TypeError.
    raise RuntimeError(
        "BASE_DIR is not set; define it in the environment or in the .env file."
    )

# Absolute path of <BASE_DIR>/app/database, made importable below.
database_dir = os.path.abspath(os.path.join(base_dir, 'app', 'database'))

# Guard the append so re-running this cell does not stack duplicate entries.
# NOTE(review): the imports that follow use both `database.models` and a
# top-level `connections`; only database_dir is added here, so
# `database.models` presumably resolves through another path entry — confirm.
if database_dir not in sys.path:
    sys.path.append(database_dir)

from database.models import load_data
from connections import list_collection


In [50]:
# Import Modules
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

import pickle

## Data Collection from Database

In [51]:
# Load data from the database: one DataFrame per collection (spam and ham).
collections = list_collection()

# Concatenate once instead of growing the frame inside the loop: it avoids
# repeated copying and avoids concatenating onto an empty placeholder frame,
# which newer pandas versions warn about.
# Assumes load_data returns DataFrames with 'Type' and 'Content' columns —
# TODO confirm against app/database.
frames = [load_data(name) for name in collections]
if frames:
    sms = pd.concat(frames, ignore_index=True)
else:
    # No collections available: keep the expected (empty) schema so the
    # column lookups in later cells still resolve.
    sms = pd.DataFrame(columns=['Type', 'Content'])

# Shuffle the rows. The fixed seed makes the order, and everything derived
# from it, reproducible under Restart & Run All.
sms = sms.sample(frac=1, random_state=42).reset_index(drop=True)

## Data Cleaning

In [52]:
# Encode the class labels as integers: spam -> 0, ham -> 1.
label_ids = {'spam': 0, 'ham': 1}

# Values that are already 0/1 pass through unchanged, so re-running this cell
# does not turn the whole column into NaN the way a plain dict .map() would.
encoded_type = sms['Type'].map(lambda label: label_ids.get(label, label))

# Fail here, naming the offending values, instead of silently producing NaN
# that only surfaces later at the integer cast.
unexpected = sms.loc[~encoded_type.isin(list(label_ids.values())), 'Type']
if not unexpected.empty:
    raise ValueError(
        "Unexpected values in 'Type' column: "
        f"{sorted(unexpected.astype(str).unique())}"
    )

sms['Type'] = encoded_type.astype(int)

## Split Data for Training and Testing

In [53]:
# Features are the message text; targets are the encoded class labels.
x, y = sms['Content'], sms['Type']

In [54]:
# Hold out 20% of the messages for evaluation. The fixed random_state makes
# the split, and therefore the reported metrics, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

## Text Vectorization

In [55]:
# Cast the targets to plain integers before fitting.
y_train = y_train.astype(int)
y_test = y_test.astype(int)

# TF-IDF features: lowercase the text and drop English stop words.
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

# Fit on the training split only, then apply the fitted vectorizer to the
# test split.
x_train_vector = vectorizer.fit_transform(x_train)
x_test_vector = vectorizer.transform(x_test)

## Train Model

In [56]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training features
# (fit() returns the estimator itself, so it can be chained).
model = MultinomialNB().fit(x_train_vector, y_train)

# Predict labels for the held-out messages.
y_pred = model.predict(x_test_vector)

## Display Results

In [57]:
# Overall accuracy on the held-out split.
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

# Per-class precision / recall / F1, printed under its heading.
print(
    'Classification Report:',
    classification_report(y_test, y_pred),
    sep='\n',
)

## Export the Model and Vectorizer as Pickle Files

In [58]:
# Persist the fitted classifier and the fitted vectorizer as separate pickle
# files in the current working directory.
for artifact, filename in ((model, "nb_model.pkl"), (vectorizer, "vectorizer.pkl")):
    with open(filename, 'wb') as sink:
        pickle.dump(artifact, sink)