In [2]:
import re
from pathlib import Path

import joblib
import nltk
import pandas as pd

# Fetch the NLTK stop-word corpus; the preprocessing cell below calls
# stopwords.words('english'), which needs this data to be available locally.
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rolir\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Load the SMS collection: a tab-separated file with no header row, so the
# two column names are supplied explicitly.
dataset_path = "dataset/SMSSpamCollection"
messages = pd.read_table(
    dataset_path,
    names=["label", "message"],
    encoding="latin-1",
)

# Preview the first rows as the cell's rich output.
messages.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Build the cleaned corpus: for every message keep letters only, lowercase,
# drop English stop words, stem the remaining tokens and re-join them with
# single spaces.
ps = PorterStemmer()

# Build the stop-word lookup once. Evaluating stopwords.words('english')
# inside the comprehension filter rebuilt the list for every single token and
# tested membership against a list; a set built up front gives the same
# filtering result with constant-time lookups.
english_stopwords = set(stopwords.words('english'))

corpus = []

for msg in messages["message"]:
    # Replace every character that is not an ASCII letter with a space.
    review = re.sub('[^a-zA-Z]', ' ', msg)
    review = review.lower().split()
    review = [
        ps.stem(word)
        for word in review
        if word not in english_stopwords
    ]
    corpus.append(' '.join(review))


In [5]:
# Bag-of-words features: fit the vectorizer on the cleaned corpus (vocabulary
# capped via max_features=5000) and materialise the counts as a dense array.
cv = CountVectorizer(max_features=5000)
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

# Target vector: one-hot encode the label column and drop the first level's
# column. With the two labels seen in the preview (ham, spam) this leaves a
# single indicator column, which is flattened to 1-D.
label_indicators = pd.get_dummies(messages["label"], drop_first=True)
y = label_indicators.values.ravel()


In [6]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same from run to run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

# Fit a multinomial Naive Bayes classifier on the training portion
# (fit() returns the estimator itself, so it can be chained).
model = MultinomialNB().fit(X_train, y_train)

# Score the held-out portion and report plain accuracy.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)


Accuracy: 0.9847533632286996


In [7]:
# Persist the fitted classifier and the fitted vectorizer as two separate
# pickle files under ../backend/model.
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first (a fresh checkout may not contain it, since git
# does not track empty directories).
Path("../backend/model").mkdir(parents=True, exist_ok=True)

joblib.dump(model, "../backend/model/spam_model.pkl")
# Left as the last expression so the cell echoes the written file name.
joblib.dump(cv, "../backend/model/vectorizer.pkl")


['../backend/model/vectorizer.pkl']