In [None]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import pickle

In [None]:
# Read the raw CSV into a DataFrame; later cells use its "text" and "label" columns.
df = pd.read_csv(filepath_or_buffer="spam_ham_dataset.csv")

In [None]:
# Drop the column 'Unnamed: 0'.
# errors="ignore" keeps this cell idempotent: re-running it after the column
# has already been removed is a no-op instead of raising KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [None]:
# Preprocess the text data: one cleaned, stemmed, space-joined string per row
# of df["text"] is appended to `corpus`, in row order.
nltk.download('stopwords')

# Loop-invariant helpers are built once up front. Rebuilding the stop-word set
# for every token (and the stemmer for every row) made preprocessing needlessly
# slow without changing the result.
stop_words = set(stopwords.words("english"))
ps = PorterStemmer()

corpus = []
# Iterate over the column values directly rather than df["text"][i], so the
# loop does not depend on the DataFrame having a default 0..n-1 index.
for raw_text in df["text"]:
    # Replace every character that is not an ASCII letter or digit with a
    # space, lowercase the result, then tokenize on whitespace.
    tokens = re.sub("[^a-zA-Z0-9]", " ", raw_text).lower().split()
    # Stop words are filtered before stemming; the surviving tokens are
    # stemmed and re-joined into a single string.
    stemmed = [ps.stem(word) for word in tokens if word not in stop_words]
    corpus.append(" ".join(stemmed))

In [None]:
# Vectorize the text data: bag-of-words counts with the vocabulary capped at
# 35000 terms.
cv = CountVectorizer(max_features=35000)
# NOTE: .toarray() materializes a dense matrix, which can be large in memory.
# It is kept so that X stays a NumPy array for any code that relies on that.
X = cv.fit_transform(corpus).toarray()
# Binary target: 1 where label == 'spam', 0 otherwise. dtype=int is requested
# explicitly because pandas >= 2.0 returns boolean dummy columns by default,
# which would turn the labels (and later predictions) into True/False.
y = pd.get_dummies(df['label'], dtype=int)['spam'].values

In [None]:
# Hold out 20% of the samples as a test set; random_state is fixed at 0.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)



In [None]:
# Fit a multinomial Naive Bayes classifier on the training split.
# fit() returns the estimator itself, so construction and training are chained.
model = MultinomialNB().fit(X_train, y_train)
# Bare expression so the cell still displays the fitted estimator.
model

In [None]:
# Save the model and vectorizer.
# Each file is opened in a `with` block so the handle is flushed and closed
# as soon as the pickle is written, rather than left open until garbage
# collection.
with open("spam-sms-mnb-model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
with open('cv-transform.pkl', 'wb') as cv_file:
    pickle.dump(cv, cv_file)
