In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from joblib import dump, load


In [24]:
# Load the data
df = pd.read_csv("spam.csv", encoding="latin-1")
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)

In [29]:
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [30]:
df.v2.iloc[2]

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [14]:
df['label'] = df['v1'].map({'ham': 1, 'spam': 0})

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['v2'], df['label'], test_size=0.2, random_state=42)


In [15]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()), # Convert messages to vectors of TF-IDF features
    ('clf', MultinomialNB()) # Train a Naive Bayes classifier
])

In [16]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [17]:
print("Accuracy on the test set:", pipeline.score(X_test, y_test))

Accuracy on the test set: 0.9623318385650225


In [18]:
# Dump the trained model to a file
dump(pipeline, 'spam_classifier.joblib')

['spam_classifier.joblib']

In [19]:
pipeline = load('spam_classifier.joblib')

In [20]:
new_messages = ['Free money!', 'Hey, what are you up to tonight?', 'Your order has shipped.']
predictions = pipeline.predict(new_messages)
for message, prediction in zip(new_messages, predictions):
    print(message, 'is', 'spam' if prediction == 1 else 'not spam')

Free money! is spam
Hey, what are you up to tonight? is spam
Your order has shipped. is spam
