In [1]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Train a spam/ham text classifier and persist it to disk.
#
# Steps: load the raw CSV, keep the first two columns (label, message), clean
# the rows, encode the labels as integers, fit a TF-IDF + multinomial Naive
# Bayes pipeline on the whole dataset, and pickle the fitted pipeline.

# Load the dataset (decoded as latin-1).
df = pd.read_csv('spam.csv', encoding='latin-1')

# Keep only the first two columns; any trailing columns are discarded.
# The explicit .copy() makes this an independent frame, so the assignments
# below never operate on a slice of the original (which can raise
# SettingWithCopyWarning on pandas versions without copy-on-write).
df = df.iloc[:, :2].copy()

# Rename the columns for clarity
df.columns = ['label', 'message']

# Remove duplicate rows, then rows with null values
df = df.drop_duplicates().dropna()

# Encode labels: ham -> 0, spam -> 1
label_codes = {'ham': 0, 'spam': 1}
encoded_labels = df['label'].map(label_codes)

# Series.map yields NaN for any value missing from the mapping. Fail here with
# a clear message instead of letting NaN targets reach the estimator.
unmapped = encoded_labels.isna()
if unmapped.any():
    unexpected = sorted(df.loc[unmapped, 'label'].astype(str).unique())
    raise ValueError(
        "Unexpected label value(s) in spam.csv: {!r}; expected one of {!r}".format(
            unexpected, sorted(label_codes)
        )
    )
df['label'] = encoded_labels

# Split the data into features and labels
X = df['message']
y = df['label']

# Create a TfidfVectorizer and a Naive Bayes model pipeline
pipeline = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Train the model on the full dataset (no hold-out split is made here)
pipeline.fit(X, y)

# Save the trained pipeline to a file
with open('spam_classifier.pkl', 'wb') as f:
    pickle.dump(pipeline, f)

print("Model training complete and saved as spam_classifier.pkl")


Model training complete and saved as spam_classifier.pkl
