In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
import nltk
import joblib

In [16]:
# Load the raw dataset; the file is decoded as Latin-1 rather than the UTF-8 default.
data = pd.read_csv(
    filepath_or_buffer='spam.csv',
    encoding='latin1',
)

In [17]:
# Discard the three unnamed columns (presumably empty filler columns produced
# when the CSV was read — TODO confirm against the raw file).
# errors='ignore' keeps this cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError.
data.drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
    errors='ignore',
    inplace=True,
)

In [18]:
# Give the two remaining columns descriptive names: v1 -> target, v2 -> text.
column_names = {'v1': 'target', 'v2': 'text'}
data.rename(columns=column_names, inplace=True)

In [19]:
# Encoder used by a later cell to convert the 'target' labels into integer codes.
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [20]:
# Fit the encoder on the label column and overwrite that column with the
# resulting integer class codes.
encoded_target = encoder.fit_transform(data['target'])
data['target'] = encoded_target


In [22]:
# Keep only the first occurrence of every fully identical row.
is_repeat = data.duplicated(keep='first')
data = data[~is_repeat]


In [26]:
# Train, persist and evaluate the SMS spam classifier.
# Reads the 'text' and 'target' columns of `data`; writes the fitted pipeline
# to 'sms_spam_classifier.pkl' and prints accuracy plus a per-class report.

# Defensive cast so scikit-learn receives plain integer class labels.
data['target'] = data['target'].astype(int)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
# NOTE(review): the split is not stratified. If the classes are imbalanced,
# consider stratify=data['target'] — that would change the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['target'], test_size=0.2, random_state=42
)

# The NLTK stop-word corpus is a separate download. On an environment where it
# has not been fetched yet, stopwords.words() raises LookupError, so fetch it
# once on demand instead of failing the whole cell.
try:
    english_stopwords = stopwords.words('english')
except LookupError:
    nltk.download('stopwords', quiet=True)
    english_stopwords = stopwords.words('english')

# Create and train pipeline: token counts -> TF-IDF weighting -> multinomial Naive Bayes
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(stop_words=english_stopwords)),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB())
])

pipeline.fit(X_train, y_train)

# Save the model to a file (the whole fitted pipeline, vectorizer included)
joblib.dump(pipeline, 'sms_spam_classifier.pkl')

# Evaluate model on the held-out rows
y_pred = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Example (kept disabled): load the model from the file and predict with it.
# Only load pickle/joblib files from a trusted source — loading executes code.
# loaded_pipeline = joblib.load('sms_spam_classifier.pkl')
#
# sample_sms = ["Congratulations! You've won a $1,000 Walmart gift card. Go to http://bit.ly/12345 to claim now."]
# prediction = loaded_pipeline.predict(sample_sms)
# print("Prediction with loaded model:", prediction)


Accuracy: 0.9661508704061895
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       889
           1       1.00      0.76      0.86       145

    accuracy                           0.97      1034
   macro avg       0.98      0.88      0.92      1034
weighted avg       0.97      0.97      0.96      1034

