In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

In [17]:
# Load the raw SMS dataset from the working directory.
# NOTE(review): latin-1 is presumably needed because the file is not valid UTF-8 — confirm.
data = pd.read_csv(
    'spam.csv',
    encoding='latin-1',
)

In [18]:
# Peek at the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [19]:
# Peek at the last five rows of the raw frame.
data.tail(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,
5571,ham,Rofl. Its true to its name,,,


In [20]:
# Keep only the label/text columns and give them meaningful names.
# The explicit .copy() detaches the result from the raw frame so the column
# assignment below does not raise pandas' SettingWithCopyWarning.
# Note: this cell expects the raw frame (columns 'v1'/'v2'); re-run the load
# cell first if it has to be executed again.
data = data[['v1', 'v2']].copy()
data.columns = ['label', 'message']

# Normalise the text in a single pass:
#   1. lower-case everything,
#   2. replace every non-word character (anything outside [A-Za-z0-9_] and
#      Unicode word characters) with a space,
#   3. collapse runs of whitespace into one space,
#   4. trim leading/trailing whitespace.
data['message'] = (
    data['message']
    .str.lower()
    .str.replace(r'\W', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)


In [21]:
# Encode labels: ham -> 0, spam -> 1
label_codes = {'ham': 0, 'spam': 1}
encoded_labels = data['label'].map(label_codes)

# Series.map yields NaN for every value that is missing from the mapping
# (an unexpected label, or labels that were already encoded by a previous run
# of this cell). Fail loudly instead of silently writing NaN targets.
unmapped = data.loc[encoded_labels.isna(), 'label'].unique()
if len(unmapped):
    raise ValueError(
        f"Cannot encode labels {list(unmapped)!r}; expected only {list(label_codes)!r}. "
        "If this cell was already run, reload and clean the data first."
    )

data['label'] = encoded_labels

In [23]:
# After cleaning, the messages are lower-cased and punctuation such as commas
# and full stops has been replaced by spaces; labels are now 0 (ham) / 1 (spam).
data.head(n=5)

Unnamed: 0,label,message
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor u c already then say
4,0,nah i don t think he goes to usf he lives arou...


In [25]:
# Print a summary of the frame: index range, columns, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   int64 
 1   message  5572 non-null   object
dtypes: int64(1), object(1)
memory usage: 87.2+ KB


In [24]:
# Split the data: 80% for training, 20% held out for testing.
# The fixed random_state keeps the partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['message'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

In [8]:
# Feature extraction: TF-IDF weights over at most 5000 vocabulary terms.
vectorizer = TfidfVectorizer(max_features=5000)

# Learn the vocabulary/IDF weights from the training messages only ...
X_train_tfidf = vectorizer.fit_transform(raw_documents=X_train)
# ... and reuse that fitted vocabulary to encode the held-out messages.
X_test_tfidf = vectorizer.transform(raw_documents=X_test)

In [26]:
# Train three classifiers on the same TF-IDF features and predict the test set.
# `fit` returns the estimator itself, so construction and training are chained.

# Naive Bayes
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
nb_predictions = nb_model.predict(X_test_tfidf)

# Logistic Regression (iteration cap raised to 1000)
lr_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
lr_predictions = lr_model.predict(X_test_tfidf)

# SVM (library defaults)
svm_model = SVC().fit(X_train_tfidf, y_train)
svm_predictions = svm_model.predict(X_test_tfidf)

In [28]:
# Evaluation
def evaluate_model(y_true, y_pred):
    """Score a set of predictions against the true labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        A 4-tuple ``(accuracy, precision, recall, f1)`` computed with the
        scikit-learn metric functions using their default settings.
    """
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y_true, y_pred) for scorer in scorers)

In [29]:
# Naive Bayes evaluation: score the test-set predictions and report all four metrics.
nb_accuracy, nb_precision, nb_recall, nb_f1 = evaluate_model(
    y_true=y_test,
    y_pred=nb_predictions,
)
print(f"Naive Bayes - Accuracy: {nb_accuracy}, Precision: {nb_precision}, Recall: {nb_recall}, F1-Score: {nb_f1}")

Naive Bayes - Accuracy: 0.9668161434977578, Precision: 1.0, Recall: 0.7533333333333333, F1-Score: 0.8593155893536122


In [30]:
# Logistic Regression evaluation: score the test-set predictions and report all four metrics.
lr_accuracy, lr_precision, lr_recall, lr_f1 = evaluate_model(
    y_true=y_test,
    y_pred=lr_predictions,
)
print(f"Logistic Regression - Accuracy: {lr_accuracy}, Precision: {lr_precision}, Recall: {lr_recall}, F1-Score: {lr_f1}")

Logistic Regression - Accuracy: 0.968609865470852, Precision: 0.9914529914529915, Recall: 0.7733333333333333, F1-Score: 0.8689138576779026


In [31]:
# SVM evaluation: score the test-set predictions and report all four metrics.
svm_accuracy, svm_precision, svm_recall, svm_f1 = evaluate_model(
    y_true=y_test,
    y_pred=svm_predictions,
)
print(f"SVM - Accuracy: {svm_accuracy}, Precision: {svm_precision}, Recall: {svm_recall}, F1-Score: {svm_f1}")

SVM - Accuracy: 0.9802690582959641, Precision: 1.0, Recall: 0.8533333333333334, F1-Score: 0.920863309352518


In [32]:
# Save the best-performing model
# Choose the winner from the measured test-set F1 instead of hard-coding it:
# the previous version always saved Logistic Regression, although the recorded
# metrics show the SVM scoring higher on both accuracy and F1.
candidates = {
    'Naive Bayes': (nb_model, nb_predictions),
    'Logistic Regression': (lr_model, lr_predictions),
    'SVM': (svm_model, svm_predictions),
}
best_name = max(
    candidates,
    key=lambda name: f1_score(y_test, candidates[name][1]),
)
best_model = candidates[best_name][0]
print(f"Best model by F1-score: {best_name}")

# Persist the classifier together with the fitted vectorizer; both are needed
# to score new messages later.
joblib.dump(best_model, 'sms_spam_classifier.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [33]:
# Load the model and vectorizer for future use.
# joblib files are pickle-based: only load artifacts from a trusted source.
loaded_model, loaded_vectorizer = (
    joblib.load('sms_spam_classifier.pkl'),
    joblib.load('tfidf_vectorizer.pkl'),
)

In [35]:
# Classify a new message with the reloaded artifacts.
new_message = ["Congratulations! You've won a free ticket to the Goa. Call now!"]

# Encode with the saved vectorizer, then predict with the saved model.
new_message_tfidf = loaded_vectorizer.transform(new_message)
prediction = loaded_model.predict(new_message_tfidf)

# Label 1 was assigned to spam during encoding, so a truthy prediction means spam.
if prediction[0]:
    print("Spam")
else:
    print("Not Spam")

Spam
