<a href="https://colab.research.google.com/github/Raghav-2003/spam-classifer/blob/main/spam_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pickle

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

In [None]:
# Read the labelled e-mail corpus; the path is resolved relative to the
# current working directory.
csv_path = './email_classification.csv'
data = pd.read_csv(csv_path)

# Preview the first five rows to confirm the file parsed as expected.
data.head(5)

Unnamed: 0,email,label
0,Upgrade to our premium plan for exclusive acce...,ham
1,Happy holidays from our team! Wishing you joy ...,ham
2,We're hiring! Check out our career opportuniti...,ham
3,Your Amazon account has been locked. Click her...,spam
4,Your opinion matters! Take our survey and help...,ham


In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
data.shape

(179, 2)

In [None]:
# Encode the text labels as integers for the classifier: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}

# Series.map turns every value that is missing from the mapping into NaN, so
# running this cell a second time (when the column already holds 0/1) would
# silently wipe out every label. Only encode while raw text labels remain.
if not data['label'].isin(list(label_codes.values())).all():
    encoded = data['label'].map(label_codes)

    # Fail loudly on anything other than 'ham'/'spam' instead of letting NaN
    # labels flow into training.
    unknown = data.loc[encoded.isna(), 'label'].unique()
    if len(unknown) > 0:
        raise ValueError(f"Unexpected label value(s): {list(unknown)}")

    data['label'] = encoded

data.head()

Unnamed: 0,email,label
0,Upgrade to our premium plan for exclusive acce...,0
1,Happy holidays from our team! Wishing you joy ...,0
2,We're hiring! Check out our career opportuniti...,0
3,Your Amazon account has been locked. Click her...,1
4,Your opinion matters! Take our survey and help...,0


In [None]:
# Show the complete text of the e-mail in the row labelled 3; the head()
# preview elides long strings with "...".
data.loc[3, 'email']

'Your Amazon account has been locked. Click here to verify your account information.'

In [None]:
# Features are the raw e-mail texts; targets are the integer-encoded labels.
X, y = data['email'], data['label']

# Hold out 30% of the e-mails for evaluation. The fixed seed makes the
# shuffle, and therefore the split, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Learn the bag-of-words vocabulary from the training e-mails only and, in
# the same pass, turn them into a document-term count matrix.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(raw_documents=X_train)

In [None]:
# Re-weight the raw term counts with TF-IDF. The IDF statistics are fitted
# on the training matrix only, so nothing leaks in from the test split.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X=X_train_vectorized)


In [None]:
# Fit a multinomial Naive Bayes model on the TF-IDF training features.
# The call is left as the cell's last expression so the fitted estimator
# is displayed.
spam_classifier = MultinomialNB()
spam_classifier.fit(X=X_train_tfidf, y=y_train)

In [None]:
# Push the held-out e-mails through the already-fitted vectorizer and TF-IDF
# transformer (transform only, never refit on test data), then predict.
X_test_vectorized = vectorizer.transform(raw_documents=X_test)
X_test_tfidf = tfidf_transformer.transform(X=X_test_vectorized)
y_pred = spam_classifier.predict(X=X_test_tfidf)

In [None]:
# Per-class precision / recall / F1 table for the held-out split ...
report = classification_report(y_test, y_pred)
print(report)

# ... followed by the overall fraction of correctly classified e-mails.
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {test_accuracy}")

              precision    recall  f1-score   support

           0       1.00      0.96      0.98        26
           1       0.97      1.00      0.98        28

    accuracy                           0.98        54
   macro avg       0.98      0.98      0.98        54
weighted avg       0.98      0.98      0.98        54

Accuracy: 0.9814814814814815


In [None]:
# Detailed evaluation on the held-out split. The metric functions are
# imported in the notebook's import cell, and `y_pred` is reused from the
# evaluation cell above rather than recomputed: the fitted model is
# deterministic, so a second predict() call would return the same array.

# Confusion matrix: rows are the true labels and columns the predicted
# labels, both in sorted label order (0 = ham, 1 = spam).
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# Scalar metrics. With scikit-learn's default binary averaging, precision,
# recall and F1 describe the positive class, i.e. label 1 (spam).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


Confusion Matrix:
[[25  1]
 [ 0 28]]
Accuracy: 0.9815
Precision: 0.9655
Recall: 1.0000
F1-Score: 0.9825


In [None]:
# Show the complete text of the e-mail in the row labelled 0.
data.loc[0, 'email']

'Upgrade to our premium plan for exclusive access to premium content and features.'

In [None]:
# Smoke test: classify one hand-picked message end to end. The text is the
# same as row 0 of the dataset, which may have landed in the training split,
# so this checks the plumbing rather than generalisation.
sample_data = ['Upgrade to our premium plan for exclusive access to premium content and features.']

# The fitted transformers take an iterable of documents, hence the
# one-element list. Their output is already a 2-D matrix with one row per
# document (1 x n_features here), so no reshape is needed before predict().
samp_vec = vectorizer.transform(sample_data)
samp_tfidf = tfidf_transformer.transform(samp_vec)

samp_pred = spam_classifier.predict(samp_tfidf)

In [None]:
# Predicted label code for the sample message (0 = ham, 1 = spam); predict()
# returns one entry per input document.
print(samp_pred)

[0]


In [None]:
# Persist all three fitted objects with pickle (imported in the notebook's
# import cell). Inference needs the exact vocabulary (vectorizer) and IDF
# weights (tfidf_transformer) that the classifier was trained with, so the
# three files belong together.
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# these artifacts from a trusted source.
artifacts = {
    'spam_classifier.pkl': spam_classifier,
    'vectorizer.pkl': vectorizer,
    'tfidf_transformer.pkl': tfidf_transformer,
}

for filename, fitted_obj in artifacts.items():
    # The context manager guarantees the file is flushed and closed even if
    # pickling fails part-way through.
    with open(filename, 'wb') as f:
        pickle.dump(fitted_obj, f)