In [1]:
import pickle

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


In [2]:
# Location of the spam/ham message CSV.
# NOTE(review): the path looks like a Colab working directory - adjust it
# when running the notebook elsewhere.
DATASET_PATH = '/content/spam-email-dataset.csv'
data = pd.read_csv(DATASET_PATH)

In [4]:
# Peek at the first five rows to confirm the file parsed as expected.
first_rows = data.head(n=5)
print(first_rows)

  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [5]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appends a stray "None" line to the output.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None


In [6]:
# Encode the text labels as integers for the classifier: ham -> 0, spam -> 1.
LABEL_TO_ID = {'ham': 0, 'spam': 1}

# Only encode while the column still holds the raw text labels. Without this
# guard, re-running the cell would map the already-encoded 0/1 values to NaN.
if not pd.api.types.is_numeric_dtype(data['Category']):
    encoded_labels = data['Category'].map(LABEL_TO_ID)
    # Series.map yields NaN for any value missing from the dict; fail loudly
    # here rather than letting NaN labels reach model training.
    unmapped = data.loc[encoded_labels.isna(), 'Category'].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unexpected Category labels: {list(unmapped)}")
    data['Category'] = encoded_labels.astype('int64')

X = data['Message']   # message text (model input)
y = data['Category']  # encoded label (0 = ham, 1 = spam)

In [7]:
# Re-inspect the first rows to check how the Category column looks now.
print(data.head(5))

   Category                                            Message
0         0  Go until jurong point, crazy.. Available only ...
1         0                      Ok lar... Joking wif u oni...
2         1  Free entry in 2 a wkly comp to win FA Cup fina...
3         0  U dun say so early hor... U c already then say...
4         0  Nah I don't think he goes to usf, he lives aro...


In [9]:
# Hold out 30% of the messages for evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=51,
)

In [10]:
# Turn each message into token-count features with a default CountVectorizer.
vectorizer = CountVectorizer()
# Learn the vocabulary from the training messages only, then encode them.
X_train_vectorized = vectorizer.fit_transform(X_train)
# Encode the test messages with that same fitted vocabulary (no refitting).
X_test_vectorized = vectorizer.transform(X_test)

In [15]:
# Dump the vectorized training matrix for a quick visual sanity check.
print("Vectorized Data : \n", X_train_vectorized, sep=" ")

Vectorized Data : 
   (0, 6905)	1
  (0, 3424)	1
  (0, 6257)	1
  (0, 4951)	1
  (0, 1200)	1
  (0, 6261)	1
  (0, 1385)	1
  (0, 946)	1
  (0, 819)	1
  (0, 1584)	1
  (0, 7099)	1
  (1, 7099)	1
  (1, 3061)	1
  (1, 5913)	1
  (1, 3324)	1
  (1, 5689)	1
  (2, 4385)	1
  (2, 1473)	2
  (2, 4075)	1
  (2, 4122)	1
  (3, 4385)	1
  (3, 3285)	1
  (3, 6818)	1
  (3, 6924)	1
  (3, 3442)	1
  :	:
  (3898, 2503)	1
  (3899, 3424)	1
  (3899, 7099)	1
  (3899, 3061)	1
  (3899, 6948)	1
  (3899, 840)	1
  (3899, 4547)	1
  (3899, 1079)	1
  (3899, 4028)	1
  (3899, 6881)	1
  (3899, 865)	2
  (3899, 6372)	1
  (3899, 3071)	1
  (3899, 6606)	2
  (3899, 5114)	1
  (3899, 3049)	1
  (3899, 906)	1
  (3899, 1162)	1
  (3899, 5583)	1
  (3899, 3142)	1
  (3899, 961)	1
  (3899, 4573)	1
  (3899, 3755)	1
  (3899, 5518)	1
  (3899, 6728)	1


In [17]:
# Multinomial Naive Bayes over the token counts; alpha=1.0 is the library's
# default additive-smoothing value, spelled out here for visibility.
model = MultinomialNB(alpha=1.0)

In [18]:
# Train the classifier on the vectorized training messages and their labels.
model.fit(X=X_train_vectorized, y=y_train)

In [19]:
# Predict labels for the held-out test messages.
y_pred = model.predict(X=X_test_vectorized)

In [20]:
# Score the held-out predictions, then print both summaries.
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)

print(f"Accuracy: {test_accuracy}")
print(f"\nClassification Report:\n {report_text}")

Accuracy: 0.9814593301435407

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      1452
           1       0.94      0.91      0.93       220

    accuracy                           0.98      1672
   macro avg       0.97      0.95      0.96      1672
weighted avg       0.98      0.98      0.98      1672



In [25]:
# manual testing: classify a single hand-written message end to end

new_email = ["Congratulations! You've won a free iPhone. Click the link to claim."]

# The raw text goes through the already-fitted vectorizer so that it is
# encoded with the same vocabulary the model was trained on.
new_email_vectorized = vectorizer.transform(new_email)
Predicion_probability = model.predict_proba(new_email_vectorized)
prediction = model.predict(new_email_vectorized)

# Label 1 is reported as spam; any other label is reported as not spam.
if prediction[0] == 1:
    verdict = "Spam"
else:
    verdict = "Not Spam"

print("Prediction Probability : ", Predicion_probability)
print("Prediction for the new email:", verdict)

Prediction Probability :  [[1.94146304e-05 9.99980585e-01]]
Prediction for the new email: Spam


In [22]:
# Persist both fitted objects. New text must be encoded by this same fitted
# vectorizer before it is passed to the model, so the two files belong
# together and should be loaded as a pair.
# NOTE: unpickling can execute arbitrary code - only load these files from a
# trusted source.
# (`pickle` is imported in the notebook's top import cell.)
with open('EmailSpam_Model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('Message_Vectorizer.pkl', 'wb') as vec_file:
    pickle.dump(vectorizer, vec_file)