In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import joblib


In [14]:

# Location of the labelled message dataset (CSV, relative to the notebook).
file_path = 'mail_data.csv'

# Read the whole file into a DataFrame and preview its first rows.
data = pd.read_csv(file_path)
print(data.head())


  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [15]:
# Normalise the message text in two passes:
#   1. lowercase everything;
#   2. delete every character that is neither a word character nor whitespace
#      (i.e. punctuation is removed, not replaced by a space).
lowercased = data['Message'].str.lower()
data['Message'] = lowercased.str.replace(r'[^\w\s]', '', regex=True)
print(data['Message'].head())


0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in 2 a wkly comp to win fa cup fina...
3          u dun say so early hor u c already then say
4    nah i dont think he goes to usf he lives aroun...
Name: Message, dtype: object


In [16]:
# Encode the string labels as integers: spam -> 1, ham -> 0.
#
# A plain .map(dict) turns every value that is missing from the dict into NaN,
# so re-running this cell (labels already 0/1) or meeting an unexpected label
# would silently wipe the target column. Values that are already encoded are
# therefore passed through unchanged, and anything else is rejected loudly.
label_to_id = {'spam': 1, 'ham': 0}
encoded = data['Category'].map(lambda label: label_to_id.get(label, label))

unexpected = encoded[~encoded.isin(list(label_to_id.values()))]
if not unexpected.empty:
    raise ValueError(
        f"Unexpected Category values: {unexpected.unique().tolist()}"
    )

data['Category'] = encoded.astype('int64')
print(data['Category'].value_counts())


0    4825
1     747
Name: Category, dtype: int64


In [17]:
# Features are the message text; the target is the Category column.
X, y = data['Message'], data['Category']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training data size:", len(X_train))
print("Testing data size:", len(X_test))


Training data size: 4457
Testing data size: 1115


In [18]:
# TF-IDF features, using scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words='english')

# Fit on the training split only, then reuse the fitted vectorizer to
# transform the test split.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

print("Vectorized training data shape:", X_train_vec.shape)


Vectorized training data shape: (4457, 8137)


In [19]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training matrix.
# fit() returns the estimator itself, so construction and training chain.
model = MultinomialNB().fit(X_train_vec, y_train)

print("Model training completed.")


Model training completed.


In [20]:
# Predict on the held-out split, then report overall accuracy together with
# the per-class precision / recall / F1 table.
y_pred = model.predict(X_test_vec)

test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("Classification Report:\n", report_text)


Accuracy: 0.9668161434977578
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       1.00      0.75      0.86       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115



In [21]:
# Persist both fitted objects: the classifier alone is not enough, because new
# text must be transformed with the same fitted vectorizer before prediction.
for artifact, target_path in (
    (model, 'spam_classifier_model.pkl'),
    (vectorizer, 'vectorizer.pkl'),
):
    joblib.dump(artifact, target_path)

print("Model and vectorizer saved successfully.")


Model and vectorizer saved successfully.


In [22]:
# Reload the persisted artifacts and classify a couple of unseen messages.
# NOTE: joblib.load unpickles arbitrary Python objects -- only load files
# that come from a trusted source.
model = joblib.load('spam_classifier_model.pkl')
vectorizer = joblib.load('vectorizer.pkl')

new_emails = ["Win a free iPhone now!", "Meeting at 3 PM tomorrow."]

# Apply the same cleaning that the training messages received before they
# were vectorised: lowercase, then delete every character that is neither a
# word character nor whitespace. Feeding raw text instead makes tokens that
# contain punctuation split differently at inference time than at training
# time (e.g. "don't" was trained as "dont" but would be tokenised as "don").
new_emails_clean = (
    pd.Series(new_emails, dtype="object")
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
)

new_emails_vec = vectorizer.transform(new_emails_clean)
predictions = model.predict(new_emails_vec)

print("Predictions (1 = Spam, 0 = Not Spam):", predictions)


Predictions (1 = Spam, 0 = Not Spam): [1 0]
