In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm 
from sklearn.metrics import classification_report
import joblib

In [2]:
df = pd.read_csv("spam_ham_dataset.csv")

In [3]:
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df

Unnamed: 0,label,text,label_num
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,spam,"Subject: photoshop , windows , office . cheap ...",1
4,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...
5166,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,ham,Subject: industrial worksheets for august 2000...,0


In [4]:
X = df["text"]
y = df["label_num"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [5]:
X_train

1951    Subject: hpl nom for february 1 , 2001\r\n( se...
3052    Subject: txu fuel nom . for 12 / 07 / 00\r\n( ...
4816    Subject: hpl nom for sept . 9 , 2000\r\n( see ...
3126    Subject: re : noms / actual flow for 03 / 14\r...
2327    Subject: enron / hpl actuals for october 30 , ...
                              ...                        
2772    Subject: wellheads\r\ndaren , cody has a new w...
3882    Subject: re : first delivery - wagner oil\r\nv...
3229    Subject: new profile ( otcbb : cinm ) the stoc...
3876    Subject: re : june 00 production - o ' connor ...
1292    Subject: fw : 05 / 01 ena gas sales on hpl , t...
Name: text, Length: 4136, dtype: object

In [6]:
X_test

2116    Subject: re : april spot tickets\r\nthe spot d...
2316    Subject: eops salary survey questionnaire\r\np...
4852    Subject: overseas pharmacy al\r\nwould you lik...
2714    Subject: re : revision - black marlin - meter ...
4564    Subject: re : copano p / l 01 / 00 - s 93481\r...
                              ...                        
2860    Subject: aol instant messenger reconfirmation\...
2343    Subject: enron / hpl actuals for september 1 -...
2798    Subject: re : enerfin meter 980439 for 10 / 00...
2975    Subject: gas day july 27 , 2000\r\nsent on beh...
4627    Subject: training resources and cost to attend...
Name: text, Length: 1035, dtype: object

In [7]:
cv = CountVectorizer()
features = cv.fit_transform(X_train)

In [8]:
model = svm.SVC()
model.fit(features,y_train)

In [9]:
features_test = cv.transform(X_test)
print("Accuracy: {}".format(model.score(features_test,y_test)))

Accuracy: 0.9777777777777777


In [10]:
y_pred = model.predict(features_test)
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98       711
           1       0.96      0.97      0.96       324

    accuracy                           0.98      1035
   macro avg       0.97      0.98      0.97      1035
weighted avg       0.98      0.98      0.98      1035



In [11]:
# Save the model and CountVectorizer
joblib.dump(model, 'spam_classifier_model.joblib')
joblib.dump(cv, 'count_vectorizer.joblib')

['count_vectorizer.joblib']

In [12]:
# Example using your own input
model = joblib.load('spam_classifier_model.joblib')
cv = joblib.load('count_vectorizer.joblib')

new_input = ["Congratulations! You've won a free iPhone. Click here to claim your prize!", "Note: Meeting will be adjourned till the 12th September", "Vorstellung: Bitte melden sie sich"]
new_input_features = cv.transform(new_input)
prediction = model.predict(new_input_features)

print("Prediction for new input:", prediction)

Prediction for new input: [1 0 1]
