In [29]:
import pandas as pd
df = pd.read_csv("spam.csv", encoding= 'latin-1')
df = df.loc [:, ~df.columns.str.contains('unnamed', case=False)]
df


Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [30]:
df = df.rename(columns = {df.columns[0]: "label", df.columns[1]: "content"})
df= df[["label", "content"]]
df['label'] = df['label'].map({'ham': 0, 'spam': 1})
df

Unnamed: 0,label,content
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [31]:
from sklearn.model_selection import train_test_split
x_train , x_test, y_train , y_test = train_test_split(df['content'], df['label'] )

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
x_train_tfidf = vectorizer.fit_transform(x_train)
x_test_tfidf = vectorizer.transform(x_test)

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
lr_model = LogisticRegression(max_iter = 300)
lr_model.fit(x_train_tfidf, y_train)
y_pred = lr_model.predict(x_test_tfidf)
print(classification_report(y_test, y_pred))
print(f"accuracy :{accuracy_score(y_test, y_pred):.4f}")

              precision    recall  f1-score   support

           0       0.97      1.00      0.99      1207
           1       0.98      0.82      0.89       186

    accuracy                           0.97      1393
   macro avg       0.98      0.91      0.94      1393
weighted avg       0.97      0.97      0.97      1393

accuracy :0.9742


In [37]:
from sklearn.naive_bayes import MultinomialNB
NB_model = MultinomialNB()
NB_model.fit(x_train_tfidf, y_train)
y_pred_NB = lr_model.predict(x_test_tfidf)
print(classification_report(y_test, y_pred_NB))
print(f"accuracy :{accuracy_score(y_test, y_pred_NB):.4f}")

              precision    recall  f1-score   support

           0       0.97      1.00      0.99      1207
           1       0.98      0.82      0.89       186

    accuracy                           0.97      1393
   macro avg       0.98      0.91      0.94      1393
weighted avg       0.97      0.97      0.97      1393

accuracy :0.9742


In [39]:
from sklearn.svm import LinearSVC
svm_model = LinearSVC()
svm_model.fit(x_train_tfidf, y_train)
y_pred_svm = svm_model.predict(x_test_tfidf)
print(classification_report(y_test, y_pred_svm))
print(f"accuracy :{accuracy_score(y_test, y_pred_svm):.4f}")

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1207
           1       0.98      0.92      0.95       186

    accuracy                           0.99      1393
   macro avg       0.98      0.96      0.97      1393
weighted avg       0.99      0.99      0.99      1393

accuracy :0.9871


In [43]:
zoner_content = ['WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To claim call 09061701461.','Even my brother is not like to speak with me. They treat me like aids patent.','Ok lar... Joking wif u oni...'
            ]
test_tfidf = vectorizer.transform(zoner_content)
y_pred = lr_model.predict(test_tfidf)
y_pred_NB = NB_model.predict(test_tfidf)
y_pred_svm = svm_model.predict(test_tfidf)
print ("predict if the content of message is spam / Notspam")

label_map = { 0 : 'not spam', 1 :'spam'}
for i, content in enumerate(zoner_content):
    print(f"\nMessage: {content}")
    print(f"Logistic regression Model : {label_map[y_pred[i]]}")
    print(f"Naive Bayes Prediction : {label_map[y_pred_NB[i]]}")
    print(f"SVM Model Prediction : {label_map[y_pred_svm[i]]}")


predict if the content of message is spam / Notspam

Message: WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To claim call 09061701461.
Logistic regression Model : spam
Naive Bayes Prediction : spam
SVM Model Prediction : spam

Message: Even my brother is not like to speak with me. They treat me like aids patent.
Logistic regression Model : not spam
Naive Bayes Prediction : not spam
SVM Model Prediction : not spam

Message: Ok lar... Joking wif u oni...
Logistic regression Model : not spam
Naive Bayes Prediction : not spam
SVM Model Prediction : not spam
