In [1]:
import pandas as pd
import numpy as np

# Read the labelled messages from spam.csv (path is relative to the working directory).
spam_data = pd.read_csv('spam.csv')

# Re-encode the label column as integers: 1 where the label equals 'spam',
# 0 for every other value.
spam_data['target'] = spam_data['target'].eq('spam').astype(int)

# Preview the first ten rows.
spam_data.head(10)

Unnamed: 0,text,target
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
5,FreeMsg Hey there darling it's been 3 week's n...,1
6,Even my brother is not like to speak with me. ...,0
7,As per your request 'Melle Melle (Oru Minnamin...,0
8,WINNER!! As a valued network customer you have...,1
9,Had your mobile 11 months or more? U R entitle...,1


In [3]:
from sklearn.model_selection import train_test_split


# Split messages and labels into train/test partitions.
# random_state=0 pins the shuffle so the split is reproducible across runs.
messages = spam_data['text']
labels = spam_data['target']
X_train, X_test, y_train, y_test = train_test_split(messages, labels, random_state=0)

In [5]:
# Percentage of all messages that are labelled spam (target == 1).
spam_rows = spam_data[spam_data['target'] == 1]
ratio = len(spam_rows) * 100 / len(spam_data)
ratio

13.406317300789663

In [35]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words model: learn the vocabulary from the training messages and
# build the training document-term matrix with a CountVectorizer.
vect = CountVectorizer()
X_train_vectorized = vect.fit_transform(X_train)

# Multinomial Naive Bayes with smoothing alpha=0.1, trained on the count features.
clf = MultinomialNB(alpha=0.1).fit(X_train_vectorized, y_train)

# Score the held-out messages using the vocabulary learned above.
# Column 1 of predict_proba is the probability of class 1, which is what
# roc_auc_score needs as the positive-class score.
predictions = clf.predict_proba(vect.transform(X_test))
score = roc_auc_score(y_test, predictions[:, 1])
score

0.991545422134696

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF model: learn the vocabulary from the training messages, ignoring
# terms whose document frequency is strictly lower than 3 (min_df=3), and
# build the training TF-IDF matrix.
vect = TfidfVectorizer(min_df=3)
X_train_vectorized = vect.fit_transform(X_train)

# Multinomial Naive Bayes with smoothing alpha=0.1, trained on the TF-IDF features.
clf = MultinomialNB(alpha=0.1).fit(X_train_vectorized, y_train)

# Score the held-out messages using the vocabulary learned above.
# Column 1 of predict_proba is the probability of class 1, which is what
# roc_auc_score needs as the positive-class score.
predictions = clf.predict_proba(vect.transform(X_test))
score = roc_auc_score(y_test, predictions[:, 1])
score

0.9954968337775665

In [21]:
# Average length of documents (number of characters) for not spam and spam documents.
#
# Each class is filtered exactly once and the lengths are collected in a single
# pass over that subset, instead of rebuilding the boolean filter on every loop
# iteration.
# statistics.mean raises StatisticsError if a class has no rows.
import statistics

# Character count of every spam message (target == 1), then the class mean.
totalspam = [len(text) for text in spam_data.loc[spam_data['target'] == 1, 'text']]
spam = statistics.mean(totalspam)

# Character count of every not-spam message (target == 0), then the class mean.
totalnotspam = [len(text) for text in spam_data.loc[spam_data['target'] == 0, 'text']]
notspam = statistics.mean(totalnotspam)

# Display the result as (not spam, spam); without a trailing expression the
# cell would compute both averages but show nothing.
notspam, spam


In [31]:
# Classify previously unseen messages: 1 = spam, 0 = not spam.
#
# `vect` and `clf` are whatever the most recently executed modelling cell left
# behind; on a top-to-bottom run that is the TF-IDF vectorizer and the
# classifier trained on its features.
# NOTE(review): the execution counts in this notebook are not sequential, so
# the saved output may have come from a different `vect`/`clf` pair. Re-run
# from a fresh kernel to confirm.
new_messages = ['super discount',
                'buy one get one free']
new_features = vect.transform(new_messages)
print(clf.predict(new_features))

[1 0]
