In [54]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import confusion_matrix

In [55]:
# Read spam.csv (decoded as latin-1), keep only columns v1 and v2, and give
# them descriptive names -- all in one chained expression so `data` is bound once.
data = (
    pd.read_csv("spam.csv", encoding="latin-1")[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)

In [56]:
def review_messages(msg):
    """Return the lower-cased form of a single message.

    Parameters
    ----------
    msg : str
        Message text to normalise.

    Returns
    -------
    str
        ``msg`` with all cased characters converted to lowercase.
    """
    return msg.lower()

In [57]:
# Preview the message column as loaded, before any preprocessing is applied.
data['text']

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: text, Length: 5572, dtype: object

In [58]:
# Normalise every message by passing it through review_messages (lower-casing),
# element by element, and store the result back in the 'text' column.
data['text'] = data['text'].map(review_messages)

In [59]:
# Hold out 10% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.1,
    random_state=1,
)

In [60]:
# Display the message column again; the text is now lower-cased by the
# preprocessing cell that ran earlier.
data['text']

0       go until jurong point, crazy.. available only ...
1                           ok lar... joking wif u oni...
2       free entry in 2 a wkly comp to win fa cup fina...
3       u dun say so early hor... u c already then say...
4       nah i don't think he goes to usf, he lives aro...
                              ...                        
5567    this is the 2nd time we have tried 2 contact u...
5568                will ì_ b going to esplanade fr home?
5569    pity, * was in mood for that. so...any other s...
5570    the guy did some bitching but i acted like i'd...
5571                           rofl. its true to its name
Name: text, Length: 5572, dtype: object

In [61]:
# Learn the TF-IDF vocabulary and weights from the training messages only,
# then replace the raw training text with its TF-IDF feature matrix.
# NOTE: X_train is overwritten here, so re-running this cell on its own would
# feed the already-vectorised matrix back into the vectorizer; re-run the
# train/test split cell first.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(raw_documents=X_train)

In [62]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [63]:
# Fit a multinomial naive Bayes classifier on the TF-IDF training features.
# fit() returns the estimator itself, so construction and training are chained.
clf = MultinomialNB(alpha=0.01).fit(X_train, y_train)

# Vectorise the held-out messages with the vocabulary learned from the
# training data (transform only -- no refitting), predict their labels, and
# print the confusion matrix (rows: true labels, columns: predicted labels).
X_test_vec = vectorizer.transform(X_test)
y_pred = clf.predict(X_test_vec)
print(confusion_matrix(y_test, y_pred))

[[489   1]
 [  6  62]]


In [64]:
# Re-number the held-out messages 0..n-1 so that X_test[i] lines up with
# row i of X_test_vec. The name is rebound to the returned Series instead of
# using inplace=True, which avoids mutating the object handed back by
# train_test_split and keeps the step safe to chain or re-run.
X_test = X_test.reset_index(drop=True)

In [65]:
# Spot-check: show the held-out message at position 44 (the index was reset
# to 0..n-1 in the preceding cell, so the label equals the row position).
X_test[44]

'3 free tarot texts! find out about your love life now! try 3 for free! text chance to 85555 16 only! after 3 free, msgs å£1.50 each'

In [66]:
# Predict the label for that message using row 44 of the vectorised test set.
clf.predict(X_test_vec[44])

array(['spam'], dtype='<U4')

In [67]:
# Spot-check a second held-out message, at position 9.
X_test[9]

"oh my god. i'm almost home"

In [68]:
# Predict the label for the message at position 9 from its TF-IDF row.
clf.predict(X_test_vec[9])

array(['ham'], dtype='<U4')

In [69]:
# Spot-check a third held-out message, at position 8.
X_test[8]

"have you heard about that job? i'm going to that wildlife talk again tonight if u want2come. its that2worzels and a wizzle or whatever it is?! "

In [70]:
# Predict the label for the message at position 8 from its TF-IDF row.
clf.predict(X_test_vec[8])

array(['ham'], dtype='<U4')