In [1]:
import numpy as np 
import pandas as pd

In [2]:
# Path to the dataset CSV, relative to the notebook's working directory.
DATA_PATH = "../input/spam-text-message-classification/SPAM text message 20170820 - Data.csv"

# Read the CSV into a DataFrame and preview its first rows.
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(5572, 2)

In [4]:
# Count the messages per label to check how balanced the two classes are.
print(data["Category"].value_counts())

ham     4825
spam     747
Name: Category, dtype: int64


In [5]:
from sklearn.preprocessing import LabelEncoder

# Turn the text labels into integer codes and keep them in a separate column,
# leaving the original "Category" column untouched.
labelencoder = LabelEncoder()
encoded_categories = labelencoder.fit_transform(data["Category"])
data["Category_encoding"] = encoded_categories

# Preview the frame to confirm the mapping of labels to codes.
data.head()

Unnamed: 0,Category,Message,Category_encoding
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [6]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud

In [7]:
# Lower-case every message, then split each one into a list of tokens.
lowercased_messages = data["Message"].str.lower()
messages = lowercased_messages.apply(word_tokenize)
print(messages)

0       [go, until, jurong, point, ,, crazy.., availab...
1                [ok, lar, ..., joking, wif, u, oni, ...]
2       [free, entry, in, 2, a, wkly, comp, to, win, f...
3       [u, dun, say, so, early, hor, ..., u, c, alrea...
4       [nah, i, do, n't, think, he, goes, to, usf, ,,...
                              ...                        
5567    [this, is, the, 2nd, time, we, have, tried, 2,...
5568      [will, ü, b, going, to, esplanade, fr, home, ?]
5569    [pity, ,, *, was, in, mood, for, that, ., so, ...
5570    [the, guy, did, some, bitching, but, i, acted,...
5571                  [rofl, ., its, true, to, its, name]
Name: Message, Length: 5572, dtype: object


In [8]:
# Remove stop words from a tokenized message.
def remove_stop_words(messages, stop_words=None):
    """Return the tokens of one message that are not stop words.

    Parameters
    ----------
    messages : iterable of str
        Tokens of a single message (the parameter name is kept for
        backward compatibility with existing callers).
    stop_words : collection of str, optional
        Words to drop. When omitted, NLTK's English stop-word list is used;
        it is loaded on the first call and cached on the function object.

    Returns
    -------
    list of str
        The tokens in their original order, with stop words removed.
    """
    if stop_words is None:
        # Looking the list up once instead of once per token avoids repeating
        # loop-invariant work; a frozenset also gives constant-time lookups.
        stop_words = getattr(remove_stop_words, "_english_stop_words", None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            remove_stop_words._english_stop_words = stop_words
    elif not isinstance(stop_words, (set, frozenset)):
        # Accept any collection (list, tuple, ...) but test membership on a set.
        stop_words = frozenset(stop_words)
    return [token for token in messages if token not in stop_words]
# Filter every tokenized message; each element of the Series is a list of tokens.
messages=messages.apply(remove_stop_words)

In [9]:
# Inspect the token lists after stop-word removal.
print(messages)

0       [go, jurong, point, ,, crazy.., available, bug...
1                [ok, lar, ..., joking, wif, u, oni, ...]
2       [free, entry, 2, wkly, comp, win, fa, cup, fin...
3       [u, dun, say, early, hor, ..., u, c, already, ...
4       [nah, n't, think, goes, usf, ,, lives, around,...
                              ...                        
5567    [2nd, time, tried, 2, contact, u., u, £750, po...
5568                [ü, b, going, esplanade, fr, home, ?]
5569           [pity, ,, *, mood, ., ..., suggestions, ?]
5570    [guy, bitching, acted, like, 'd, interested, b...
5571                                [rofl, ., true, name]
Name: Message, Length: 5572, dtype: object


In [10]:
# Lemmatize the tokens of a message and join them back into text.
def lemmatize(messages):
    """Lemmatize each token of one message and return them as a single string.

    Parameters
    ----------
    messages : iterable of str
        Tokens of a single message.

    Returns
    -------
    str
        The lemmatized tokens, in order, separated by single spaces.
    """
    to_lemma = WordNetLemmatizer().lemmatize
    return " ".join(to_lemma(token) for token in messages)

# Replace each token list with a single lemmatized, space-joined string.
# NOTE(review): this rebinds `messages` from lists of tokens to strings, so re-running
# this cell on its own would make lemmatize() iterate over characters instead of tokens.
messages = messages.apply(lemmatize)
print(messages)

0       go jurong point , crazy.. available bugis n gr...
1                         ok lar ... joking wif u oni ...
2       free entry 2 wkly comp win fa cup final tkts 2...
3             u dun say early hor ... u c already say ...
4               nah n't think go usf , life around though
                              ...                        
5567    2nd time tried 2 contact u. u £750 pound prize...
5568                        ü b going esplanade fr home ?
5569                     pity , * mood . ... suggestion ?
5570    guy bitching acted like 'd interested buying s...
5571                                     rofl . true name
Name: Message, Length: 5572, dtype: object


In [11]:
# Store the preprocessed text next to the raw message so both can be compared.
data["new_Message"]=messages
# Render the frame with the added column.
display(data)

Unnamed: 0,Category,Message,Category_encoding,new_Message
0,ham,"Go until jurong point, crazy.. Available only ...",0,"go jurong point , crazy.. available bugis n gr..."
1,ham,Ok lar... Joking wif u oni...,0,ok lar ... joking wif u oni ...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,0,u dun say early hor ... u c already say ...
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,"nah n't think go usf , life around though"
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1,2nd time tried 2 contact u. u £750 pound prize...
5568,ham,Will ü b going to esplanade fr home?,0,ü b going esplanade fr home ?
5569,ham,"Pity, * was in mood for that. So...any other s...",0,"pity , * mood . ... suggestion ?"
5570,ham,The guy did some bitching but I acted like i'd...,0,guy bitching acted like 'd interested buying s...


In [12]:
# Train/test split: hold out 20% of the rows for evaluation; random_state fixes the split.
# NOTE(review): the classes are imbalanced (4825 ham vs 747 spam) and the split is not
# stratified -- consider stratify=Y; confirm first, since it changes the reported metrics.
from sklearn.model_selection import train_test_split
X=data["new_Message"]
Y=data["Category_encoding"]
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=42)

In [13]:
# TF-IDF features built from the preprocessed message text.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(strip_accents='ascii')
# Fit the vectorizer on the training messages only ...
tfidf_train = vectorizer.fit_transform(x_train)
# ... and reuse the fitted vectorizer for the test messages (transform, not fit_transform).
tfidf_test = vectorizer.transform(x_test)

In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [15]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training matrix.
mNB = MultinomialNB()
mNB.fit(tfidf_train, y_train)

# Predict the held-out messages, then print accuracy followed by the
# per-class report and the confusion matrix, each preceded by a blank gap.
Y_pred = mNB.predict(tfidf_test)
accuracy = accuracy_score(y_test, Y_pred)
print(f"Accuracy: {accuracy!s}")
for report in (classification_report(y_test, Y_pred),
               confusion_matrix(y_test, Y_pred)):
    print('\n')
    print(report)

Accuracy: 0.9713004484304932


              precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       1.00      0.79      0.88       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



[[966   0]
 [ 32 117]]


In [16]:
from sklearn import svm

# Fit a support vector classifier (default settings) on the TF-IDF training matrix.
clf = svm.SVC()
clf.fit(tfidf_train, y_train)

# Predict the held-out messages, then print accuracy followed by the
# per-class report and the confusion matrix, each preceded by a blank gap.
Y_pred = clf.predict(tfidf_test)
accuracy = accuracy_score(y_test, Y_pred)
print(f"Accuracy: {accuracy!s}")
for report in (classification_report(y_test, Y_pred),
               confusion_matrix(y_test, Y_pred)):
    print('\n')
    print(report)

Accuracy: 0.9838565022421525


              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.88      0.94       149

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



[[966   0]
 [ 18 131]]
