In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))
        
import string
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords

#### Load the CSV file. In this case we need to pass an explicit encoding (`latin-1`) so that the non-ASCII characters in the file don't break the parser.


In [None]:
# Read the raw SMS spam CSV from the Kaggle input directory, decoding it as
# latin-1 instead of the default encoding.
df = pd.read_csv(
    "/kaggle/input/sms-spam-collection-dataset/spam.csv",
    encoding='latin-1',
)

In [None]:
# Preview the first five rows (all columns) to see what the raw data looks like.
df.head(n=5)

We see that there are more columns than we really need, so let's keep only the first two columns and all rows.  The format is iloc[ROW RANGE, COLUMN RANGE].

In [None]:
# Keep every row but only the first two columns (iloc[ROWS, COLUMNS]).
# .copy() makes df_sms an independent frame, so the columns added to it later
# in the notebook are written to df_sms itself rather than to a slice of df
# (which would raise pandas' SettingWithCopyWarning).
df_sms = df.iloc[:, 0:2].copy()

In [None]:
# Preview the first five rows of the two-column subset.
df_sms.head(n=5)

Let's relabel the columns

In [None]:
# Rename the two remaining columns in place: first -> 'label', second -> 'message'.
df_sms.columns = ['label', 'message']

In [None]:
# Confirm the new column names by previewing the first five rows.
df_sms.head(n=5)

Let's find out how many items labeled "spam" there are and use describe to get some more info like which sms message is the top spam message

In [None]:
# Per-label summary statistics of the remaining columns.
df_sms.groupby(by='label').describe()

Let's find out the length of these messages and place the value of length into a new column

In [None]:
# Character count of each message, stored in a new 'length' column.
df_sms['length'] = df_sms['message'].map(len)

Let's feature engineer the "spam" / "ham" label and make it binary by converting the values to 1 and 0 in a new column.

In [None]:
# Numeric version of the label: 'ham' -> 0, 'spam' -> 1.
df_sms['label_num'] = df_sms['label'].map(
    {'ham': 0, 'spam': 1}
)

In [None]:
# Preview the frame with the new 'length' and 'label_num' columns.
df_sms.head(n=5)

Let's set up a text processor function that uses the 'stopwords' module and removes punctuation, the standard English stopwords, and any extra stopwords we define.

In [None]:
# Extra SMS-slang tokens treated as stopwords on top of NLTK's English list.
_EXTRA_SMS_STOPWORDS = ('u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure')

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Filled on first use so the NLTK corpus is read once, not once per message.
_sms_stopword_cache = None


def _sms_stopwords():
    """Return the combined stopword set, building it on the first call."""
    global _sms_stopword_cache
    if _sms_stopword_cache is None:
        _sms_stopword_cache = frozenset(stopwords.words('english')).union(
            _EXTRA_SMS_STOPWORDS
        )
    return _sms_stopword_cache


def sms_text_process(mess):
    """Strip punctuation and stopwords from a single SMS message.

    Parameters
    ----------
    mess : str
        Raw message text.

    Returns
    -------
    str
        The remaining words, in their original order and casing, joined by
        single spaces. Empty if every word was a stopword.
    """
    stop_set = _sms_stopwords()
    # Punctuation is removed before the stopword check, which is why the extra
    # list contains apostrophe-free forms such as 'dont' and 'im'.
    without_punct = mess.translate(_PUNCTUATION_TABLE)
    # Stopword matching is case-insensitive; kept words retain their casing.
    return ' '.join(
        word for word in without_punct.split() if word.lower() not in stop_set
    )

We can view the result by applying it just to the first few rows

In [None]:
# Spot-check the cleaner on the first five messages before applying it to all.
df_sms['message'].head(n=5).map(sms_text_process)

Let's apply the processor to all the messages and create a new column with the new clean output

In [None]:
# Clean every message and keep the result next to the raw text.
df_sms['message_clean'] = df_sms['message'].map(sms_text_process)

In [None]:
# Preview the frame including the new 'message_clean' column.
df_sms.head(n=5)

In order to do predictive modeling, we must remove the context of the messages and agree that the context simply doesn't matter.

We don't necessarily care what the message is about, rather we need to determine a way to analyze the words and phrases in it.

We need to break down the sms messages into some form of numerical representation that the computer can ingest and analyze.

In NLP the first step is called tokenization: each message is split into individual words (tokens). We then count the number of times each token appears, no matter where it appears in a message.  Each new word gets its own entry in the vocabulary.  If two messages have the same token counts, it's very likely one is a repeat of the other, and therefore we can begin pattern recognition on that structure and determine whether a message is spam.

CountVectorizer helps us break down text content into a tokenized structure.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

Bag of Words is a name for the process (not the name of a module) that takes the content of each message and breaks down the words into numerical counts.  You lose context of the message and essentially now you have a bag of words and not a real sentence. 

We will apply this process to the clean messages.

In [None]:
# message_clean already has punctuation and stopwords removed, so each document
# only needs to be split on whitespace. A callable analyzer must return a
# sequence of tokens; sms_text_process returns one string, which CountVectorizer
# would iterate character by character and so learn a vocabulary of single
# characters instead of words.
bag_of_words = CountVectorizer(analyzer=str.split).fit(df_sms['message_clean'])

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and returns the learned vocabulary
# terms as a NumPy array.
print(bag_of_words.get_feature_names_out())

In [None]:
# Number of distinct features in the fitted vocabulary.
print(len(bag_of_words.vocabulary_))

In [None]:
# Full feature -> column-index mapping learned by the vectorizer.
print(bag_of_words.vocabulary_)

In [None]:
# Convert every cleaned message into a row of feature counts.
message_bagofwords = bag_of_words.transform(
    raw_documents=df_sms['message_clean']
)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# Learn inverse-document-frequency weights from the count matrix.
tfidf_trans = TfidfTransformer()
tfidf_trans.fit(message_bagofwords)

In [None]:
# Re-weight the raw counts with TF-IDF and report the matrix dimensions.
message_tfidf = tfidf_trans.transform(X=message_bagofwords)
print(message_tfidf.shape)

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the TF-IDF features, using the
# 'label' column as the target.
spam_detect_model = MultinomialNB()
spam_detect_model.fit(message_tfidf, df_sms['label'])

In [None]:
# Take the cleaned text at index label 4 as a sample message to classify.
message = df_sms.loc[4, 'message_clean']
print(message)

In [None]:
# Vectorise the single sample; it is wrapped in a list so the vectorizer
# receives a collection containing one document.
bag_of_words_for_message = bag_of_words.transform(raw_documents=[message])

In [None]:
# Apply the fitted TF-IDF weighting to the sample's count vector.
tfidf = tfidf_trans.transform(X=bag_of_words_for_message)

In [None]:
row = 688
# Vectorise the message at `row` itself so the prediction and the actual label
# printed below refer to the same SMS. Previously the prediction was made on
# the features of the message at index 4 while the label came from row 688.
row_counts = bag_of_words.transform([df_sms['message_clean'][row]])
row_tfidf = tfidf_trans.transform(row_counts)
print('predicted', spam_detect_model.predict(row_tfidf)[0])
print('actual', df_sms.label[row])

In [None]:
# Display the selected row as a one-row DataFrame (positional slice).
df_sms.iloc[row:row + 1]