In [375]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer

# The code below is adapted from multiple blogs and links.

In [None]:
# Steps:
# 1. Read the dataset
# 2. Text preprocessing
#      - Remove punctuation
#      - Remove stop words
#      - Tokenize the sentence (prepare the data as a list of words)
# 3. Vectorization
#      - CountVectorizer (get the bag of words, i.e. word counts)
# 4. Convert the word counts into TF-IDF weights
# 5. Use MultinomialNB (multinomial Naive Bayes) to train the model

In [315]:
# Load the tab-separated SMS corpus. Column names are passed explicitly via `names`,
# so the first line of the file is read as data rather than as a header.
# The path uses a forward slash: the previous "DataSet\SMSSpamCollection" relied on the
# unrecognised escape "\S" (a warning on current Python versions) and on a backslash
# separator that only resolves on Windows.
data = pd.read_csv("DataSet/SMSSpamCollection", sep="\t", names=["label", "message"])

In [316]:
# Preview the first five rows to check that the label/message columns parsed as expected.
data.head(n=5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [317]:
# Number of labelled messages in the dataset.
len(data['label'])

5572

In [318]:
# Analyze the data

In [319]:
# Distinct class labels present in the dataset.
pd.unique(data['label'])

array(['ham', 'spam'], dtype=object)

In [320]:
# Class balance: number of messages carrying each label.
data['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [321]:
# Ham sample: the message stored at index label 0.

data.loc[0, 'message']

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [322]:
# Spam sample: the first message labelled "spam".
# Filter and pick by position in a single .loc/.iloc step instead of chained indexing
# with the hard-coded index label 2, which only worked because row 2 happens to be spam.

data.loc[data['label'] == 'spam', 'message'].iloc[0]

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [323]:
# Character count of every message, stored in a new `length` column.
data["length"] = data["message"].map(len)

In [324]:
# Longest message: first row whose length equals the column maximum.
data.loc[data['length'] == data['length'].max(), 'message'].iloc[0]

"For me the love should start with attraction.i should feel that I need her every time around me.she should be the first thing which comes in my thoughts.I would start the day and end it with her.she should be there every time I dream.love will be then when my every breath has her name.my life should happen around her.my life will be named to her.I would cry for her.will give all my happiness and take all her sorrows.I will be ready to fight with anyone for her.I will be in love when I will be doing the craziest things for her.love will be when I don't have to proove anyone that my girl is the most beautiful lady on the whole planet.I will always be singing praises for her.love will be when I start up making chicken curry and end up makiing sambar.life will be the most beautiful then.will get every morning and thank god for the day because she is with me.I would like to say a lot..will tell later.."

In [325]:
# Shortest message: first row whose length equals the column minimum.
data.loc[data['length'] == data['length'].min(), 'message'].iloc[0]

'Ok'

In [326]:
import string

# Characters the standard library treats as punctuation; these are the characters
# that the preprocessing class removes from each message.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [361]:
class PreprocesssText(object):
    """Turn a raw message into the list of tokens used to train the model.

    Preprocessing is done in two steps: punctuation characters are removed,
    then the text is split on whitespace and stop words are dropped. Stop-word
    matching is case-insensitive; the surviving words keep their original case.

    Parameters
    ----------
    stop_words : iterable of str, optional
        Words to discard. When omitted, the English list is read once, on
        first use, from the module-level ``stopwords`` object.

    NOTE(review): ``stopwords`` is not imported in any visible cell. It is
    presumably ``from nltk.corpus import stopwords``; confirm and add it to
    the import cell so the notebook runs on a fresh kernel.
    """

    # Deletion table built once, so stripping punctuation is a single pass.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, stop_words=None):
        # The constructor used to be spelled `___init__` (three underscores)
        # and therefore never ran. `PreprocesssText()` with no arguments
        # still works exactly as before.
        if stop_words is None:
            self._stop_words = None  # loaded lazily by _stop_word_set()
        else:
            self._stop_words = frozenset(word.lower() for word in stop_words)

    def _stop_word_set(self):
        """Return the stop words as a frozenset, loading the default list once."""
        cached = getattr(self, '_stop_words', None)
        if cached is None:
            # Previously this list was rebuilt for every single token and
            # scanned linearly; cache it as a set for O(1) membership tests.
            cached = frozenset(stopwords.words('english'))
            self._stop_words = cached
        return cached

    def remove_punctuations(self, text):
        '''Takes string and returns string with all punctuation characters removed.'''
        return text.translate(self._PUNCTUATION_TABLE)

    def remove_stopwords(self, text):
        """Split ``text`` on whitespace and return the words that are not stop words.

        Each word is lower-cased only for the comparison; the returned list
        holds the words as they appeared in ``text``.
        """
        blocked = self._stop_word_set()
        return [word for word in text.split() if word.lower() not in blocked]

    def token_words(self, text=''):
        """
        Takes String
        Return Token also called list of words that is used to
        Train the Model

        Punctuation is removed first, then stop words are filtered out.
        An empty string yields an empty list.
        """
        message = self.remove_punctuations(text)
        return self.remove_stopwords(message)

In [364]:
# Sanity-check the tokenizer on the first message of the dataset.
obj = PreprocesssText()
mess = data['message'].iat[0]
words = obj.token_words(text=mess)
print(words)

['Go', 'jurong', 'point', 'crazy', 'Available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'Cine', 'got', 'amore', 'wat']


In [365]:
from sklearn.feature_extraction.text import CountVectorizer

In [366]:
# Tokenize the first four messages to eyeball the preprocessing result.
data["message"].iloc[:4].map(obj.token_words)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
Name: message, dtype: object

In [367]:
# Learn the vocabulary from every message, using the custom tokenizer as the analyzer.
bow_transformer = (
    CountVectorizer(analyzer=obj.token_words)
    .fit(data["message"])
)

In [368]:
# Encode every message as a row of token counts over the learned vocabulary.
messages_bow = bow_transformer.transform(raw_documents=data["message"])

In [369]:
# Report the (documents, vocabulary size) dimensions of the count matrix.
print(f"Shape of sparese matrix {messages_bow.shape}")

Shape of sparese matrix (5572, 11425)


In [376]:
# Fit the TF-IDF transformer on the raw word counts; TF-IDF re-expresses each
# count as a weight.

tfidf_transformer = (
    TfidfTransformer()
    .fit(messages_bow)
)


In [378]:
# Apply the fitted TF-IDF weighting to the count matrix.
messages_tfidf = tfidf_transformer.transform(X=messages_bow)

In [379]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the TF-IDF features and the ham/spam labels.
model = (
    MultinomialNB()
    .fit(messages_tfidf, data["label"])
)

In [421]:
# Predict a label for every message and wrap the result in a one-column frame.
# NOTE(review): `messages_tfidf` is the same matrix the model was fitted on, so these
# are training-set predictions, not a held-out evaluation.
predictions = model.predict(messages_tfidf)
pred = pd.DataFrame({'Prediction': predictions})

In [422]:
# Side-by-side view of the true label and the predicted label, aligned on the row index.
pd.DataFrame({'label': data['label'], 'Prediction': pred['Prediction']})

Unnamed: 0,label,Prediction
0,ham,ham
1,ham,ham
2,spam,spam
3,ham,ham
4,ham,ham
...,...,...
5567,spam,spam
5568,ham,ham
5569,ham,ham
5570,ham,ham
