In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string
from nltk.corpus import stopwords
%matplotlib inline

In [39]:

def textPreprocessing(data):
    removePunctuation = [ char.lower() for char in data if char not in string.punctuation ]
    sentences = ''.join(removePunctuation)
    words = sentences.split()
    removeStopwords = [ word for word in words if word not in stopwords.words('english')]
    return removeStopwords


### TFIDF functions 

def get_idf(count_vec,smooth_idf=True, keep_shape=False):
    shape= count_vec.shape
    counts= np.where(count_vec!= 0,1,0) # 1, where there is the term
    if smooth_idf:
        idf=np.log((len(counts+1)/(np.sum(counts,0)+1)))+1
    else:
        idf=np.log((len(counts)/(np.sum(counts,0))))+1
    if keep_shape:
        idf=np.repeat(idf,shape[0]).reshape((shape[1],shape[0])).T
    return idf


def get_tf(count_vec):
    shape= count_vec.shape
    tf= count_vec/np.repeat(np.sum(count_vec,1),shape[1]).reshape(shape)
    return tf
    
def get_tfidf(count_vec):
    shape= count_vec.shape
    idf=get_idf(count_vec,smooth_idf=True, keep_shape= True)
    tf= get_tf(count_vec)
    tfidf=tf*idf
    tfidf_norm=tfidf/np.repeat(np.sqrt(np.sum(tfidf**2,1)),shape[1]).reshape(shape)
    return tfidf_norm
    


In [40]:
data = pd.read_csv('smsspamcollection/SMSSpamCollection' , sep='\t' , names =['label','message'])

In [33]:
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [34]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [35]:
data.label.unique() #Binary Classification

array(['ham', 'spam'], dtype=object)

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
wordVector = CountVectorizer(analyzer=textPreprocessing)
finalWordVectorCreator = wordVector.fit(data['message'])

In [37]:
#finalWordVectorCreator.vocabulary_

In [38]:
len(finalWordVectorCreator.vocabulary_)

9530

In [52]:
bow = finalWordVectorCreator.transform(data['message'])
#Create Features
features = tfidfObject.transform(bow)

In [42]:
count_vec=bow.toarray() # this is count matrix , raw freq for each word in the document
count_vec.shape

(5572, 9530)

In [43]:
#Tf-IDF
from sklearn.feature_extraction.text import TfidfTransformer
tfidfObject = TfidfTransformer(smooth_idf=True).fit(bow)  # converted count vec to tfidf

In [44]:
tfidfObject.idf_

array([8.23939443, 8.5270765 , 8.93254161, ..., 8.93254161, 6.98663146,
       8.93254161])

In [45]:
idf=get_idf(count_vec,smooth_idf=True)
idf

array([8.23921497, 8.52689705, 8.93236215, ..., 8.93236215, 6.98645201,
       8.93236215])

In [54]:
# get_idf is working well

In [46]:
idf=get_idf(count_vec,smooth_idf=True, keep_shape= True)
idf

array([[8.23921497, 8.52689705, 8.93236215, ..., 8.93236215, 6.98645201,
        8.93236215],
       [8.23921497, 8.52689705, 8.93236215, ..., 8.93236215, 6.98645201,
        8.93236215],
       [8.23921497, 8.52689705, 8.93236215, ..., 8.93236215, 6.98645201,
        8.93236215],
       ...,
       [8.23921497, 8.52689705, 8.93236215, ..., 8.93236215, 6.98645201,
        8.93236215],
       [8.23921497, 8.52689705, 8.93236215, ..., 8.93236215, 6.98645201,
        8.93236215],
       [8.23921497, 8.52689705, 8.93236215, ..., 8.93236215, 6.98645201,
        8.93236215]])

In [50]:
tf= get_tfidf(count_vec)

  tf= count_vec/np.repeat(np.sum(count_vec,1),shape[1]).reshape(shape)


In [51]:
tf[tf!=0]

array([0.33293859, 0.25317127, 0.31782557, ..., 0.48924674, 0.69008533,
       0.53331029])

In [53]:
features.toarray()[features.toarray()!=0]

array([0.33293662, 0.25317137, 0.31782399, ..., 0.48924917, 0.6900826 ,
       0.5333116 ])

In [61]:
# my functions are working good 

In [55]:
#Create Model
from sklearn.naive_bayes import MultinomialNB #Best for Text Data Features
model = MultinomialNB()
model.fit(features,data['label'])

MultinomialNB()

In [56]:
model.score(features,data['label'])

0.9775664034458005

In [60]:
#Realtime Input Example
inputSMS = input("Enter SMS to check: ")
preprocessText = textPreprocessing(inputSMS)
bowText = finalWordVectorCreator.transform(preprocessText)
featureText = tfidfObject.transform(bowText)
predict = model.predict(featureText)[0]
print("Given SMS is a {} SMS".format(predict))

# win a guaranteed lottery

Enter SMS to check: win a guaranteed lottery
Given SMS is a spam SMS
