In [1]:
import numpy as np
import pandas as pd
import nltk
import re
from collections import Counter
import string
import math


In [2]:
corpus = pd.read_csv("sms.csv")
corpus["message"] = corpus["message"].str.lower()

In [3]:
# src: http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words
def removeStopwords(sentence):
    
    sentence= re.sub('[^A-Za-z0-9]+', ' ', sentence)

    stopwords = ['a', 'about', 'above', 'across', 'after', 'afterwards']
    stopwords += ['again', 'against', 'all', 'almost', 'alone', 'along']
    stopwords += ['already', 'also', 'although', 'always', 'am', 'among']
    stopwords += ['amongst', 'amoungst', 'amount', 'an', 'and', 'another']
    stopwords += ['any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere']
    stopwords += ['are', 'around', 'as', 'at', 'back', 'be', 'became']
    stopwords += ['because', 'become', 'becomes', 'becoming', 'been']
    stopwords += ['before', 'beforehand', 'behind', 'being', 'below']
    stopwords += ['beside', 'besides', 'between', 'beyond', 'bill', 'both']
    stopwords += ['bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant']
    stopwords += ['co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de']
    stopwords += ['describe', 'detail', 'did', 'do', 'done', 'down', 'due']
    stopwords += ['during', 'each', 'eg', 'eight', 'either', 'eleven', 'else']
    stopwords += ['elsewhere', 'empty', 'enough', 'etc', 'even', 'ever']
    stopwords += ['every', 'everyone', 'everything', 'everywhere', 'except']
    stopwords += ['few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first']
    stopwords += ['five', 'for', 'former', 'formerly', 'forty', 'found']
    stopwords += ['four', 'from', 'front', 'full', 'further', 'get', 'give']
    stopwords += ['go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her']
    stopwords += ['here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers']
    stopwords += ['herself', 'him', 'himself', 'his', 'how', 'however']
    stopwords += ['hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed']
    stopwords += ['interest', 'into', 'is', 'it', 'its', 'itself', 'keep']
    stopwords += ['last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made']
    stopwords += ['many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine']
    stopwords += ['more', 'moreover', 'most', 'mostly', 'move', 'much']
    stopwords += ['must', 'my', 'myself', 'name', 'namely', 'neither', 'never']
    stopwords += ['nevertheless', 'next', 'nine', 'no', 'nobody', 'none']
    stopwords += ['noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of']
    stopwords += ['off', 'often', 'on','once', 'one', 'only', 'onto', 'or']
    stopwords += ['other', 'others', 'otherwise', 'our', 'ours', 'ourselves']
    stopwords += ['out', 'over', 'own', 'part', 'per', 'perhaps', 'please']
    stopwords += ['put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed']
    stopwords += ['seeming', 'seems', 'serious', 'several', 'she', 'should']
    stopwords += ['show', 'side', 'since', 'sincere', 'six', 'sixty', 'so']
    stopwords += ['some', 'somehow', 'someone', 'something', 'sometime']
    stopwords += ['sometimes', 'somewhere', 'still', 'such', 'system', 'take']
    stopwords += ['ten', 'than', 'that', 'the', 'their', 'them', 'themselves']
    stopwords += ['then', 'thence', 'there', 'thereafter', 'thereby']
    stopwords += ['therefore', 'therein', 'thereupon', 'these', 'they']
    stopwords += ['thick', 'thin', 'third', 'this', 'those', 'though', 'three']
    stopwords += ['three', 'through', 'throughout', 'thru', 'thus', 'to']
    stopwords += ['together', 'too', 'top', 'toward', 'towards', 'twelve']
    stopwords += ['twenty', 'two', 'un', 'under', 'until', 'up', 'upon']
    stopwords += ['us', 'very', 'via', 'was', 'we', 'well', 'were', 'what']
    stopwords += ['whatever', 'when', 'whence', 'whenever', 'where']
    stopwords += ['whereafter', 'whereas', 'whereby', 'wherein', 'whereupon']
    stopwords += ['wherever', 'whether', 'which', 'while', 'whither', 'who']
    stopwords += ['whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with']
    stopwords += ['within', 'without', 'would', 'yet', 'you', 'your']
    stopwords += ['yours', 'yourself', 'yourselves']
    return ' '.join([w for w in sentence.split() if w not in stopwords])

In [4]:
def clean_sms(string):
    sms = []
    regexp = r'^([^0-9]*)$'
    for i in nltk.word_tokenize(removeStopwords(str(string))):
        if re.match(regexp, i):
            word = nltk.PorterStemmer().stem(i.lower())
            sms += [word]
    return ' '.join(sms)

corpus["message"] = corpus["message"].apply(clean_sms)

In [5]:
wordlist = Counter()
corpus["message"].str.split().apply(wordlist.update)
wordlist

Counter({'paiy': 5,
         'udupi': 1,
         'daffodil': 2,
         'payabl': 17,
         'bicftd': 1,
         'song': 15,
         'saha': 1,
         'bankloan': 1,
         'ramayana': 1,
         'digibank': 6,
         'lokeshreddi': 1,
         'sreedhara': 1,
         'neelsandra': 4,
         'ur': 112,
         'cctv': 2,
         'membership': 32,
         'codenamemega': 1,
         'autotech': 1,
         'bhavishya': 2,
         'anshul': 1,
         'rang': 39,
         'sajjad': 1,
         'vani': 1,
         'gujjubhai': 3,
         'jayaprakash': 5,
         'mdfaiz': 1,
         'lookwel': 2,
         'rohan': 3,
         'shrikant': 1,
         'santro': 860,
         'jackpot': 4,
         'krishnamurthi': 1,
         'sei': 3,
         'pooja': 3,
         'chandra': 2,
         'rupesh': 1,
         'huge': 4,
         'victor': 3,
         'xj': 2,
         'upload': 32,
         'flash': 13,
         'sabudana': 91,
         'ranganath': 1,
         'ca

In [6]:
words = pd.DataFrame.from_dict(wordlist, orient='index').reset_index()
words = words.rename(columns={'index':'word', 0:'count'})
num_words = len(corpus)

In [7]:
words["idf"] = words.apply(lambda x: math.log10(num_words/(1 + x["count"])), axis=1)


In [8]:
def tf(s):
    d = dict(Counter(s))
    for k in d.keys():
        d[k] = d[k] / len(d)
    return d

corpus["tf"] = corpus.apply(lambda x: tf(x["message"].split()), axis=1)

In [9]:
words = words.set_index("word")

In [10]:
def tfidf(tf):
    tfidf = {}
    for k in tf.keys():
        tfidf[k] = tf[k] * words.loc(k)[k].idf
    return tfidf

corpus["tfidf"] = corpus.apply(lambda x: tfidf(x["tf"]), axis=1)

In [11]:
# Uncomment to save to CSV
# corpus.to_csv("corpus.csv")

In [13]:
#sim = np.zeros(shape=(len(corpus), len(corpus)))

#for i in range(len(corpus)):
#    for j in range(len(corpus)):
#        #x = corpus.loc(i)[i].tfidf
#        #y = corpus.loc(j)[j].tfidf
#        if i == j:
#            sim[i][j] = 1.0
#        if i > j:
#            sim[i][j] = sim[j][i]
#        else:
#            sim[i][j] = -1.0
#        
#sim

In [14]:
corpus

Unnamed: 0,id,sender,message,timestamp,tf,tfidf
0,29,HP-CARFIN,verif code,2016-10-10 09:45:49+00,"{'code': 0.160384795471, 'verif': 0.160384795471}","{'code': 0.160384795471, 'verif': 0.160384795471}"
1,29,RM-ICICIB,dear custom ac credit oct info mmt net avail b...,2016-10-10 08:33:51+00,"{'ac': 0.0320769590943, 'avail': 0.03207695909...","{'ac': 0.0320769590943, 'avail': 0.03207695909..."
2,29,VK-ICICIB,dear custom c credit rs oct c link mobil imp ref,2016-10-10 08:33:49+00,"{'mobil': 0.0320769590943, 'link': 0.032076959...","{'mobil': 0.0320769590943, 'link': 0.032076959..."
3,29,VK-ICICIB,ac debit atm cash wdl avbl bal bank phone imob...,2016-10-10 07:48:41+00,"{'debit': 0.0213846393962, 'ac': 0.02138463939...","{'debit': 0.0213846393962, 'ac': 0.02138463939..."
4,29,RM-ICICIB,ac debit atm cash wdl avbl bal bank phone imob...,2016-10-10 06:52:11+00,"{'debit': 0.0213846393962, 'ac': 0.02138463939...","{'debit': 0.0213846393962, 'ac': 0.02138463939..."
5,29,AM-TIKONA,suspens alert tikona wi bro account overdu dea...,2016-10-10 10:22:58+00,"{'tikona': 0.0188687994672, 'servic': 0.018868...","{'tikona': 0.0188687994672, 'servic': 0.018868..."
6,29,AM-TIKONA,receiv cash payment rs serviceid vivek singh b...,2016-10-10 11:03:34+00,"{'singh': 0.0267307992452, 'vivek': 0.02673079...","{'singh': 0.0267307992452, 'vivek': 0.02673079..."
7,29,VM-CARTRD,vw financ mumbai onlin auction live www cartra...,2016-10-10 11:06:57+00,"{'close': 0.0200480994339, 'bid': 0.0200480994...","{'close': 0.0200480994339, 'bid': 0.0200480994..."
8,29,VM-ICICIB,dear custom debit card purchas oct info iin ww...,2016-10-10 12:18:26+00,"{'debit': 0.0229121136388, 'net': 0.0229121136...","{'debit': 0.0229121136388, 'net': 0.0229121136..."
9,29,VK-ICICIB,dear custom debit card purchas oct info min bh...,2016-10-10 12:19:09+00,"{'debit': 0.0229121136388, 'min': 0.0229121136...","{'debit': 0.0229121136388, 'min': 0.0229121136..."
