# PROJECT : Spam Data Analytics

`WORK IN PROGRESS`

In [1]:
import pandas as pd 
import numpy as np 
import csv

In [2]:
# Disabled debugging aid: prints every raw line of the dataset file with
# trailing whitespace stripped. Left commented out so it does not run.
# with open("./SMSSpamCollection.csv",'r+') as data:
#     for line in data:
#         print(line.rstrip())

In [3]:
# Load the SMS collection as a two-column frame.
# - sep='\t': fields are tab-separated.
# - quoting=csv.QUOTE_NONE: quote characters inside messages are kept as
#   literal text instead of being treated as field delimiters.
# - names=[...]: supplies the column names "label" and "message".
df = pd.read_csv("./SMSSpamCollection.csv",sep='\t', quoting=csv.QUOTE_NONE,names=["label", "message"])

# Preview the first rows.
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Per-label summary (count / unique / top / freq) of the message column;
# shows how many ham vs. spam rows there are and how many are duplicates.
df.groupby('label').describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4827,4518,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [5]:
# Class labels ("ham" / "spam") as a Series.
# NOTE(review): `target` is not read again in the visible notebook; later
# cells use df['label'] directly.
target = df['label']

## NLTK 

In [6]:
import nltk 
# nltk.download('all')

from nltk.tokenize import word_tokenize

## Tokenization 

In [7]:
def splitIntoTokens(text):
    """Lower-case `text` and return the list of tokens produced by NLTK's word_tokenize."""
    return word_tokenize(text.lower())


In [8]:
# Tokenize every message; each cell of the new column holds a list of
# lower-cased tokens.
df['tokenized_message'] = df['message'].apply(splitIntoTokens)

## Lemmatization (convert a word into its base form)

In [9]:
from nltk.stem.wordnet import WordNetLemmatizer

def getLemmas(tokens):
    """Return a new list holding the WordNet lemma of each token, in order.

    A fresh WordNetLemmatizer is created on every call, and each token is
    lemmatized with the lemmatizer's default arguments.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return [lemmatize(token) for token in tokens]

# Lemmatize each token list; the result is stored alongside the tokens so
# both forms can be compared.
df['lemmatized_message'] = df['tokenized_message'].apply(getLemmas)

In [10]:
# Spot-check the row at position 11 to compare the raw, tokenized and
# lemmatized versions of one message.
df.iloc[11]

label                                                              spam
message               SIX chances to win CASH! From 100 to 20,000 po...
tokenized_message     [six, chances, to, win, cash, !, from, 100, to...
lemmatized_message    [six, chance, to, win, cash, !, from, 100, to,...
Name: 11, dtype: object

## Removing Stop Words 

In [11]:
from nltk.corpus import stopwords

# English stop words from the NLTK corpus, held in a set for membership
# tests in removeStopWords.
stopWords = set(stopwords.words('english'))

In [12]:
def removeStopWords(lemmas):
    """Drop stop words from `lemmas` and join what remains into one string.

    Parameters
    ----------
    lemmas : iterable of str
        Tokens to filter. They are compared against the module-level
        `stopWords` set exactly as given (no case folding is done here).

    Returns
    -------
    str
        The surviving tokens joined by single spaces; an empty string when
        every token is a stop word or `lemmas` is empty.
    """
    # The original first bound the result name to an empty list and then
    # immediately rebound it to a string; that dead initialisation is removed.
    return ' '.join(word for word in lemmas if word not in stopWords)

# Build the text fed to the vectorizer: lemmas with stop words removed,
# re-joined into a single space-separated string per message.
df['filtered_message'] = df['lemmatized_message'].apply(removeStopWords)

## Bag Of Words

## Term Document Matrix
- The Term Document Matrix (TDM) is a matrix that contains the frequency of occurrence of terms in a collection of documents.
- In a Term Frequency–Inverse Document Frequency (TF-IDF) matrix, each term's frequency in a document is weighted by its Inverse Document Frequency (IDF).
- IDF diminishes the weight of the most commonly occurring words and increases the weight of rare words.

In [13]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# TF-IDF vectorizer configuration:
# - ngram_range=(1, 2): features are single words and two-word sequences.
# - max_df=0.7: a float here is a proportion of documents, so terms present
#   in more than 70% of the messages are ignored.
tfidfVectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    # A float min_df is also a proportion of documents; 1/len(df) therefore
    # amounts to "about one document".
    # NOTE(review): presumably equivalent to the integer min_df=1 — confirm,
    # since the product is subject to floating-point rounding.
    min_df = (1/len(df['label'])), 
    max_df = 0.7
)


In [14]:
# Learn the vocabulary and IDF weights from the filtered messages.
# NOTE(review): this is fit on every row of df, including the rows that are
# later held out as the test set, so test-set text influences the features.
tfidfModel = tfidfVectorizer.fit(df['filtered_message'])

In [15]:
# Transform every message into its TF-IDF vector and convert the result to a
# dense NumPy array via .toarray().
# NOTE(review): with unigram+bigram features the dense array may be large —
# check memory use before scaling to bigger corpora.
xMatrix = tfidfModel.transform(df['filtered_message']).toarray()

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; random_state=7 makes the split
# repeatable.
# NOTE(review): no `stratify` argument is passed although the classes are
# imbalanced (4827 ham vs. 747 spam in the describe() output above).
xTrain,xTest,yTrain,yTest = train_test_split(xMatrix,df['label'],test_size=0.1,random_state=7)

## Decision Tree Classification

In [17]:
from sklearn.tree import DecisionTreeClassifier

# Shared random seed; the SGD, SVM and Random Forest cells below also pass
# it as random_state.
seed = 7
dtClassifier = DecisionTreeClassifier(random_state=seed)

# Train on the 90% split; the value returned by fit() is bound to dtModel.
dtModel = dtClassifier.fit(xTrain,yTrain)

# Predicted labels for the held-out rows.
yPredictDT = dtModel.predict(xTest)

# score() result on the held-out split (see the printed value below).
dtScore = dtClassifier.score(xTest,yTest)

print("Decision Tree Score :",dtScore)

Decision Tree Score : 0.967741935483871


## Gaussian Naive Bayes

In [18]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings, trained on the dense TF-IDF
# features.
gnbClassifier = GaussianNB()

gnbModel = gnbClassifier.fit(xTrain,yTrain)

# Predicted labels for the held-out rows.
yPredictGNB = gnbModel.predict(xTest)

# score() result on the held-out split.
gnbScore = gnbModel.score(xTest,yTest)

print("Gaussian Naive Bayes Score :",gnbScore)

Gaussian Naive Bayes Score : 0.9086021505376344


## Stochastic Gradient Descent

In [19]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with stochastic gradient descent using the
# modified Huber loss; shuffling is enabled and seeded for repeatability.
sgdClassifier = SGDClassifier(loss='modified_huber', shuffle=True,random_state=seed)

sgdModel = sgdClassifier.fit(xTrain,yTrain)

# Predicted labels for the held-out rows. The original assigned the result of
# score() here, so yPredictSGD held a single accuracy float instead of the
# per-row predictions that the other classifier cells store.
yPredictSGD = sgdModel.predict(xTest)

# score() result on the held-out split.
sgdScore = sgdModel.score(xTest,yTest)

print("Stochastic Gradient Descent Classification score :",sgdScore)

Stochastic Gradient Descent Classification score : 0.9713261648745519


## Support Vector Machine

In [20]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel and C=0.025.
# NOTE(review): the recorded score below is identical to the Random Forest
# score — check whether both models are predicting only the majority class.
svClassifier = SVC(kernel="linear", C=0.025,random_state=seed)

svModel = svClassifier.fit(xTrain, yTrain)

# Predicted labels for the held-out rows.
yPredictSV = svClassifier.predict(xTest)

# score() result on the held-out split.
svScore = svClassifier.score(xTest, yTest)

print('SVM Classifier : ',svScore)

SVM Classifier :  0.8566308243727598


## Random Forest

In [21]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 15 trees, each limited to depth 5 and considering 60
# features per split; seeded for repeatability.
# NOTE(review): the recorded score below is identical to the SVM score —
# check whether the forest is predicting only the majority class.
rfClassifier = RandomForestClassifier(max_depth=5, n_estimators=15, max_features=60,random_state=seed)

rfModel = rfClassifier.fit(xTrain, yTrain)

# Predicted labels for the held-out rows.
yPredictRF = rfClassifier.predict(xTest)

# score() result on the held-out split.
rfScore = rfClassifier.score(xTest, yTest)

print('Random Forest Classifier : ',rfScore)

Random Forest Classifier :  0.8566308243727598


In [22]:
# Same value as the seed assigned in the Decision Tree cell.
seed=7
from sklearn.model_selection import StratifiedShuffleSplit
# One stratified shuffle split with 10% of the rows as the test fold.
# With n_splits=1 this is a single hold-out evaluation, not multi-fold
# cross validation.
sss = StratifiedShuffleSplit(n_splits=1,test_size=0.1, random_state=seed)
# The return value of get_n_splits() is not used; the call has no effect on
# the split produced later by sss.split().
sss.get_n_splits(xMatrix,df['label'])
print(sss)

StratifiedShuffleSplit(n_splits=1, random_state=7, test_size=0.1,
            train_size=None)


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC,LinearSVC
from sklearn.multiclass import OneVsRestClassifier


# Estimators compared on the same stratified split(s) produced by `sss`.
# Every estimator that accepts random_state is seeded with `seed`, matching
# the individual classifier cells above, so repeated runs print the same
# scores. GaussianNB and KNeighborsClassifier take no random_state.
classifiers = [
    DecisionTreeClassifier(random_state=seed),
    GaussianNB(),
    SGDClassifier(loss='modified_huber', shuffle=True, random_state=seed),
    SVC(kernel="linear", C=0.025, random_state=seed),
    KNeighborsClassifier(),
    OneVsRestClassifier(LinearSVC(random_state=seed)),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=10, random_state=seed),
    AdaBoostClassifier(random_state=seed),
   ]
for clf in classifiers:
    score=0
    for train_index, test_index in sss.split(xMatrix,df['label']):
        X_train, X_test = xMatrix[train_index], xMatrix[test_index]
        # sss.split yields positional indices, so select labels by position
        # with .iloc. Plain df['label'][idx] is label-based and only matches
        # while df keeps its default 0..n-1 index.
        y_train, y_test = df['label'].iloc[train_index], df['label'].iloc[test_index]
        clf.fit(X_train, y_train)
        score=score+clf.score(X_test, y_test)
    # Report the mean over splits so the number stays comparable if n_splits
    # is raised; with the current n_splits=1 this equals the single score.
    score=score/sss.get_n_splits()
    print(clf,score)