# Spam Classification

### Data Import

In [1]:
import pandas as pd

In [2]:
text = pd.read_csv("smsspamcollection/SMSSpamCollection.tsv",sep='\t',header=None)

In [3]:
text

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
spam = text[0]
msg = text[1]

In [5]:
spam.value_counts()

ham     4825
spam     747
Name: 0, dtype: int64

### Data Preprocessing

In [6]:
from sklearn.preprocessing import LabelBinarizer


In [7]:
lb = LabelBinarizer()
spam  = lb.fit_transform(spam)

In [8]:
spam

array([[0],
       [0],
       [1],
       ...,
       [0],
       [0],
       [0]])

In [9]:
import re 
import nltk
nltk.download('wordnet')
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sshar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
def remove_special_characters(text):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text

In [11]:
msg = msg.apply(remove_special_characters)

In [12]:
def lower_case(text):
    return text.lower()

In [13]:
msg = msg.apply(lower_case)

In [14]:
msg

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in 2 a wkly comp to win fa cup fina...
3             u dun say so early hor u c already then say
4       nah i dont think he goes to usf he lives aroun...
                              ...                        
5567    this is the 2nd time we have tried 2 contact u...
5568                   will  b going to esplanade fr home
5569    pity  was in mood for that soany other suggest...
5570    the guy did some bitching but i acted like id ...
5571                            rofl its true to its name
Name: 1, Length: 5572, dtype: object

In [15]:
stop_words = nltk.corpus.stopwords.words("English")

In [16]:
def stop(text):
    return ' '.join(word for word in text.split() if word not in stop_words)

In [17]:
msg = msg.apply(stop)

In [18]:
msg

0       go jurong point crazy available bugis n great ...
1                                 ok lar joking wif u oni
2       free entry 2 wkly comp win fa cup final tkts 2...
3                     u dun say early hor u c already say
4             nah dont think goes usf lives around though
                              ...                        
5567    2nd time tried 2 contact u u 750 pound prize 2...
5568                            b going esplanade fr home
5569                          pity mood soany suggestions
5570    guy bitching acted like id interested buying s...
5571                                       rofl true name
Name: 1, Length: 5572, dtype: object

In [19]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [20]:
def clean(text):
    text = WordNetLemmatizer().lemmatize(text)
    text = PorterStemmer().stem(text)
    return text

In [21]:
msg = msg.apply(clean)

In [22]:
msg

0       go jurong point crazy available bugis n great ...
1                                 ok lar joking wif u oni
2       free entry 2 wkly comp win fa cup final tkts 2...
3                     u dun say early hor u c already say
4             nah dont think goes usf lives around though
                              ...                        
5567    2nd time tried 2 contact u u 750 pound prize 2...
5568                             b going esplanade fr hom
5569                              pity mood soany suggest
5570    guy bitching acted like id interested buying s...
5571                                        rofl true nam
Name: 1, Length: 5572, dtype: object

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
msg_train,msg_test,spam_train,spam_test = train_test_split(msg,spam,train_size=0.8)

In [25]:
msg_test

308     lol yes friendship hanging thread cause u wont...
1423    congratulations ur awarded either 500 cd gift ...
214                                           yup noe leh
4591    tomorrow going theatre come wherever u call te...
2651                       like shaking booty dance floor
                              ...                        
4457    want mapquest something look usf dogwood drive...
4511                       weekend fine excuse much decor
5103    news hassling get weed week andres money haugh...
3888                cab availablethey pick drop door step
5446    back good journey let know need receipts shall...
Name: 1, Length: 1115, dtype: object

## BOW and TF-IDF

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
#BOW
cv = CountVectorizer(ngram_range=(1, 3),max_df=0.8,min_df=5)
msg_cv_train = cv.fit_transform(msg_train)
msg_cv_test = cv.transform(msg_test)

In [28]:
print(msg_cv_test.shape)

(1115, 1999)


In [29]:
#TFIDF
tf = TfidfVectorizer(ngram_range=(1, 3),max_df=0.8,min_df=5)
msg_tf_train = tf.fit_transform(msg_train)
msg_tf_test = tf.transform(msg_test)

In [30]:
print(msg_tf_test.shape)

(1115, 1999)


## Naive_bayes 

In [31]:
from sklearn.naive_bayes import MultinomialNB
gb_cv = MultinomialNB()
gb_tf = MultinomialNB()

In [32]:
gb_cv.fit(msg_cv_train,spam_train)
prediction_cv = gb_cv.predict(msg_cv_test)

  return f(**kwargs)


In [33]:
gb_tf.fit(msg_tf_train,spam_train)
prediction_tf = gb_tf.predict(msg_tf_test)

  return f(**kwargs)


In [34]:
prediction_cv

array([0, 1, 0, ..., 0, 0, 0])

## Evaluation

In [35]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

In [36]:
def print_evaluation_scores(y_test, predicted):
    accuracy = accuracy_score(y_test,predicted)
    print(f1_score(y_test,predicted))
    print(roc_auc_score(y_test,predicted))
    print(average_precision_score(y_test,predicted))
    print(recall_score(y_test,predicted))

In [37]:
print("Evaluation for Bag of Words implimentation")
print_evaluation_scores(spam_test,prediction_cv)
print("Evaluation for TF-IDF implimentation")
print_evaluation_scores(spam_test,prediction_tf)

Evaluation for Bag of Words implimentation
0.9583333333333333
0.9708839091192032
0.9266047042297387
0.9470588235294117
Evaluation for TF-IDF implimentation
0.9245283018867924
0.93182384064737
0.8794910777303285
0.8647058823529412


In [38]:
# bag of words implimentation works great with 96% accuarcy