# Spam Detection

In [1]:
import numpy as np
import pandas as pd

In [2]:
df=pd.read_csv('spam.tsv', sep='\t')

In [3]:
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [4]:
len(df)

5572

In [5]:
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

# Balance Data

In [7]:
ham=df[df['label']=='ham']
ham.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2
6,ham,Even my brother is not like to speak with me. ...,77,2


In [8]:
spam=df[df['label']=='spam']
spam.head()

Unnamed: 0,label,message,length,punct
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
5,spam,FreeMsg Hey there darling it's been 3 week's n...,147,8
8,spam,WINNER!! As a valued network customer you have...,157,6
9,spam,Had your mobile 11 months or more? U R entitle...,154,2
11,spam,"SIX chances to win CASH! From 100 to 20,000 po...",136,8


In [10]:
ham.shape, spam.shape

((4825, 4), (747, 4))

In [48]:
ham=ham.sample(spam.shape[0])
ham.shape, spam.shape

((747, 4), (747, 4))

In [13]:
data = ham.append(spam,ignore_index=True)
data.tail()

Unnamed: 0,label,message,length,punct
1489,spam,Want explicit SEX in 30 secs? Ring 02073162414...,90,3
1490,spam,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...,158,5
1491,spam,Had your contract mobile 11 Mnths? Latest Moto...,160,8
1492,spam,REMINDER FROM O2: To get 2.50 pounds free call...,147,3
1493,spam,This is the 2nd time we have tried 2 contact u...,160,8


### Data Preparation

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.pipeline import Pipeline

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
data.head()

Unnamed: 0,label,message,length,punct
0,ham,"Hi elaine, is today's meeting confirmed?",40,3
1,ham,"K, wait chikku..il send aftr &lt;#&gt; mins",44,8
2,ham,How much she payed. Suganya.,28,2
3,ham,Another month. I need chocolate weed and alcohol.,49,2
4,ham,S now only i took tablets . Reaction morning o...,50,2


In [38]:
x_train, x_test,y_train,y_test=train_test_split(data['message'],data['label'],test_size=0.3,random_state=0, shuffle=True, stratify=data['label'])

### Bag of words Creation

In [30]:
vectorizer = TfidfVectorizer()

In [31]:
x_train=vectorizer.fit_transform(x_train)

In [33]:
x_train.shape

(1045, 3704)

In [34]:
x_train

<1045x3704 sparse matrix of type '<class 'numpy.float64'>'
	with 17934 stored elements in Compressed Sparse Row format>

### Pipeline and RF

In [36]:
clf=Pipeline([('tfidf',TfidfVectorizer()),('clf',RandomForestClassifier(n_estimators=100,n_jobs=-1))])

In [39]:
clf.fit(x_train,y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf', RandomForestClassifier(n_jobs=-1))])

In [40]:
y_pred=clf.predict(x_test)

In [41]:
confusion_matrix(y_test,y_pred)

array([[222,   3],
       [ 23, 201]], dtype=int64)

In [42]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         ham       0.91      0.99      0.94       225
        spam       0.99      0.90      0.94       224

    accuracy                           0.94       449
   macro avg       0.95      0.94      0.94       449
weighted avg       0.95      0.94      0.94       449



In [43]:
accuracy_score(y_test,y_pred)

0.9420935412026726

In [44]:
clf.predict(["hi iam Nithya"])

array(['ham'], dtype=object)

In [49]:
clf.predict(["congrajulations!, you have won free tickets to USA."])

array(['spam'], dtype=object)