# Spam Classifier 

In [1]:
# Import Libraries
import pandas as pd 
import re
import nltk 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

## Load the data

In [3]:
messages = pd.read_csv("D:\Keras_Tutorial\RNN-And-NLP\SMSSpamCollection.txt" , sep="\t", names=["label" , "message"])

In [4]:
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Data Cleaning and Preprocessing

In [6]:
ps = PorterStemmer()
corpus = []
for i in range(0 , len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages["message"][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words("english")]
    review = " ".join(review)
    corpus.append(review)

In [7]:
corpus

ure alex know birthday fifteen minut far concern',
 'sorri got thing may pub later',
 'nah straight bring bud drink someth actual littl use straight cash',
 'haha good hear offici paid market th',
 'mani lick take get center tootsi pop',
 'yup thk r e teacher said make face look longer darren ask cut short',
 'new textbuddi chat horni guy ur area p free receiv search postcod gaytextbuddi com txt one name',
 'today vodafon number end select receiv award number match call receiv award',
 'pleas dont say like hi hi hi',
 'thank u',
 'oh forward messag thought send',
 'got seventeen pound seven hundr ml hope ok',
 'dear voucher holder claim week offer pc go http www e tlp co uk expressoff ts cs appli stop text txt stop',
 'n funni',
 'sweetheart hope kind day one load reason smile biola',
 'login dat time dad fetch home',
 'shower babi',
 'askd u question hour answer',
 'well imma definit need restock thanksgiv let know',
 'said kiss kiss sound effect gorgeou man kind person need smile bri

## Creating a Bag of Words

and convert into independent feature

In [13]:
cv = CountVectorizer(max_features=5000) # i just take only 5000 column , i just only want top most frequently words 
X = cv.fit_transform(corpus).toarray()

In [14]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [15]:
X.shape

(5572, 5000)

## Convert the label into dummie variable and convert into dependent feature



In [16]:
y = pd.get_dummies(messages["label"])
y = y.iloc[: , 1].values

In [17]:
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [18]:
# Split the data into train and test set
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size=0.2 , random_state=42)

### Train the Model and make prediction

In [19]:
model = MultinomialNB()
model.fit(X_train , y_train)

MultinomialNB()

In [21]:
y_pred = model.predict(X_test)
y_pred[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8)

In [22]:
y_test[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8)

In [23]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test , y_pred))

0.9811659192825112
