# Natural Language Processing

## Importing the libraries

In [0]:
import numpy as np
import pandas as pd
import nltk 
import re
import sklearn

## Importing the dataset

In [0]:
dataset = pd.read_table('SMSSpamCollection',header=None,encoding='utf-8')
#Print useful information
print(dataset.head())

## Check Class Distribution

In [0]:
classes = dataset[0]
print(classes.value_counts())

## Preprocess data

In [0]:
#0 for ham and 1 for spam
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
Y = encoder.fit_transform(classes)
print(classes[:10])
print(Y[:10])

In [0]:
#Store the SMS message data
text_messages = dataset[1]
print(text_messages[:10])

## Cleaning the texts

In [0]:
# use regular expressions to replace email addresses, URLs, phone numbers, other numbers

# Replace email addresses with 'email'
processed = text_messages.str.replace('[^.+@[^\.].*\.[a-z]{2,}$]',
                                 'emailaddress')

# Replace URLs with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress')

# Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
processed = processed.str.replace(r'£|\$', 'moneysymb')
    
# Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr')
    
# Replace numbers with 'numbr'
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

In [0]:
# Remove punctuation
processed = text_messages.str.replace(r'[^\w\d\s]', ' ')

# Replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

In [0]:
# change words to lower case - Hello, HELLO, hello are all the same word
processed = processed.str.lower()
print(processed)

In [130]:
nltk.download('stopwords')
from nltk.corpus import stopwords
# remove stop words from text messages

stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(
    term for term in x.split() if term not in stop_words))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
# Remove word stems using a Porter stemmer
ps = nltk.PorterStemmer()

processed = processed.apply(lambda x: ' '.join(
    ps.stem(term) for term in x.split()))

## Creating the Bag of Words model

In [132]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=3000) 
X = cv.fit_transform(processed).toarray()
y = dataset.iloc[:,0].values
print(X.shape)

(5572, 3000)


## Splitting the dataset into the Training set and Test set

In [0]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state =0)

## Training the Naive Bayes model on the Training set

In [134]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

## Predicting the Test set results

In [0]:
y_pred = classifier.predict(X_test)

## Making the Confusion Matrix

In [136]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test,y_pred)
print(cm)
#here 1036+170 are identified correctly
#The matrix format is as shown :
#                            predicted:
#                        ham:         spam:
#    actual : ham  :     1032          176
#             spam :      16           169
#

[[1032  176]
 [  16  169]]


In [137]:
#Accuracy
(1036+172)/(1036+172+15+170)*100

86.71931083991386