In [1]:
import sys
import nltk
import sklearn
import pandas as pd
import numpy as np

## 1. Load the dataset

In [2]:
# Read the SMSSpamCollection file as tab-separated text. The file has no
# header row, so pandas names the two columns 0 and 1.
# NOTE(review): the default parser treats `"` as a quote character; if any raw
# message contains an unbalanced double quote, adjacent rows can be merged.
# Consider quoting=csv.QUOTE_NONE and re-check the row count -- TODO confirm.
df = pd.read_csv(
    "SMSSpamCollection",
    sep="\t",
    header=None,
    encoding="utf-8",
)

In [3]:
# Summarise the frame: number of rows, per-column non-null counts, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [4]:
# Preview the first rows: column 0 holds the label, column 1 the message text.
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
## Class distribution: count how many messages carry each label.
## Column 0 of the frame holds the labels.

classes = df[0]
label_counts = classes.value_counts()
print(label_counts)

ham     4825
spam     747
Name: 0, dtype: int64


## 2. Pre-process the data

In [8]:
## Convert the class labels to binary values.
## LabelEncoder numbers the classes in sorted order, so 0 = ham and 1 = spam
## (the printed sample below confirms this mapping).

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

Y = encoder.fit_transform(classes)
# Print the first ten encoded values next to the original labels as a sanity check.
print(Y[:10])
print(classes[:10])

[0 0 1 0 0 1 0 0 1 1]
0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object


In [11]:
## Keep the SMS message bodies (column 1) in their own Series
## and display the first ten entries.

text_messages = df[1]
text_messages.head(10)

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object

In [14]:
## Replace e-mail addresses, URLs, currency symbols, phone numbers and plain
## numbers inside each message with fixed placeholder words.
##
## Notes on the patterns:
##  * regex=True is passed explicitly: pandas >= 2.0 defaults to regex=False,
##    which would treat every pattern below as a literal string.
##  * Patterns are NOT anchored with ^...$, so they match tokens embedded in a
##    longer message instead of only messages that consist of a single token.
##  * Order matters: e-mails, URLs and phone numbers are replaced before the
##    generic number rule so their digits are not rewritten first.

## replace email addresses with emailaddr
processed = text_messages.str.replace(
    r"\b[\w.+-]+@[\w-]+(?:\.[\w-]+)*\.[a-zA-Z]{2,}\b", "emailaddr", regex=True
)

## replace urls (http://, https:// or www. prefixed) with webaddrs
processed = processed.str.replace(
    r"(?:https?://|www\.)[a-zA-Z0-9\-.]+\.[a-zA-Z]{2,}(?:/\S*)?", "webaddrs", regex=True
)

## replace currency symbols with moneysymb
## (inside a character class `$` is a literal, not the end-of-string anchor;
##  extend the class if other currency symbols occur in the data)
processed = processed.str.replace(r"[$£]", "moneysymb", regex=True)

## replace phone numbers with phonenumbr
## (3-5 digit prefix, optional space/hyphen, 6-8 further digits)
processed = processed.str.replace(r"\b\d{3,5}[\s-]?\d{6,8}\b", "phonenumbr", regex=True)

## replace remaining numbers (integers or decimals) with numbr
processed = processed.str.replace(r"\d+(?:\.\d+)?", "numbr", regex=True)



In [15]:
## Remove punctuation: every character that is neither a word character nor
## whitespace becomes a space. regex=True is explicit because pandas >= 2.0
## defaults to literal (non-regex) replacement.
processed = processed.str.replace(r"[^\w\s]", " ", regex=True)

## Collapse runs of whitespace into a single space
processed = processed.str.replace(r"\s+", " ", regex=True)

## Remove leading and trailing whitespace
processed = processed.str.strip()


In [19]:
## Convert every message to lower case
processed = processed.str.lower()

In [None]:
### To be continued...