In [0]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

Importing the Dataset


In [11]:
#importing the Data Set
df = pd.read_table('/content/SMSSpamCollection', header=None)
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       5572 non-null   object
 1   1       5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [0]:
#converting label column to numerical
y=df[0]
raw_text=df[1]
le = LabelEncoder()
y_enc = le.fit_transform(y)

In [13]:
y_enc 

array([0, 0, 1, ..., 0, 0, 0])

#Text Pre-Processing


*  Replace URLs with 'httpaddr'
*   Replace email addresses with 'emailaddr'
*   Replace money symbols with 'moneysymb'
*   Replace phone numbers with 'phonenumbr'
*   Replace numbers with 'numbr'







In [14]:
print(raw_text)
#1 Replace email addresses with 'emailaddr'
processed = raw_text.str.replace(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b','emailaddr')
print(processed[0])
#2 Replace URLs with 'httpaddr'
processed = processed.str.replace(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)','httpaddr')
print(processed[0])
#3 Replace money symbols with 'moneysymb'
processed = processed.str.replace(r'£|\$', 'moneysymb')
print(processed[0])
#4 Replace phone numbers with 'phonenumbr'    
processed = processed.str.replace(r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b','phonenumbr')
print(processed[0])
#5 Replace numbers with 'numbr'    
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')
print(processed[0])

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: 1, Length: 5572, dtype: object
Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got am

Removing Punctuations ,white spaces and line breaks

In [15]:
#1 removing white spaces
processed = processed.str.replace(r'[^\w\d\s]', ' ')
print(processed[0])
#2 removing  line spaces
processed = processed.str.replace(r'\s+', ' ')
print(processed[0])
#3 removing punctuations
processed = processed.str.replace(r'^\s+|\s+?$', '')
print(processed[0])

Go until jurong point  crazy   Available only in bugis n great world la e buffet    Cine there got amore wat   
Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amore wat 
Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amore wat


converting all words to similar cases

In [16]:
processed = processed.str.lower()
print(processed[0])


go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat


Removing stop words

In [18]:
!pip install -q wordcloud
import wordcloud

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger') 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [0]:
#Removing stop words
stop_words = nltk.corpus.stopwords.words('english')
#stop_words

In [26]:
# converting it to set to iterate faster and removing stop words
processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in set(stop_words)))
processed[0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

STEMMING

In [28]:
#stemming........? 
#"distribute", "distributing", "distributor" or "distribution". We can replace these four words with just "distribut"
# available, availabilityetc to avail
porter = nltk.PorterStemmer()
processed = processed.apply(lambda x: ' '.join(porter.stem(term) for term in x.split()))
processed[0]


'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

3. Feature Engineering

In [41]:
#Tokenization to create bag of words
vectorizer = TfidfVectorizer(ngram_range=(1,3))
X_ngrams = vectorizer.fit_transform(processed)
vectorizer.vocabulary_

{'go': 20275,
 'jurong': 28072,
 'point': 45594,
 'crazi': 10662,
 'avail': 3135,
 'bugi': 6166,
 'great': 22009,
 'world': 65859,
 'la': 29098,
 'buffet': 6158,
 'cine': 8518,
 'got': 21515,
 'amor': 1624,
 'wat': 63503,
 'go jurong': 20512,
 'jurong point': 28073,
 'point crazi': 45599,
 'crazi avail': 10665,
 'avail bugi': 3138,
 'bugi great': 6169,
 'great world': 22146,
 'world la': 65882,
 'la buffet': 29099,
 'buffet cine': 6159,
 'cine got': 8529,
 'got amor': 21520,
 'amor wat': 1625,
 'go jurong point': 20513,
 'jurong point crazi': 28074,
 'point crazi avail': 45600,
 'crazi avail bugi': 10666,
 'avail bugi great': 3139,
 'bugi great world': 6170,
 'great world la': 22147,
 'world la buffet': 65883,
 'la buffet cine': 29100,
 'buffet cine got': 6160,
 'cine got amor': 8530,
 'got amor wat': 21521,
 'ok': 41896,
 'lar': 29266,
 'joke': 27859,
 'wif': 64825,
 'oni': 42643,
 'ok lar': 42016,
 'lar joke': 29287,
 'joke wif': 27878,
 'wif oni': 64856,
 'ok lar joke': 42019,
 'lar

In [43]:
X_ngrams.shape

(5572, 67491)

Training Model

In [61]:
from sklearn.svm import SVC
from sklearn import svm
from sklearn.metrics import accuracy_score 
import sklearn.metrics as metrics 
X_train, X_test, y_train, y_test = train_test_split(X_ngrams,y_enc,test_size=0.2,random_state=42,stratify=y_enc)
clf = svm.SVC(kernel='linear', C=1, gamma=1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
metrics.f1_score(y_test, y_pred)

0.9018181818181819

In [62]:
pd.DataFrame(
    metrics.confusion_matrix(y_test, y_pred),
    index=[['actual', 'actual'], ['spam', 'ham']],
    columns=[['predicted', 'predicted'], ['spam', 'ham']]
)


Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,spam,ham
actual,spam,964,2
actual,ham,25,124
