In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/email-classification-nlp/SMS_train.csv
/kaggle/input/email-classification-nlp/SMS_test.csv


In [2]:
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('omw-1.4')
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import matplotlib.pyplot as plt

# from wordcloud import WordCloud, STOPWORDS

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


In [3]:
train = pd.read_csv("../input/email-classification-nlp/SMS_train.csv", encoding='cp1252')
train

Unnamed: 0,S. No.,Message_body,Label
0,1,Rofl. Its true to its name,Non-Spam
1,2,The guy did some bitching but I acted like i'd...,Non-Spam
2,3,"Pity, * was in mood for that. So...any other s...",Non-Spam
3,4,Will ü b going to esplanade fr home?,Non-Spam
4,5,This is the 2nd time we have tried 2 contact u...,Spam
...,...,...,...
952,953,hows my favourite person today? r u workin har...,Non-Spam
953,954,How much you got for cleaning,Non-Spam
954,955,Sorry da. I gone mad so many pending works wha...,Non-Spam
955,956,Wat time ü finish?,Non-Spam


In [4]:
test = pd.read_csv("../input/email-classification-nlp/SMS_test.csv", encoding='cp1252')
test

Unnamed: 0,S. No.,Message_body,Label
0,1,"UpgrdCentre Orange customer, you may now claim...",Spam
1,2,"Loan for any purpose £500 - £75,000. Homeowner...",Spam
2,3,Congrats! Nokia 3650 video camera phone is you...,Spam
3,4,URGENT! Your Mobile number has been awarded wi...,Spam
4,5,Someone has contacted our dating service and e...,Spam
...,...,...,...
120,121,7 wonders in My WORLD 7th You 6th Ur style 5th...,Non-Spam
121,122,Try to do something dear. You read something f...,Non-Spam
122,123,Sun ah... Thk mayb can if dun have anythin on....,Non-Spam
123,124,"SYMPTOMS when U are in love: ""1.U like listeni...",Non-Spam


In [5]:
# No missing Data
print(train.isnull().sum())
print(test.isnull().sum())

S. No.          0
Message_body    0
Label           0
dtype: int64
S. No.          0
Message_body    0
Label           0
dtype: int64


In [6]:
# Non-Spam = 0 & Spam = 1
train["Label"] = train['Label'].replace({'Non-Spam':0,'Spam':1})
test["Label"] = test['Label'].replace({'Non-Spam':0,'Spam':1})

In [7]:
print(train["Label"].value_counts())
print(test["Label"].value_counts())

0    835
1    122
Name: Label, dtype: int64
1    76
0    49
Name: Label, dtype: int64


# Preprocessing

In [8]:
train_mails = train["Message_body"]
test_mails = test["Message_body"]

In [9]:
# Make Corpus
Lemmatizer = WordNetLemmatizer()

train_corpus = []

for mail in train_mails:
    no_punc = re.sub('[^a-zA-Z]',' ', mail)
    no_punc = no_punc.lower()
    words = nltk.word_tokenize(no_punc)
    lemma = [Lemmatizer.lemmatize(word) for word in words if not word in stopwords.words('english')]
    lemma = ' '.join(lemma)
    train_corpus.append(lemma)
    
test_corpus = []

for mail in test_mails:
    no_punc = re.sub('[^a-zA-Z]',' ', mail)
    no_punc = no_punc.lower()
    words = nltk.word_tokenize(no_punc)
    lemma = [Lemmatizer.lemmatize(word) for word in words if not word in stopwords.words('english')]
    lemma = ' '.join(lemma)
    test_corpus.append(lemma)

In [10]:
# Merge for the corpuses for BOW
merged_corpus = train_corpus + test_corpus
merged_corpus

['rofl true name',
 'guy bitching acted like interested buying something else next week gave u free',
 'pity mood suggestion',
 'b going esplanade fr home',
 'nd time tried contact u u pound prize claim easy call p per minute bt national rate',
 'reminder get pound free call credit detail great offer pls reply text valid name house postcode',
 'huh lei',
 'wait til least wednesday see get',
 'ard like dat lor',
 'ok lor sony ericsson salesman ask shuhui say quite gd use considering',
 'get dump heap mom decided come lowes boring',
 'anything lor juz u lor',
 'next lt gt hour imma flip shit',
 'sorry call later',
 'meant calculation lt gt unit lt gt school really expensive started practicing accent important decided year dental school nmde exam',
 'yes u texted pshew missing much',
 'yeh indian nice tho kane bit shud go drink sometime soon mite hav go da work laugh soon love pete x x',
 'well keep mind got enough gas one round trip barring sudden influx cash',
 'hahaha use brain dear',


In [11]:
# Make the Bag
tokenizer = Tokenizer()
tokenizer.fit_on_texts(merged_corpus)
word_index = tokenizer.word_index
print(f"Total number of unique words present in dataset : {len(word_index)}")

Total number of unique words present in dataset : 2887


In [12]:
# Pad the data for train and test

train_token = tokenizer.texts_to_sequences(train_corpus)
test_token = tokenizer.texts_to_sequences(test_corpus)

max_length = max(max(len(x) for x in test_token), max(len(x) for x in train_token))

train_pad = pad_sequences(train_token,maxlen = max_length, truncating = 'post',padding = "post")
test_pad = pad_sequences(test_token,maxlen = max_length,truncating = 'post',padding = "post")

In [13]:
# Train and Test Data

x_train, x_test, y_train, y_test = train_pad, test_pad, train["Label"], test["Label"]

print(f" Length of train set : {len(y_train)}")
print(f" Length of test set : {len(y_test)}")

 Length of train set : 957
 Length of test set : 125


# KNN

In [14]:
clf_knn = KNeighborsClassifier(n_neighbors=3)
clf_knn.fit(x_train, y_train)

y_pred = clf_knn.predict(x_test)
train_pred = clf_knn.predict(x_train)

print(f"Test accuracy is {accuracy_score(y_test, y_pred)}")
print(f"Train accuracy is {accuracy_score(y_train, train_pred)}")

Test accuracy is 0.568
Train accuracy is 0.9143155694879833


# Decision Tree

In [15]:
clf_tree = DecisionTreeClassifier()
clf_tree.fit(x_train, y_train)

y_pred = clf_tree.predict(x_test)
train_pred = clf_tree.predict(x_train)

print(f"Test accuracy of Decision tree is : {accuracy_score(y_test, y_pred)}")
print(f"Train accuracy of Decision tree is : {accuracy_score(y_train, train_pred)}")

Test accuracy of Decision tree is : 0.64
Train accuracy of Decision tree is : 1.0


# SVM

In [16]:
clf_svm = SVC()
clf_svm.fit(x_train, y_train)

y_pred = clf_svm.predict(x_test)
train_pred = clf_svm.predict(x_train)

print(f"Test accuracy is {accuracy_score(y_test, y_pred)}")
print(f"Train accuracy is {accuracy_score(y_train, train_pred)}")

Test accuracy is 0.464
Train accuracy is 0.9122257053291536


# End