In [2]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet')
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split
import keras
from keras.preprocessing import text
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM, SimpleRNN, LeakyReLU
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gunva\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gunva\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
cols = ['mail', 'label']
df = pd.read_csv('main_dataset.csv', encoding='latin-1')

In [4]:
df.head()

Unnamed: 0,mail,label,Unnamed: 2
0,Business terms and conditions to continue our ...,2,
1,I'm sorry that I couldn't get to your birthday...,1,
2,Your Amazon.in order #405 of 1 item has been d...,5,
3,We thought you'd like to know that we've dispa...,5,
4,"For the secure delivery of your order, it will...",5,


In [5]:
# Converting data into string because data can be in different datatypes like integer, float, object, string, etc
def convert_to_string(mail):
  return mail.astype(str)
df["mail"] = convert_to_string(df['mail'])

In [6]:
# Converting strings to lowercase
def lowerize(mail):
  return mail.str.lower()
df["mail"] = lowerize(df["mail"].str.lower())
df.head()

Unnamed: 0,mail,label,Unnamed: 2
0,business terms and conditions to continue our ...,2,
1,i'm sorry that i couldn't get to your birthday...,1,
2,your amazon.in order #405 of 1 item has been d...,5,
3,we thought you'd like to know that we've dispa...,5,
4,"for the secure delivery of your order, it will...",5,


In [7]:
# Removing name with symbol @
def remove_name(mail):
    symbol = "@"
    new_list = []
    for sentence in mail:
      new_sentence = ''
      for i in sentence.split():
        if symbol in i:
          new_sentence += ''
        else:
          new_sentence += ' ' + i
      new_list.append(new_sentence)
    return new_list
df['mail'] = remove_name(df['mail'])
df.head()

Unnamed: 0,mail,label,Unnamed: 2
0,business terms and conditions to continue our...,2,
1,i'm sorry that i couldn't get to your birthda...,1,
2,your amazon.in order #405 of 1 item has been ...,5,
3,we thought you'd like to know that we've disp...,5,
4,"for the secure delivery of your order, it wil...",5,


In [8]:
# Removing hashtags with the symbol #
def remove_topic(mail):
    symbol = "#"
    new_list = []
    for sentence in mail:
      new_sentence = ''
      for i in sentence.split():
        if symbol in i:
          new_sentence += ''
        else:
          new_sentence += ' ' + i
      new_list.append(new_sentence)
    return new_list
df['mail'] = remove_topic(df['mail'])
#for i in range(len(df['mail'])):
#  if '#' in df['mail'][i]:
#    print(i, df['mail'][i])
#    break
df.head()
#remove_topic([df['mail'][83]])

Unnamed: 0,mail,label,Unnamed: 2
0,business terms and conditions to continue our...,2,
1,i'm sorry that i couldn't get to your birthda...,1,
2,your amazon.in order of 1 item has been dispa...,5,
3,we thought you'd like to know that we've disp...,5,
4,"for the secure delivery of your order, it wil...",5,


In [9]:
# Removing urls
def remove_url(mail):
  new_list = []
  for sentence in mail:
    new_list.append(re.sub(r"http\S+", "", sentence))
  return new_list
df['mail'] = remove_url(df['mail'])
df.head()

Unnamed: 0,mail,label,Unnamed: 2
0,business terms and conditions to continue our...,2,
1,i'm sorry that i couldn't get to your birthda...,1,
2,your amazon.in order of 1 item has been dispa...,5,
3,we thought you'd like to know that we've disp...,5,
4,"for the secure delivery of your order, it wil...",5,


In [10]:
# Removing special symbols
def remove_symbols(mail):
    symbols = "!\"$%&()*+-./:;<=>?[\]^_`{|}~\n'"
    new_list = []
    for sentence in mail:
      for i in symbols:
        sentence = sentence.replace(i, '')
      new_list.append(sentence)
    return new_list
df['mail'] = remove_symbols(df['mail'])
df.head()

Unnamed: 0,mail,label,Unnamed: 2
0,business terms and conditions to continue our...,2,
1,im sorry that i couldnt get to your birthday ...,1,
2,your amazonin order of 1 item has been dispat...,5,
3,we thought youd like to know that weve dispat...,5,
4,"for the secure delivery of your order, it wil...",5,


In [11]:
# Removing empty mails
def remove_empty_mail(df):
  cnt = df.shape[0]
  for i in range(cnt):
    if df['mail'][i] == '':
      df = df.drop(i)
  return df
df = remove_empty_mail(df)
df.head()

Unnamed: 0,mail,label,Unnamed: 2
0,business terms and conditions to continue our...,2,
1,im sorry that i couldnt get to your birthday ...,1,
2,your amazonin order of 1 item has been dispat...,5,
3,we thought youd like to know that weve dispat...,5,
4,"for the secure delivery of your order, it wil...",5,


In [12]:
# Removing stopwords
stop_words = stopwords.words('english')  
def remove_stopwords(mail):
  new_list = []
  for sentence in mail:
    new_text = ""
    for word in sentence.split():
        if word not in stop_words:
            new_text = new_text + " " + word
    new_list.append(new_text.strip())
  return new_list
df['mail'] = remove_stopwords(df['mail'])
df.head()

Unnamed: 0,mail,label,Unnamed: 2
0,business terms conditions continue deal,2,
1,im sorry couldnt get birthday party,1,
2,amazonin order 1 item dispatched,5,
3,thought youd like know weve dispatched items o...,5,
4,"secure delivery order, delivered secure, tampe...",5,


In [13]:
# Removing emoticons
def remove_emoticon(mail):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    new_list = []
#    print(mail)
    for sentence in mail:
      new_list.append(regrex_pattern.sub(r'',str(sentence)))
    return new_list
df['mail'] = remove_emoticon(df['mail'])
df.head()

Unnamed: 0,mail,label,Unnamed: 2
0,business terms conditions continue deal,2,
1,im sorry couldnt get birthday party,1,
2,amazonin order 1 item dispatched,5,
3,thought youd like know weve dispatched items o...,5,
4,"secure delivery order, delivered secure, tampe...",5,


In [14]:
# lemmatization
def perform_lemmatization(mail):
  lemmatizer = WordNetLemmatizer()
  new_list = []
  for sentence in mail: 
    new_text = ""
    for w in sentence.split():
        new_text = new_text + " " + lemmatizer.lemmatize(w)
    new_list.append(new_text.strip())
  return new_list
df['mail'] = perform_lemmatization(df['mail'])
df.head()

Unnamed: 0,mail,label,Unnamed: 2
0,business term condition continue deal,2,
1,im sorry couldnt get birthday party,1,
2,amazonin order 1 item dispatched,5,
3,thought youd like know weve dispatched item or...,5,
4,"secure delivery order, delivered secure, tampe...",5,


## appropriate label for target

In [15]:
# Checking null values
df.isna().sum()

mail             0
label            0
Unnamed: 2    1394
dtype: int64

In [16]:
# Counting label values
df['label'].value_counts()

0    756
1    157
2    153
3    120
5    111
4     97
Name: label, dtype: int64

In [17]:
totalwords = df['mail'].str.split()

In [18]:
# Creating list of count of words in the mail
totalwords = df['mail'].str.split().str.len()

In [19]:
# Finding maximum mail length
max_word = max(totalwords)
max_word

138

## 4. Appropriately partition the set into Train, cv, and Test

In [20]:
# Splitting data into X and y
X = df['mail']
#y = df['label']

In [21]:
y = pd.get_dummies(df['label']).values
print('Shape of label tensor:', y.shape)

Shape of label tensor: (1394, 6)


In [22]:
# Splitting data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state = 24, test_size=0.2)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1115,) (1115, 6)
(279,) (279, 6)


## Use Keras Embedding word2Vec

In [23]:
# Generating vocabulary of words
documents = [_text.split() for _text in X_train] 

In [24]:
# Initializing word2vec model
w2v_model = gensim.models.word2vec.Word2Vec(vector_size=300, window=7, min_count=3, workers=8)

In [25]:
# Building vocabulary
w2v_model.build_vocab(documents)

## Process all the text and create a unique vocabulary set.

In [26]:
# Checking vocabulary size
words = w2v_model.wv.key_to_index.keys()
vocab_size = len(words)
print("Vocab size", vocab_size)

Vocab size 1392


In [27]:
# Training word2vec model
w2v_model.train(documents, total_examples=len(documents), epochs=32)

(417238, 604288)

In [28]:
# Initializing tokenizer and fitting mail
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)

Total words 4587


In [29]:
# Converting mail to sequences and padding it
x_train = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_word)
x_test = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=max_word)

In [30]:
# Reshape lebel data
#y_train = y_train.values.reshape(-1,1)
#y_test = y_test.reshape((-1,6))

In [31]:
# Checking shape of data
print("x_train", x_train.shape)
print("y_train", y_train.shape)
print("x_test", x_test.shape)
print("y_test", y_test.shape)

x_train (1115, 138)
y_train (1115, 6)
x_test (279, 138)
y_test (279, 6)


In [32]:
# Creating embedding matrix to use for words relation
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in tokenizer.word_index.items():
  if word in w2v_model.wv:
    embedding_matrix[i] = w2v_model.wv[word]
print(embedding_matrix.shape)

(4587, 300)


## Create the LSTM based model

In [33]:
# Callback to stop model training early
callbacks = [EarlyStopping(monitor='val_accuracy', patience=2)]

In [48]:
# Creating model architecture
model = Sequential()
model.add(Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=max_word))
model.add(Dropout(0.3))
model.add(LSTM(64))

model.add(Dropout(0.3)) #0.5 initial
model.add(Dense(128, activation='relu'))



model.add(Dropout(0.1))
model.add(Dense(6, activation='softmax'))

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 138, 300)          1376100   
                                                                 
 dropout_10 (Dropout)        (None, 138, 300)          0         
                                                                 
 lstm_2 (LSTM)               (None, 64)                93440     
                                                                 
 dropout_11 (Dropout)        (None, 64)                0         
                                                                 
 dense_8 (Dense)             (None, 128)               8320      
                                                                 
 dropout_12 (Dropout)        (None, 128)               0         
                                                                 
 dense_9 (Dense)             (None, 6)                

In [49]:
# Compiling model
model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])

In [50]:
# Training model
history = model.fit(x_train, y_train, batch_size=128, epochs=50, validation_split=0.2, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [52]:
model.save('emailclass.h5')

In [53]:
import pickle

In [54]:
with open('tokenizer.pickle','wb') as handle:
    pickle.dump(tokenizer,handle,protocol=pickle.HIGHEST_PROTOCOL)

In [55]:
# Checking model accuracy and loss
scoreR = model.evaluate(x_test, y_test)
print("ACCURACY:",scoreR[1])
print("LOSS:",scoreR[0])

ACCURACY: 0.856630802154541
LOSS: 0.7514393329620361
