In [None]:
import numpy as np
import pandas as pd
import multiprocessing
import seaborn as sns
import email
import matplotlib.pyplot as plt

In [None]:
df = pd.read_csv("archive (6)/emails.csv")


In [None]:
df.head()

In [None]:
df.shape

In [None]:
print(df.loc[1]['message'])


In [None]:
# Parse a single raw message so its headers can be inspected interactively.
from email import message_from_string
message = df.loc[1]['message']
a = message_from_string(message)

# (header name, value) pairs of the parsed message.
a.items()


In [None]:
a.get('Date')


In [None]:
a.get_payload()


In [None]:
def get_field(field, messages):
    """Return the value of header `field` for every raw message in `messages`.

    Each message string is parsed with ``email.message_from_string``; the
    result holds ``None`` wherever the header is absent.
    """
    values = []
    for raw in messages:
        parsed = email.message_from_string(raw)
        values.append(parsed.get(field))
    return values


In [None]:
# Copy the headers of interest into their own columns (column name -> header name).
header_columns = {
    'date': "Date",
    'subject': "Subject",
    'X-Folder': "X-Folder",
    'X-From': "X-From",
    'X-To': "X-To",
}
for column, header in header_columns.items():
    df[column] = get_field(header, df['message'])
df.head(3)

In [None]:
# Payload of every message, stored as `body`; each raw message is parsed again here.
df['body'] = [email.message_from_string(msg).get_payload() for msg in df['message']]
df.head(3)


In [None]:
df['file'][:10]

In [None]:
# The text before the first "/" of each `file` path is stored as `employee`.
employees = []
for file_path in df['file']:
    employees.append(file_path.partition("/")[0])
df['employee'] = employees
df.head(3)


In [None]:
# The 20 most frequent X-Folder values and their counts, as a two-column frame.
unique_emails = df['X-Folder'].value_counts().head(20).reset_index()
unique_emails.columns = ['folder_name', 'count']


In [None]:
from dateutil import parser
def change_type(dates):
    """Re-format every date string in `dates` as ``DD-MM-YYYY HH:MM:SS``.

    Each entry is parsed with ``dateutil.parser.parse``; the result is a list
    of strings, not datetime objects.
    """
    formatted = []
    for raw_date in dates:
        parsed = parser.parse(raw_date)
        formatted.append(parsed.strftime("%d-%m-%Y %H:%M:%S"))
    return formatted


df['date'] = change_type(df['date'])
df.head(2)

In [None]:
def preprocess_folder(folders):
    """Reduce each folder path to its lower-cased last backslash-separated part.

    Entries that are ``None`` or the empty string become ``np.nan``.
    """
    cleaned = []
    for folder in folders:
        if folder is None or folder == "":
            cleaned.append(np.nan)
            continue
        cleaned.append(folder.split("\\")[-1].lower())
    return cleaned

df['X-Folder'] = preprocess_folder(df['X-Folder'])
df.head(2)

In [None]:
print("Unique Foldes: ", len(df['X-Folder'].unique()))
df['X-Folder'].unique()[0:20]

In [None]:
def replace_empty_with_nan(subject):
    """Return `subject` as a list with every empty string replaced by ``np.nan``.

    All other values, including ``None``, are passed through unchanged.
    """
    result = []
    for val in subject:
        result.append(np.nan if val == "" else val)
    return result


In [None]:
df['subject'] = replace_empty_with_nan(df['subject'])
df.head(2)


In [None]:
df.isnull().sum()


In [None]:
df.dropna(axis=0, inplace=True)


In [None]:
df.isnull().sum(), df.shape


In [None]:
df.head(3)


In [None]:
cols_to_drop = ['file','message','date','X-From','X-To']


In [None]:
df.drop(cols_to_drop, axis=1, inplace=True)


In [None]:
df.head()


In [None]:
import matplotlib.pyplot as plt
import re
import string
import time
pd.set_option('display.max_rows', 50)
import nltk
from nltk.corpus import stopwords
stop = stopwords.words('english')
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate

from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

In [None]:
def remove_folders(emails, n):
    """Keep only the rows whose ``X-Folder`` value occurs more than `n` times.

    Parameters
    ----------
    emails : pandas.DataFrame
        Frame with an ``X-Folder`` column.
    n : int
        Folders with `n` or fewer rows are dropped.

    Returns
    -------
    pandas.DataFrame
        A copy of the retained rows of `emails`.
    """
    # Work on the argument rather than the notebook-level `df`, so the function
    # filters whatever frame it is handed.
    email_count = emails['X-Folder'].value_counts()
    folders_to_keep = email_count[email_count > n].index
    # Return a copy so that adding columns to the result does not write into a
    # slice of the original frame.
    return emails[emails['X-Folder'].isin(folders_to_keep)].copy()


In [None]:
n = 150
df = remove_folders(df, n)

In [None]:
df['text'] = df['subject'] + " " + df['body']

In [None]:
df.drop(['subject','body'], axis=1, inplace=True)


In [None]:
def preprocess(x):
    """Normalise one text string for bag-of-words style processing.

    Lower-cases `x`, replaces newlines and every ASCII punctuation character
    with a space, and collapses runs of whitespace into a single space.
    Leading or trailing whitespace is collapsed but not stripped.
    """
    x = x.lower()
    x = re.sub(r'\n+', ' ', x)
    # Escape the punctuation before placing it in a character class: unescaped,
    # the `\]` inside string.punctuation is read as an escaped `]`, so the
    # backslash itself was never matched.
    x = re.sub("[" + re.escape(string.punctuation) + "]", " ", x)
    x = re.sub(r'\s+', ' ', x)

    return x

In [None]:
start = time.time()
df.loc[:,'text'] = df.loc[:, 'text'].map(preprocess)

# remove stopwords
# `stop` is a list, so every membership test scans it; a set keeps the same
# result with constant-time lookups.
stop_set = set(stop)
df.loc[:, 'text'] = df.loc[:, 'text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_set]))
end = time.time()
print("Execution time (sec): ",(end - start))

In [None]:
start_time = time.time()

# Folder -> email count for the 20 folders at positions 50..69 when folders are
# ordered by ascending email count.
email_count_dict = dict(df['X-Folder'].value_counts().sort_values()[50:70])
# Rows belonging to those 20 folders.
# NOTE(review): `selected_emails` is not used again in the visible notebook; the
# cell that writes preprocessed.csv saves `df`, while later cells hard-code 20
# classes. Confirm which frame was meant to be saved.
selected_emails = df[df['X-Folder'].isin(email_count_dict.keys())]
end_time = time.time()
execution_time = end_time - start_time
print("Execution time (sec): ", execution_time)

In [None]:
df.to_csv('preprocessed.csv', index=False)

In [None]:
data = pd.read_csv("preprocessed.csv")


In [None]:
data['X-Folder'].value_counts()


In [None]:
def label_encoder(data):
    """Encode the ``X-Folder`` column of `data` as integer class labels.

    A fresh ``LabelEncoder`` is fitted on every call and only the encoded
    array is returned, so the fitted encoder is not available afterwards.
    """
    return LabelEncoder().fit_transform(data['X-Folder'])

In [None]:
cols_to_drop = ['employee']

data.drop(cols_to_drop, axis=1, inplace=True)


In [None]:
data.head()

In [None]:
# Integer class labels derived from X-Folder.
y = label_encoder(data)
input_data = data['text']
# Replace missing texts with empty strings before vectorising.
input_data = input_data.fillna('')

In [None]:
start = time.time()
# Bag-of-words counts: terms must appear in at least 5 documents, and at most
# 5000 features are kept.
vectorizer = CountVectorizer(min_df=5, max_features=5000)
X = vectorizer.fit_transform(input_data)
end = time.time()
print("Execution time (sec): ",(end - start))

In [None]:
start = time.time()
# Rebind X to the array returned by toarray(); later cells see this converted X.
X = X.toarray()
print("X.shape: ",X.shape)
end = time.time()
print("Execution time (sec): ",(end - start))

In [None]:
# Result tables, one row per algorithm, with a blank 'BoW' column as placeholder.
algorithm_names = ['Gaussian NB', 'Multinomial NB', 'Decision Tree', 'SVM']

f1_data = {'Algorithm': list(algorithm_names), 'BoW': ''}
jaccard_data = {'Algorithm': list(algorithm_names), 'BoW': ''}
acc_data = {'Algorithm': list(algorithm_names), 'BoW': ''}

f1_df = pd.DataFrame(f1_data)
jacc_df = pd.DataFrame(jaccard_data)
acc_df = pd.DataFrame(acc_data)
acc_df.head()

In [None]:
models = [GaussianNB(), MultinomialNB(), DecisionTreeClassifier(), LinearSVC()]

names = ["Gaussian NB", "Multinomial NB", "Decision Tree", "SVM"]

# The same two metrics are used for every model, so the dict is built once.
scoring = {
    'acc': 'accuracy',
    'f1_mac': 'f1_macro',
}

# X may already be a dense ndarray (which has no .toarray()); convert only when
# it is still a sparse matrix.
X_dense = X.toarray() if hasattr(X, "toarray") else X

acc_scores = []
f1_scores = []
exec_times = []

for model, name in zip(models, names):
    print(name)
    start = time.time()
    try:
        scores = cross_validate(model, X_dense, y, cv=10, n_jobs=4, scoring=scoring)
    except TypeError as e:
        print("Error:", e)
        # Record placeholders so the lists stay aligned with the four rows of
        # the result frames; otherwise the column assignments below would fail
        # with a length mismatch.
        acc_scores.append(np.nan)
        f1_scores.append(np.nan)
        exec_times.append(np.nan)
        continue

    training_time = (time.time() - start)
    print("accuracy: ", scores['test_acc'].mean())
    print("f1_score: ", scores['test_f1_mac'].mean())
    print("time (sec): ", training_time)
    print("\n")

    acc_scores.append(scores['test_acc'].mean())
    f1_scores.append(scores['test_f1_mac'].mean())
    exec_times.append(training_time)

acc_df['BoW'] = acc_scores
f1_df['BoW'] = f1_scores
acc_df['time'] = exec_times


In [None]:
acc_df.to_csv("accuracy.csv", index=False)
f1_df.to_csv("f1_score.csv", index=False)


In [None]:
start = time.time()
# Same vectoriser settings as before, but restricted to bigrams (ngram_range=(2,2)).
# NOTE(review): this overwrites `vectorizer` and `X`; no later cell in view
# evaluates a model on the bigram features — confirm whether that step is missing.
vectorizer = CountVectorizer(min_df=5, max_features=5000, ngram_range=(2,2))
X = vectorizer.fit_transform(input_data)

X = X.toarray()
print("X.shape: ",X.shape)

end = time.time()
print("Execution time (sec): ",(end - start))

In [None]:
# NOTE(review): the next cell imports gensim, and without `-y` pip waits for a
# confirmation prompt — confirm this uninstall is intentional.
!pip uninstall gensim


In [None]:
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Flatten, LSTM, Embedding, Input, Conv1D, MaxPooling1D
from keras.layers import Concatenate, Input, Dense
from keras.layers import Dropout
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.optimizers import Adam, SGD, RMSprop
from keras.callbacks import EarlyStopping

from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
import time

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

In [None]:
# Word2vec-format binary file, given relative to the working directory.
filename = "archive (7)/GoogleNews-vectors-negative300.bin"

# Time the load with a monotonic clock.
start_time = time.monotonic()
google_embeddings = KeyedVectors.load_word2vec_format(filename, binary=True)
end_time = time.monotonic()

load_time = end_time - start_time
print(f"Load time (seconds): {load_time:.2f}")

In [None]:
# Input GloVe text file and the word2vec-format file written by the conversion.
glove_file = "archive (7)/glove.6B.300d.txt"
glove_word2vec_file = "glove.6B.300d.txt.word2vec"

# Writes `glove_word2vec_file` to the working directory on every run.
glove2word2vec(glove_file, glove_word2vec_file)

In [None]:
start = time.time()

# Load the converted GloVe vectors (text format, hence binary=False).
# NOTE(review): `glove_embeddings` is not referenced by any later cell in view.
glove_embeddings = KeyedVectors.load_word2vec_format(glove_word2vec_file, binary=False)

print("Load time (seconds): ", (time.time() - start))

In [None]:
df = pd.read_csv("preprocessed.csv")
df.head()

In [None]:
def label_encoder(df):
    """Encode the ``X-Folder`` column of `df` as integer class labels.

    A new ``LabelEncoder`` is fitted on each call; only the encoded array is
    returned.
    """
    return LabelEncoder().fit_transform(df['X-Folder'])

In [None]:
# Integer class labels derived from X-Folder.
y = label_encoder(df)
# Texts that are blank in the CSV are read back as NaN (a float), which breaks
# word_tokenize(), str.split() and the Keras Tokenizer further down; replace
# them with empty strings, as the bag-of-words section does for `input_data`.
corpus = df['text'].fillna('')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.1, random_state=0)

In [None]:
# Flat list of every token produced by word_tokenize over the whole corpus.
all_words = [word for sent in corpus for word in word_tokenize(sent)]

In [None]:
unique_words = set(all_words)
print("Unique words: ",len(unique_words))

In [None]:
# The tokenizer vocabulary is fitted on the full corpus (train and test rows).
t = Tokenizer()
t.fit_on_texts(corpus)
# Convert each split into lists of integer word indices.
train_encoded_docs = t.texts_to_sequences(X_train)
test_encoded_docs = t.texts_to_sequences(X_test)

In [None]:
# Number of word_tokenize tokens in a document.
word_count = lambda doc: len(word_tokenize(doc))
longest_doc = max(corpus, key=word_count)
length_longest_doc = len(word_tokenize(longest_doc))
# The measured maximum is overridden straight away: 500 is the length passed
# to pad_sequences in the next cell.
length_longest_doc = 500

In [None]:
train_padded_docs = pad_sequences(train_encoded_docs, length_longest_doc, padding='post')
test_padded_docs = pad_sequences(test_encoded_docs, length_longest_doc, padding='post')

In [None]:
# One-hot encode the integer labels into 20 columns.
# NOTE(review): 20 is hard-coded; presumably it has to equal the number of
# distinct X-Folder values in preprocessed.csv — confirm.
Y_train = to_categorical(y_train, 20)
Y_test = to_categorical(y_test, 20)

In [None]:
# Whitespace-tokenised corpus: one list of words per document.
docs = [doc.split() for doc in corpus]

In [None]:
start = time.time()
# Train 300-dimensional word vectors on the email corpus; min_count=1 keeps
# every token that occurs at least once.
# NOTE(review): the name `model` is reused later for the Keras model.
model = Word2Vec(docs, vector_size=300, window=5, min_count=1, workers=4, sg=0)
print(model)
model.save("email_embeddings.bin")

print("Training time (seconds): ", (time.time() - start))

In [None]:
start = time.time()

filename = "email_embeddings.bin"

email_embeddings = Word2Vec.load(filename)

print("Load time (seconds): ", (time.time() - start))

In [None]:
# The embedding matrix and Embedding layers are indexed by the Keras tokenizer's
# word indices, which run from 1 to len(t.word_index) (0 is the padding value),
# so the table needs len(t.word_index) + 1 rows. Sizing it from the Word2Vec
# vocabulary instead can leave the highest index out of range.
vocab_size = len(t.word_index) + 1

In [None]:
# Row i holds the 300-d vector of the word with tokenizer index i; rows for
# words found in neither embedding stay all-zero.
embedding_matrix = np.zeros((vocab_size, 300))
# Number of tokenizer words found in neither embedding.
count = 0

for word, i in t.word_index.items():
    # `key_to_index` is a dict, so the lookup is constant time; `index_to_key`
    # is a list and was scanned linearly for every word.
    if word in google_embeddings.key_to_index:
        embedding_matrix[i] = google_embeddings[word]
    # Fall back to the vectors trained on the email corpus.
    elif word in email_embeddings.wv.key_to_index:
        embedding_matrix[i] = email_embeddings.wv.get_vector(word)
    else:
        count += 1

In [None]:
embedding_matrix.shape


In [None]:
print("Number of words not present in email_embeddings: ", count)


In [None]:
from tensorflow.keras.layers import Concatenate


In [None]:
def _conv_channel(length_longest_doc, vocab_size, embedding_size, kernel_size):
    """Build one input -> frozen embedding -> Conv1D -> dropout -> pool -> flatten channel."""
    inputs = Input(shape=(length_longest_doc,))
    # The embedding weights come from the notebook-level `embedding_matrix` and are not trained.
    embedded = Embedding(vocab_size, embedding_size, weights=[embedding_matrix], trainable=False)(inputs)
    conv = Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(embedded)
    drop = Dropout(0.5)(conv)
    pool = MaxPooling1D(pool_size=2)(drop)
    return inputs, Flatten()(pool)


def define_model(length_longest_doc, vocab_size, embedding_size):
    """Build and compile a three-channel 1-D CNN classifier.

    Each channel has its own input of length `length_longest_doc`, a frozen
    embedding initialised from the notebook-level `embedding_matrix`, and a
    Conv1D layer with kernel size 4, 6 and 8 respectively. The flattened
    channels are concatenated and fed to a 10-unit ReLU layer followed by a
    20-way softmax.

    Parameters
    ----------
    length_longest_doc : int
        Length of every padded input sequence.
    vocab_size : int
        Number of rows of the embedding table.
    embedding_size : int
        Dimensionality of the embedding vectors.

    Returns
    -------
    The compiled Keras ``Model`` taking a list of three inputs. As side
    effects the model summary is printed and a diagram is written to
    ``model.png``.
    """
    channels = [
        _conv_channel(length_longest_doc, vocab_size, embedding_size, kernel_size)
        for kernel_size in (4, 6, 8)
    ]
    inputs = [channel_input for channel_input, _ in channels]
    flats = [channel_flat for _, channel_flat in channels]

    # merge -- `layers` is never imported in this notebook, so use the
    # Concatenate layer class that is.
    merged = Concatenate()(flats)

    # interpretation
    dense1 = Dense(10, activation='relu')(merged)
    outputs = Dense(20, activation='softmax')(dense1)

    model = Model(inputs=inputs, outputs=outputs)

    # compile the model
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # print model summary
    model.summary()
    plot_model(model, show_shapes=True, to_file='model.png')
    return model

In [None]:
def _lstm_channel(length_longest_doc, vocab_size, embedding_size):
    """Build one input -> frozen embedding -> LSTM(256) -> LSTM(128) -> flatten channel."""
    inputs = Input(shape=(length_longest_doc,))
    # The embedding weights come from the notebook-level `embedding_matrix` and are not trained.
    embedded = Embedding(vocab_size, embedding_size, weights=[embedding_matrix], trainable=False)(inputs)
    lstm_a = LSTM(256, dropout=0.5, return_sequences=True)(embedded)
    lstm_b = LSTM(128, dropout=0.5)(lstm_a)
    return inputs, Flatten()(lstm_b)


def define_model_b(length_longest_doc, vocab_size, embedding_size):
    """Build and compile a two-channel stacked-LSTM classifier.

    Both channels have the same structure: an input of length
    `length_longest_doc`, a frozen embedding initialised from the
    notebook-level `embedding_matrix`, and two stacked LSTM layers (256 then
    128 units, dropout 0.5). The channel outputs are concatenated and fed
    directly to a 20-way softmax.

    Parameters
    ----------
    length_longest_doc : int
        Length of every padded input sequence.
    vocab_size : int
        Number of rows of the embedding table.
    embedding_size : int
        Dimensionality of the embedding vectors.

    Returns
    -------
    The compiled Keras ``Model`` taking a list of two inputs. The model
    summary is printed as a side effect.
    """
    inputs1, flat1 = _lstm_channel(length_longest_doc, vocab_size, embedding_size)
    inputs2, flat2 = _lstm_channel(length_longest_doc, vocab_size, embedding_size)

    # merge -- use the Concatenate layer imported with the other Keras layers,
    # so this function does not depend on a helper imported in a later cell.
    merge = Concatenate()([flat1, flat2])

    # interpretation
    outputs = Dense(20, activation='softmax')(merge)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)

    # compile the model
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # print model summary
    model.summary()
    return model

In [None]:
from tensorflow.keras.layers import concatenate


In [None]:
start = time.time()

# Two-channel LSTM model with 300-dimensional embeddings.
model = define_model_b(length_longest_doc, vocab_size, 300)

# Stop once val_loss has failed to improve by at least 0.001 for 4 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4, min_delta=0.001)

# Both input channels receive the same padded sequences; 10% of the training
# data is held out for validation.
history = model.fit([train_padded_docs, train_padded_docs], np.array(Y_train), epochs=50, batch_size=16, validation_split=0.1, callbacks=[es])

model.save("model1.h5")
print("Training time (minutes): ", (round((time.time() - start)/60, 2)))

In [None]:
# Evaluate the model trained in the previous cell. `model2` is never defined in
# this notebook, and the trained model takes two inputs (not three), so the
# same padded sequences are passed once per input channel, as in fit().
# NOTE(review): if a separate three-channel model built by define_model() was
# intended here, it has to be created and trained first.
model2_train_eval = model.evaluate([train_padded_docs, train_padded_docs], np.array(Y_train), verbose=0)
model2_test_eval = model.evaluate([test_padded_docs, test_padded_docs], np.array(Y_test), verbose=0)

# evaluate() returns [loss, accuracy], given the metrics the model was compiled with.
print("Train Accuracy: {:0.3f}    Loss: {:0.3f}".format(model2_train_eval[1], model2_train_eval[0]))
print("Test Accuracy:  {:0.3f}    Loss: {:0.3f}".format(model2_test_eval[1], model2_test_eval[0]))