In [1]:
from string import punctuation
from collections import Counter
from operator import itemgetter
from textstat.textstat import textstatistics,legacy_round
import re
import string
import numpy as np
import pandas as pd
import statistics

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer,SnowballStemmer
from nltk.corpus import stopwords
import re

import spacy

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# Module-level NLTK normalizer instances.
stemmer = SnowballStemmer('english')  # Snowball stemmer configured for English
lemmatizer = WordNetLemmatizer()      # WordNet-based lemmatizer

In [2]:
# Patterns are compiled once here instead of on every call to preprocess().
_HTML_TAG_RE = re.compile('<.*?>')   # shortest "<...>" span, i.e. one markup tag
_URL_RE = re.compile(r'http\S+')     # "http" followed by any run of non-whitespace
_DIGITS_RE = re.compile('[0-9]+')    # runs of ASCII digits
_WORD_RE = re.compile(r'\w+')        # word tokens (letters, digits, underscore)


def preprocess(sentence):
    """Normalise one raw email body into a space-separated string of word tokens.

    Processing order: lowercase, drop the literal ``{html}`` marker, strip
    ``<...>`` tags, strip ``http...`` URLs, strip digit runs, drop blank lines,
    then keep only ``\\w+`` tokens joined by single spaces.

    The tokens are returned as-is: no stemming or lemmatization is applied.
    (The previous implementation computed both and discarded the results, so
    that work has been removed without changing the returned text.)

    Args:
        sentence: The text to clean. Non-string values are converted with
            ``str()``; ``None`` and float NaN (missing values) yield ``""``
            rather than the literal tokens "none" / "nan".

    Returns:
        str: The cleaned, lowercase, space-joined tokens (possibly empty).
    """
    # Missing values: NaN is the only float that is not equal to itself.
    if sentence is None or (isinstance(sentence, float) and sentence != sentence):
        return ""

    text = str(sentence).lower().replace('{html}', "")
    text = _HTML_TAG_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _DIGITS_RE.sub('', text)
    # Drop whitespace-only lines while keeping the line endings of the rest.
    text = "".join(
        line for line in text.strip().splitlines(True)
        if line.strip("\r\n").strip()
    )
    # Equivalent to RegexpTokenizer(r'\w+').tokenize(text), which is a plain
    # findall when the pattern matches tokens rather than gaps.
    return " ".join(_WORD_RE.findall(text))


In [3]:
# A sender is kept only when its email count exceeds this threshold.
MIN_EMAILS_PER_AUTHOR = 2500

emails_df = pd.read_csv('kaggle_enron_email_cleaned.csv')

# Keep the rows of every sender for which at least one column has more than
# MIN_EMAILS_PER_AUTHOR non-null values. The explicit .copy() makes
# filtered_df an independent frame, so the column assignments and drops done
# on it in later cells do not act on a slice of emails_df.
filtered_df = (
    emails_df
    .groupby('sender')
    .filter(lambda g: g.count().gt(MIN_EMAILS_PER_AUTHOR).any())
    .copy()
)

grp_df = filtered_df.groupby('sender')
# Sender keys in group-iteration order.
filtered_author_list = [key for key, _ in grp_df]
# Per-sender non-null counts for every column (one Series per sender); uses
# the group already yielded by the iteration instead of a second get_group().
filtered_author_count = [group.count() for _, group in grp_df]

In [4]:
# Build the modelling frame in a single chained step instead of mutating
# filtered_df column by column:
#   content - the email body after preprocess()
#   author  - integer category code of the sender (the classification label)
# and drop every column that is no longer needed.
# Note: this cell consumes 'email_body' and 'sender', so re-running it on its
# own raises a KeyError; re-run the loading cell first.
filtered_df = (
    filtered_df
    .assign(
        content=filtered_df['email_body'].map(preprocess),
        author=filtered_df['sender'].astype('category').cat.codes,
    )
    .drop(columns=['email_body', 'sender', 'file', 'valid', 'Unnamed: 0', 'sender_email'])
)
filtered_df.head(10)

Unnamed: 0,content,author
3340,for your viewing pleasure,0
3341,i think we are going to stay in town and meet ...,0
3342,i didn t go either today is legs and lower abs...,0
3343,that s good news about the location i ll have ...,0
3344,effective date of your current rotation curren...,0
3345,it only takes about minutes to get to highland...,0
3346,i m ready we could try to do it over president...,0
3347,someone i know just put in a bid on priceline ...,0
3348,here is some info on steamboat i haven t check...,0
3349,michael is a happily married man he would neve...,0


In [5]:
# Remove every character listed in string.punctuation from the cleaned text.
# The character class is escaped and compiled once here rather than being
# rebuilt for each row inside the lambda.
punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))
filtered_df["content"] = filtered_df["content"].apply(lambda x: punctuation_re.sub('', x))

In [6]:
# Fixed seed so the stochastic estimators below (random forests, and the SVCs'
# probability calibration) give repeatable results across kernel restarts.
RANDOM_STATE = 42

# --- Pipelines on TF-IDF features (word 1- to 3-grams) ---------------------
mnb_tfidf = Pipeline([
    ("tfidf_vectorizer", TfidfVectorizer(min_df=5, ngram_range=(1, 3))),
    ('clf', MultinomialNB()),
])
# NOTE(review): unlike the other two TF-IDF pipelines, this vectorizer sets no
# min_df - confirm that keeping the full n-gram vocabulary here is intended.
rf_tfidf = Pipeline([
    ("tfidf_vectorizer", TfidfVectorizer(ngram_range=(1, 3))),
    ("random forest", RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)),
])
svc_tfidf = Pipeline([
    ("tfidf_vectorizer", TfidfVectorizer(min_df=5, ngram_range=(1, 3))),
    ("linear svc", SVC(kernel="linear", probability=True, random_state=RANDOM_STATE)),
])

# --- Pipelines on raw term counts (CountVectorizer defaults) ---------------
rf_cv = Pipeline([
    ('vect', CountVectorizer()),
    ("random forest", RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)),
])
mnb_cv = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])
svc_cv = Pipeline([
    ('vect', CountVectorizer()),
    ("linear svc", SVC(kernel="linear", probability=True, random_state=RANDOM_STATE)),
])

In [7]:
from sklearn.model_selection import cross_val_score
from tabulate import tabulate
from sklearn.model_selection import StratifiedKFold

# Five stratified folds, shared by every model so the scores are comparable.
skf = StratifiedKFold(n_splits=5)

# (display name, pipeline) pairs to evaluate on the TF-IDF features.
models = [
    ("random forest classifier", rf_tfidf),
    ("multinomial naive bayes", mnb_tfidf),
    ("support vector classifier", svc_tfidf),
]

# One (name, mean cross-validated score) row per model.
scores = []
for model_name, candidate in models:
    fold_scores = cross_val_score(candidate, filtered_df.content, filtered_df.author, cv=skf)
    scores.append((model_name, fold_scores.mean()))

print(tabulate(scores, floatfmt=".4f", headers=("model", "score")))

model                       score
------------------------  -------
random forest classifier   0.7039


In [8]:
# Stratified 80/20 hold-out split; the fixed random_state makes the split (and
# therefore the scores printed below) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    filtered_df.content,
    filtered_df.author,
    stratify=filtered_df.author,
    test_size=0.20,
    random_state=42,
)

# Fit and score every pipeline in the same order as before. Each pair of
# scores is preceded by the pipeline's name so the output lines can be told
# apart.
holdout_models = [
    ("rf_tfidf", rf_tfidf),
    ("mnb_tfidf", mnb_tfidf),
    ("svc_tfidf", svc_tfidf),
    ("rf_cv", rf_cv),
    ("mnb_cv", mnb_cv),
    ("svc_cv", svc_cv),
]

for model_label, estimator in holdout_models:
    estimator.fit(X_train, y_train)
    print(model_label)
    print('Training set score: ' + str(estimator.score(X_train, y_train)))
    print('Test set score: ' + str(estimator.score(X_test, y_test)))



Training set score: 0.9779110989910008
Test set score: 0.8320203611683432


In [9]:
from sklearn import metrics

# Predictions of the fitted TF-IDF random forest on the held-out split.
y_pred = rf_tfidf.predict(X_test)

# Confusion matrix of true vs. predicted labels; stored but not displayed
# in this cell.
cf_matrix = confusion_matrix(y_test, y_pred)

# Per-class precision / recall / f1 / support table.
report_text = metrics.classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.87      0.76      0.81       538
           1       0.97      0.92      0.94       524
           2       0.81      0.73      0.77       961
           3       0.86      0.78      0.82       915
           4       0.86      0.57      0.68       662
           5       0.89      0.97      0.93      1185
           6       0.71      0.64      0.67       510
           7       0.72      0.96      0.82      1633
           8       0.92      0.80      0.85       800
           9       0.97      0.95      0.96       523

    accuracy                           0.83      8251
   macro avg       0.86      0.81      0.83      8251
weighted avg       0.84      0.83      0.83      8251

