In [1]:
from string import punctuation
from collections import Counter
from operator import itemgetter
from textstat.textstat import textstatistics,legacy_round
import re
import numpy as np
import pandas as pd
import statistics

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn import metrics

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA


import spacy
import string

# Shared NLP objects, built once when this cell runs and read as globals below.
# NLTK WordNet lemmatizer instance.
lemmatizer = WordNetLemmatizer()
# NLTK Porter stemmer instance.
stemmer = PorterStemmer() 
# spaCy pipeline loaded from the "en_core_web_sm" package; it parses the email
# text and its Defaults.stop_words set is consulted by difficult_words().
nlp = spacy.load("en_core_web_sm")

In [None]:
def preprocess(sentence):
    r"""Normalize raw email text into a lowercase, space-separated token string.

    Steps, in order: cast to ``str``, lowercase, remove the literal ``{html}``
    marker, strip ``<...>`` tags, strip ``http\S+`` URLs, strip digit runs,
    drop blank lines, then keep only ``\w+`` tokens.

    The original version also stemmed and lemmatized every token but returned
    the un-stemmed tokens, so that work was discarded; it is no longer done.
    The returned text is unchanged.

    Parameters
    ----------
    sentence : Any
        Raw text. It is passed through ``str()`` first, so non-string input
        (for example ``None`` or ``NaN``) is processed as its string form.

    Returns
    -------
    str
        Surviving tokens joined by single spaces; ``""`` when none survive.
    """
    text = str(sentence).lower().replace('{html}', "")
    text = re.sub(r'<.*?>', '', text)      # HTML-like tags (non-greedy, single line)
    text = re.sub(r'http\S+', '', text)    # URLs up to the next whitespace
    text = re.sub('[0-9]+', '', text)      # digit runs
    # Drop lines that are empty or whitespace-only, keeping line endings on the rest.
    text = "".join(
        line for line in text.strip().splitlines(True) if line.strip("\r\n").strip()
    )
    # Same tokens as RegexpTokenizer(r'\w+').tokenize(text): for this pattern
    # the tokenizer reduces to a findall, so no tokenizer object is needed.
    tokens = re.findall(r'\w+', text)
    return " ".join(tokens)

In [None]:
def content_specific_feature(spacy_doc):
    """Compute word-, character- and vocabulary-level statistics for one document.

    Only tokens whose ``is_alpha`` flag is set count as words. A document with
    no sentences or no alphabetic tokens yields 0 for the affected ratios
    instead of raising ``ZeroDivisionError``.

    Parameters
    ----------
    spacy_doc : iterable of tokens exposing ``sents``
        Each token must provide ``is_alpha``, ``is_stop``, ``vocab``, ``len()``
        and ``str()``; ``spacy_doc.sents`` must yield sized sentences.

    Returns
    -------
    pandas.Series
        ``num_sents``, ``avg_num_word_sen`` (mean ``len(sent)``, so every token
        in a sentence is counted, not only alphabetic ones), ``num_words``,
        ``num_long_words`` (token length > 5), ``num_stop_words``,
        ``num_not_vocab_word``, ``num_chr``, ``avg_num_chr_word``,
        ``TTR`` (distinct words / words), ``HTR`` (words occurring exactly
        once / words) and ``highest_frequency`` (count of the most common word).
    """
    sents = list(spacy_doc.sents)
    words = []

    num_chr = 0
    num_long_words = 0
    num_stop_words = 0
    num_not_vocab_word = 0

    for token in spacy_doc:
        if not token.is_alpha:
            continue
        word = str(token)
        words.append(word)
        num_chr += len(word)
        if len(token) > 5:
            num_long_words += 1
        # NOTE(review): this tests the truthiness of `token.vocab`; if that is
        # the shared vocabulary object rather than an out-of-vocabulary flag,
        # the branch may never fire. Confirm whether `token.is_oov` was meant.
        if not token.vocab:
            num_not_vocab_word += 1
        if token.is_stop:
            num_stop_words += 1

    num_words = len(words)
    avg_num_chr_word = 0
    TTR = 0
    HTR = 0
    highest_frequency = 0
    if words:
        # One pass over the words instead of calling words.count() per word.
        frequencies = Counter(words)
        avg_num_chr_word = num_chr / num_words
        TTR = len(frequencies) / num_words
        HTR = sum(1 for count in frequencies.values() if count == 1) / num_words
        highest_frequency = max(frequencies.values())

    num_sents = len(sents)
    # Guard the empty document: there are no sentences to average over.
    avg_num_word_sen = sum(len(sent) for sent in sents) / num_sents if num_sents else 0

    return pd.Series({"num_sents" : num_sents,
                      "avg_num_word_sen" : avg_num_word_sen,
                      "num_words" : num_words,
                      "num_long_words" : num_long_words,
                      "num_stop_words" : num_stop_words,
                      "num_not_vocab_word" : num_not_vocab_word,
                      "num_chr" : num_chr,
                      "avg_num_chr_word": avg_num_chr_word,
                      "TTR" : TTR,
                      "HTR" : HTR,
                      "highest_frequency" : highest_frequency
                     })

def punctuation_specific_feature(spacy_doc):
    """Count tokens that are exactly one of six punctuation marks.

    A token is counted only when its string form equals the mark itself
    (``","``, ``"."``, ``"!"``, ``"?"``, ``":"`` or ``";"``).

    Returns
    -------
    pandas.Series
        One count per mark, in the order comma, dot, exclamation, question,
        colon, semicolon.
    """
    feature_by_mark = {
        ',': "num_com_email",
        '.': "num_dot_email",
        '!': "num_exc_email",
        '?': "num_que_email",
        ':': "num_col_email",
        ';': "num_semi_col_email",
    }
    # Tally every token text once; marks that never occur read back as 0.
    token_tally = Counter(map(str, spacy_doc))
    return pd.Series(
        {feature: token_tally[mark] for mark, feature in feature_by_mark.items()}
    )

def syntactic_specific_feature(spacy_doc):
    """Compute part-of-speech based features for one document.

    Parameters
    ----------
    spacy_doc : iterable of tokens
        Each token must expose a coarse POS tag as ``pos_``.

    Returns
    -------
    pandas.Series
        ``num_func_words``: tokens tagged as pronoun, determiner, adposition,
        conjunction or auxiliary; ``avg_verb_email``: share of tokens tagged
        ``VERB`` (0 for an empty document); ``num_pos_email``: number of
        distinct POS tags seen.
    """
    # "CONJ" is the legacy tag; the Universal POS scheme splits conjunctions
    # into "CCONJ" and "SCONJ", so all three are accepted.
    function_word_tags = {"PRON", "DET", "ADP", "CONJ", "CCONJ", "SCONJ", "AUX"}

    pos_tags = [token.pos_ for token in spacy_doc]
    num_func_words = sum(1 for tag in pos_tags if tag in function_word_tags)
    # Guard the empty document, which previously raised ZeroDivisionError.
    avg_verb_email = pos_tags.count("VERB") / len(pos_tags) if pos_tags else 0
    num_pos_email = len(set(pos_tags))
    return pd.Series({"num_func_words" : num_func_words,
                      "avg_verb_email" : avg_verb_email,
                      "num_pos_email" : num_pos_email
                     })

# Lazily-built analyzer shared by every call to semantic_specific_feature().
_SENTIMENT_ANALYZER = None


def _get_sentiment_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it on first use."""
    global _SENTIMENT_ANALYZER
    if _SENTIMENT_ANALYZER is None:
        _SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()
    return _SENTIMENT_ANALYZER


def semantic_specific_feature(spacy_doc, raw_doc, analyzer=None):
    """Compute sentiment and named-entity features for one document.

    Parameters
    ----------
    spacy_doc : iterable of tokens
        Each token must support ``str()`` and expose ``ent_type_``.
    raw_doc : str
        Full document text, scored as a whole for ``polarity_score``.
    analyzer : object, optional
        Anything with a ``polarity_scores(text)`` method returning a mapping
        that contains ``'compound'``. Defaults to one shared
        ``SentimentIntensityAnalyzer`` that is created once and reused,
        instead of building a new analyzer on every call.

    Returns
    -------
    pandas.Series
        ``num_pos_word`` (tokens with compound score >= 0.5),
        ``num_neg_word`` (tokens with compound score <= -0.5),
        ``num_named_entity`` (tokens with a non-empty ``ent_type_``) and
        ``polarity_score`` (compound score of ``raw_doc``).
    """
    sid = analyzer if analyzer is not None else _get_sentiment_analyzer()
    polarity_scores = sid.polarity_scores(raw_doc)
    num_pos_word = 0
    num_neg_word = 0
    num_named_entity = 0

    for token in spacy_doc:
        token_pol_score = sid.polarity_scores(str(token))['compound']
        if token_pol_score >= 0.5:
            num_pos_word += 1
        elif token_pol_score <= -0.5:
            num_neg_word += 1
        if token.ent_type_ != "":
            num_named_entity += 1
    return pd.Series({"num_pos_word" : num_pos_word,
                      "num_neg_word" : num_neg_word,
                      "num_named_entity" : num_named_entity,
                      "polarity_score" : polarity_scores['compound']
                     })

def syllables_count(word):
    """Return the syllable count that textstat reports for ``word``."""
    stats = textstatistics()
    return stats.syllable_count(word)

def difficult_words(spacy_doc):
    """Count the distinct "difficult" token texts in a document.

    A token text counts as difficult when it has at least two syllables
    (per ``syllables_count``) and is not in the spaCy pipeline's default
    stop-word set. Repeated texts are counted once.
    """
    stop_words = nlp.Defaults.stop_words
    hard_texts = set()
    for sentence in list(spacy_doc.sents):
        for token in sentence:
            text = str(token)
            n_syllables = syllables_count(text)
            if n_syllables >= 2 and text not in stop_words:
                hard_texts.add(text)
    return len(hard_texts)

def dale_chall_index(spacy_doc):
    """Compute a Dale-Chall style readability score for one document.

    The score is ``0.1579 * pct_difficult + 0.0496 * avg_sentence_length``,
    plus ``3.6365`` when more than 5 percent of tokens are difficult, and is
    rounded to 2 decimals with ``legacy_round``. ``pct_difficult`` comes from
    ``difficult_words(spacy_doc)`` relative to the total token count.

    Returns
    -------
    float
        The rounded score, or ``0.0`` for a document with no tokens or no
        sentences (this case previously raised ``ZeroDivisionError``, and
        ``per`` was left unbound).
    """
    word_count = len(spacy_doc)
    sent_count = len(list(spacy_doc.sents))
    if word_count == 0 or sent_count == 0:
        return 0.0

    avg_sentence_length = word_count / sent_count
    easy_count = word_count - difficult_words(spacy_doc)
    easy_pct = easy_count / word_count * 100
    diff_words = 100 - easy_pct
    score = (0.1579 * diff_words) + (0.0496 * avg_sentence_length)
    if diff_words > 5:
        score += 3.6365
    return legacy_round(score, 2)


def smog_index(spacy_doc):
    """Compute the SMOG readability index for one document.

    Returns ``0`` when the document has fewer than 3 sentences. Otherwise the
    value is ``1.043 * sqrt(30 * polysyllables / sentences) + 3.1291``, rounded
    to 1 decimal with ``legacy_round``, where a polysyllable is a token with
    at least 3 syllables according to ``syllables_count``.
    """
    sent_count = len(list(spacy_doc.sents))
    # Decide first: with fewer than 3 sentences the result is 0 regardless of
    # the tokens, so the per-token syllable counting would be wasted work.
    if sent_count < 3:
        return 0

    poly_syllable_count = sum(
        1 for token in spacy_doc if syllables_count(str(token)) >= 3
    )
    SMOG = (1.043 * (30 * (poly_syllable_count / sent_count)) ** 0.5) + 3.1291
    return legacy_round(SMOG, 1)
    

def flesch_reading_index(spacy_doc):
    """Compute the Flesch Reading Ease score for one document.

    The score is ``206.835 - 1.015 * (tokens / sentences)
    - 84.6 * (syllables / tokens)``, rounded to 2 decimals with
    ``legacy_round``. Syllables come from ``syllables_count`` applied to
    each token's text.

    Returns
    -------
    float
        The rounded score, or ``0.0`` for a document with no tokens or no
        sentences (this case previously raised ``ZeroDivisionError``).
    """
    word_count = len(spacy_doc)
    sent_count = len(list(spacy_doc.sents))
    if word_count == 0 or sent_count == 0:
        return 0.0

    avg_sentence_length = word_count / sent_count
    total_syllables = sum(syllables_count(str(token)) for token in spacy_doc)
    avg_syllables_per_word = total_syllables / word_count
    FRE = 206.835 - float(1.015 * avg_sentence_length) - \
          float(84.6 * avg_syllables_per_word)
    return legacy_round(FRE, 2)

    
def readability_specific_features(spacy_doc):
    """Bundle the three readability indices for one document into a Series.

    The entries, in order, are ``smog_idx``, ``dale_chall_idx`` and
    ``flesch_idx``.
    """
    index_functions = (
        ("smog_idx", smog_index),
        ("dale_chall_idx", dale_chall_index),
        ("flesch_idx", flesch_reading_index),
    )
    return pd.Series(
        {label: compute(spacy_doc) for label, compute in index_functions}
    )

In [None]:
# Load the cleaned Enron export and keep only senders for whom some column has
# more than 2500 non-null values.
emails_df = pd.read_csv('kaggle_enron_email_cleaned.csv')
# .copy() so the column assignments below act on an independent frame rather
# than on a possible view of emails_df.
filtered_df = emails_df.groupby('sender').filter(lambda g: g.count().gt(2500).any()).copy()
grp_df = filtered_df.groupby('sender')
filtered_author_list = [key for key, item in grp_df]
filtered_author_count = [grp_df.get_group(key).count() for key, item in grp_df]

# Normalized text, plus an integer author label derived from the sender.
filtered_df['content'] = filtered_df['email_body'].map(preprocess)
filtered_df['sender'] = filtered_df['sender'].astype('category')
filtered_df['author'] = filtered_df['sender'].cat.codes
filtered_df = filtered_df.drop(
    columns=['email_body', 'sender', 'file', 'valid', 'Unnamed: 0', 'sender_email']
)

# Remove rows whose content is empty BEFORE parsing: this avoids running the
# spaCy pipeline on rows that are discarded anyway, and avoids scanning a
# column of Doc objects during the replace.
filtered_df = filtered_df.replace("", float("NaN"))
filtered_df = filtered_df.dropna(subset=["content"])

# "email_body" now holds the parsed spaCy Doc for each remaining row.
filtered_df["email_body"] = filtered_df["content"].apply(lambda x: nlp(x))

In [None]:
# Each extractor turns the current frame into a per-row feature table. The
# tables are joined onto filtered_df one after another, in this fixed order,
# which determines the order of the feature columns.
feature_extractors = (
    lambda frame: frame.email_body.apply(readability_specific_features),
    lambda frame: frame.email_body.apply(syntactic_specific_feature),
    # The sentiment features need both the parsed doc and the plain text.
    lambda frame: frame.apply(
        lambda x: semantic_specific_feature(x["email_body"], x["content"]), axis=1
    ),
    lambda frame: frame.email_body.apply(punctuation_specific_feature),
    lambda frame: frame.email_body.apply(content_specific_feature),
)
for extract in feature_extractors:
    filtered_df = filtered_df.join(extract(filtered_df))

In [None]:
# The parsed docs are no longer needed once the features are extracted.
filtered_df.drop(columns=['email_body'], inplace=True)
filtered_df_copy = filtered_df.copy()

# Every remaining column except the text and the label is a numeric feature.
features = filtered_df.columns.tolist()
for non_feature in ("content", "author"):
    features.remove(non_feature)

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Standardize the feature columns, then re-attach the author label positionally.
scalar = StandardScaler()
scaled_values = scalar.fit_transform(filtered_df[features].copy())
standardized_features = pd.DataFrame(scaled_values, columns=features)
standardized_features["author"] = filtered_df["author"].values

In [None]:
# Feature matrix (everything except the label) and label vector.
X = standardized_features.drop(columns='author')
y = standardized_features['author']

# Full PCA (no component limit); x_new holds the projected samples.
pca = PCA()
x_new = pca.fit_transform(X)

def myplot(score,coeff,labels=None):
    """Draw a biplot: range-scaled sample scores plus one arrow per coefficient row.

    Parameters
    ----------
    score : 2-D array
        Columns 0 and 1 are used as the x and y sample coordinates.
    coeff : 2-D array
        One row per variable; columns 0 and 1 give the arrow end point.
    labels : sequence, optional
        Text for each arrow; when omitted the arrows are labelled
        ``Var1``, ``Var2``, ...

    Points are coloured by the module-level ``y``.
    """
    pc_x = score[:, 0]
    pc_y = score[:, 1]
    # Divide each axis by its own range so the point cloud spans one unit.
    x_factor = 1.0 / (pc_x.max() - pc_x.min())
    y_factor = 1.0 / (pc_y.max() - pc_y.min())
    plt.scatter(pc_x * x_factor, pc_y * y_factor, c = y)

    for idx in range(coeff.shape[0]):
        tip_x = coeff[idx, 0]
        tip_y = coeff[idx, 1]
        plt.arrow(0, 0, tip_x, tip_y, color = 'r', alpha = 0.5)
        caption = "Var"+str(idx+1) if labels is None else labels[idx]
        # Place the caption slightly beyond the arrow tip.
        plt.text(tip_x * 1.15, tip_y * 1.15, caption, color = 'g', ha = 'center', va = 'center')
# These pyplot calls sit at cell level (outside myplot), so they configure the
# current axes before myplot draws onto them: fixed [-1, 1] limits on both
# axes, axis labels "PC1" / "PC2", and a grid.
plt.xlim(-1,1)
plt.ylim(-1,1)
plt.xlabel("PC{}".format(1))
plt.ylabel("PC{}".format(2))
plt.grid()

# Draw the biplot from the first two principal components only: the first two
# score columns, and the first two component rows transposed so that each row
# of the coefficient array corresponds to one input feature.
myplot(x_new[:,0:2],np.transpose(pca.components_[0:2, :]))
plt.show()

In [None]:
# Re-derive the feature matrix and label, then fit a 15-component PCA.
X = standardized_features.drop(columns='author')
y = standardized_features['author']
model = PCA(n_components=15).fit(X)
X_pc = model.transform(X)
n_pcs = model.components_.shape[0]

# For each component, the index of the feature with the largest absolute loading.
most_important = [np.abs(component).argmax() for component in model.components_]
initial_feature_names = list(X.columns)
most_important_names = [initial_feature_names[feature_idx] for feature_idx in most_important]
print(most_important_names)

In [None]:
# Map each component label ("PC0", "PC1", ...) to its dominant feature name
# and show the mapping as a two-column table.
dic = {
    'PC{}'.format(pc_idx): feature_name
    for pc_idx, feature_name in zip(range(n_pcs), most_important_names)
}
df = pd.DataFrame(dic.items())
print(df)

In [None]:
# Keep only the PCA-selected feature columns; a feature can dominate several
# components, so repeated columns are dropped (first occurrence wins).
pca_features = standardized_features[most_important_names].copy()
is_repeat = pca_features.columns.duplicated()
pca_features = pca_features.loc[:, ~is_repeat]

# Re-attach the text and the label positionally.
pca_features["content"] = filtered_df["content"].values
pca_features["author"] = filtered_df["author"].values

# Strip every string.punctuation character from the text.
punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))
pca_features["content"] = pca_features["content"].apply(
    lambda text: punctuation_pattern.sub('', text)
)

In [None]:
# Independent working copies, one per text-vectorization strategy.
pca_features_tfidf, pca_features_cv = pca_features.copy(), pca_features.copy()

In [None]:
# Expand the text into dense TF-IDF columns, one per vocabulary term.
tfidf_vec = TfidfVectorizer()
tfidf_dense = tfidf_vec.fit_transform(pca_features_tfidf['content']).todense()
new_cols = tfidf_vec.get_feature_names_out()
tfidf_frame = pd.DataFrame(tfidf_dense, columns=new_cols)

# Replace the label and raw-text columns with the TF-IDF columns.
pca_features_tfidf = (
    pca_features_tfidf
    .drop('author', axis=1)
    .drop('content', axis=1)
    .join(tfidf_frame)
)

In [None]:
# Classifiers trained on the stylometric features plus the TF-IDF columns.
# random_state is fixed so the split and the fitted models are reproducible.
rf_tfidf = Pipeline([("random forest",
                         RandomForestClassifier(n_estimators=100, random_state=42))
])

svc_tfidf = Pipeline([
               ("linear svc",
                SVC(kernel="linear", probability=True, random_state=42))])

# Use the TF-IDF-expanded frame. The original cell selected `pca_features`,
# which still contains the raw 'content' strings and so cannot be fed to the
# estimators. The label and text columns were already dropped from
# pca_features_tfidf when it was built, so no column mask is applied here
# (a mask on the name 'author' could also remove a TF-IDF term of that name).
X = pca_features_tfidf
X_train, X_test, y_train, y_test = train_test_split(X, filtered_df.author,
                                                    stratify=filtered_df.author,
                                                    test_size=0.20,
                                                    random_state=42)
rf_tfidf.fit(X_train, y_train)
print('Training set score: ' + str(rf_tfidf.score(X_train,y_train)))
print('Test set score: ' + str(rf_tfidf.score(X_test,y_test)))

svc_tfidf.fit(X_train, y_train)
print('Training set score: ' + str(svc_tfidf.score(X_train,y_train)))
print('Test set score: ' + str(svc_tfidf.score(X_test,y_test)))

In [None]:
# Classifiers trained on the stylometric features plus bag-of-words counts.
# random_state is fixed so the split and the fitted models are reproducible.
rf_cv = Pipeline([("random forest",
                         RandomForestClassifier(n_estimators=100, random_state=42))
])

# No vectorizer step here: the text is already vectorized by `preproc` below.
# The original pipeline put a CountVectorizer in front of the SVC and then fed
# it the already-vectorized matrix, which a text vectorizer cannot consume.
svc_cv = Pipeline([
               ("linear svc",
                SVC(kernel="linear", probability=True, random_state=42))])

X = pca_features_cv.iloc[:,pca_features_cv.columns != 'author']
X_train, X_test, y_train, y_test = train_test_split(X, filtered_df.author,
                                                    stratify=filtered_df.author,
                                                    test_size=0.20,
                                                    random_state=42)
# Vectorize the 'content' column and pass the remaining columns through
# unchanged. The vocabulary is fitted on the training split only and then
# applied to the test split.
preproc = ColumnTransformer(
    [('text_vect', CountVectorizer(), 'content')],
    remainder='passthrough',
)
X_train_preproc = preproc.fit_transform(X_train)
X_test_preproc = preproc.transform(X_test)

rf_cv.fit(X_train_preproc, y_train)
print('Training set score: ' + str(rf_cv.score(X_train_preproc,y_train)))
print('Test set score: ' + str(rf_cv.score(X_test_preproc,y_test)))

svc_cv.fit(X_train_preproc, y_train)
print('Training set score: ' + str(svc_cv.score(X_train_preproc,y_train)))
print('Test set score: ' + str(svc_cv.score(X_test_preproc,y_test)))