Use text data, text mining and topic modeling to detect fraudulent behavior.

In [1]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import numpy as np
from pprint import pprint as pp
import csv
from pathlib import Path
import seaborn as sns
from itertools import product
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.pipeline import Pipeline 

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score, classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve, precision_recall_curve, average_precision_score
from sklearn.metrics import homogeneity_score, silhouette_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import MiniBatchKMeans, DBSCAN

import gensim
from gensim import corpora

In [2]:
pd.set_option('display.max_columns', 700)
pd.set_option('display.max_rows', 400)
pd.set_option('display.min_rows', 10)
pd.set_option('display.expand_frame_repr', True)

data = Path.cwd() / 'data'
ch4 = data / 'dataset_4'
enron_emails_clean_file = ch4 / 'enron_emails_clean.csv'
cleantext_file = ch4 / 'cleantext.pickle'
corpus_file = ch4 / 'corpus.pickle'
dict_file = ch4 / 'dict.pickle'
ldamodel_file = ch4 / 'ldamodel.pickle'

In [3]:
df = pd.read_csv(enron_emails_clean_file)

In [5]:
# Using a string operator to find words
df['clean_content'].str.contains('money laundering')

 # Select data that matches 
df.loc[df['clean_content'].str.contains('money laundering', na=False)]

 # Create a list of words to search for
list_of_words = ['police', 'money laundering']
df.loc[df['clean_content'].str.contains('|'.join(list_of_words), na=False)]

 # Create a fraud flag 
df['flag'] = np.where((df['clean_content'].str.contains('|'.join(list_of_words)) == True), 1, 0)

In [6]:
df = pd.read_csv(enron_emails_clean_file)

In [9]:
# Create a list of terms to search for
searchfor = ['enron stock', 'sell stock', 'stock bonus', 'sell enron stock']

# Filter cleaned emails on searchfor list and select from df 
filtered_emails = df[df.clean_content.str.contains('|'.join(searchfor), na=False)]
filtered_emails.head()

Unnamed: 0,Message-ID,From,To,Date,content,clean_content,Unnamed: 6
0,<8345058.1075840404046.JavaMail.evans@thyme>,('advdfeedback@investools.com'),('advdfeedback@investools.com'),2002/1/29 23:20,INVESTools Advisory\nA Free Digest of Trusted ...,investools advisory free digest trusted invest...,
1,<1512159.1075863666797.JavaMail.evans@thyme>,('richard.sanders@enron.com'),('richard.sanders@enron.com'),2000/9/20 19:07,----- Forwarded by Richard B Sanders/HOU/ECT o...,forwarded richard b sanders hou ect pm justin ...,
2,<26118676.1075862176383.JavaMail.evans@thyme>,('m..love@enron.com'),('m..love@enron.com'),2001/10/30 16:15,hey you are not wearing your target purple shi...,hey wearing target purple shirt today mine wan...,
3,<10369289.1075860831062.JavaMail.evans@thyme>,('leslie.milosevich@kp.org'),('leslie.milosevich@kp.org'),2002/1/30 17:54,Leslie Milosevich\n1042 Santa Clara Avenue\nAl...,leslie milosevich santa clara avenue alameda c...,
4,<26728895.1075860815046.JavaMail.evans@thyme>,('rtwait@graphicaljazz.com'),('rtwait@graphicaljazz.com'),2002/1/30 19:36,"Rini Twait\n1010 E 5th Ave\nLongmont, CO 80501...",rini twait e th ave longmont co rtwait graphic...,


In [10]:
# Create flag variable where the emails match the searchfor terms
df['flag'] = np.where((df['clean_content'].str.contains('|'.join(searchfor)) == True), 1, 0)

# Count the values of the flag variable
count = df['flag'].value_counts()
print(count)

0    13923
1      303
Name: flag, dtype: int64


sift through messy email data and search for the email with suspicious words

Data Preprocessing

In [11]:
from nltk import word_tokenize
from nltk.corpus import stopwords 
import string

# 1. Tokenization
text = df.apply(lambda row: word_tokenize(str(row["clean_content"])), axis=1)
text = df.apply(lambda row: str(row["clean_content"]).lower(), axis=1)
text = df.apply(lambda row: str(row["clean_content"]).rstrip(), axis=1)
###text = text.rstrip()  # remove whitespace
# # replace with lowercase
# # text = re.sub(r'[^a-zA-Z]', ' ', text)
# text = text.str.lower()

 # 2. Remove all stopwords and punctuation
exclude = set(string.punctuation)
stop = set(stopwords.words('english'))
stop_free = " ".join([word for word in text if((word not in stop) and (not word.isdigit()))])
punc_free = ''.join(word for word in stop_free if word not in exclude)

In [12]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# Lemmatize words
lemma = WordNetLemmatizer()
normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())

# Stem words
porter= PorterStemmer()
cleaned_text = " ".join(porter.stem(token) for token in normalized.split())

In [13]:
# Define stopwords to exclude
stop = set(stopwords.words('english'))
stop.update(("to", "cc", "subject", "http", "from", "sent", "ect", "u", "fwd", "www", "com", 'html'))

# Define punctuations to exclude and lemmatizer
exclude = set(string.punctuation)


In [14]:
# Import the lemmatizer from nltk
lemma = WordNetLemmatizer()

def clean(text, stop):
    text = str(text).rstrip()
    stop_free = " ".join([i for i in text.lower().split() if((i not in stop) and (not i.isdigit()))])
    punc_free = ''.join(i for i in stop_free if i not in exclude)
    normalized = " ".join(lemma.lemmatize(i) for i in punc_free.split())      
    return normalized

In [15]:
# Clean the emails in df and print results
text_clean=[]
for text in df['clean_content']:
    text_clean.append(clean(text, stop).split())    

In [16]:
text_clean[0][:10]

['investools',
 'advisory',
 'free',
 'digest',
 'trusted',
 'investment',
 'advice',
 'unsubscribe',
 'free',
 'newsletter']

Topic modeling on fraud --- using LDA model

In [17]:
# Define the dictionary
dictionary = corpora.Dictionary(text_clean)

# Define the corpus 
corpus = [dictionary.doc2bow(text) for text in text_clean]

In [18]:
# Define the LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=5)

# Save the topics and top 5 words
topics = ldamodel.print_topics(num_words=5)

# Print the results
for topic in topics:
    print(topic)

(0, '0.012*"image" + 0.007*"market" + 0.005*"price" + 0.005*"new" + 0.005*"diabetes"')
(1, '0.598*"nan" + 0.011*"enronoptions" + 0.002*"ption" + 0.002*"typing" + 0.001*"salary"')
(2, '0.031*"enron" + 0.010*"please" + 0.010*"message" + 0.008*"pm" + 0.008*"e"')
(3, '0.035*"enron" + 0.031*"fund" + 0.023*"employee" + 0.022*"company" + 0.019*"energy"')
(4, '0.048*"enron" + 0.026*"hou" + 0.018*"td" + 0.015*"net" + 0.013*"deal"')


In [30]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
lda_display = gensimvis.prepare(ldamodel, corpus, dictionary, sort_topics=False)

  import imp
  from collections import Iterable
  from collections import Mapping
  import imp
  from collections import Iterable
  from collections import Mapping
  import imp
  from collections import Iterable
  from collections import Mapping
  import imp
  from collections import Iterable
  from collections import Mapping
  import imp
  from collections import Iterable
  from collections import Mapping
  import imp
  from collections import Iterable
  from collections import Mapping
  import imp
  from collections import Iterable
  from collections import Mapping
  import imp
  from collections import Iterable
  from collections import Mapping


In [20]:
pyLDAvis.display(lda_display)

In [21]:
def get_topic_details(ldamodel, corpus):
    topic_details_df = pd.DataFrame()
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_details_df = topic_details_df.append(pd.Series([topic_num, prop_topic]), ignore_index=True)
    topic_details_df.columns = ['Dominant_Topic', '% Score']
    return topic_details_df


contents = pd.DataFrame({'Original text':text_clean})
topic_details = pd.concat([get_topic_details(ldamodel,
                           corpus), contents], axis=1)
topic_details.head()


Unnamed: 0,Dominant_Topic,% Score,Original text
0,0.0,0.826967,"[investools, advisory, free, digest, trusted, ..."
1,2.0,0.838321,"[forwarded, richard, b, sander, hou, pm, justi..."
2,2.0,0.647142,"[hey, wearing, target, purple, shirt, today, m..."
3,3.0,0.993506,"[leslie, milosevich, santa, clara, avenue, ala..."
4,3.0,0.993381,"[rini, twait, e, th, ave, longmont, co, rtwait..."


In [22]:
def get_topic_details(ldamodel, corpus):
    topic_details_df = pd.DataFrame()
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_details_df = topic_details_df.append(pd.Series([topic_num, prop_topic]), ignore_index=True)
    topic_details_df.columns = ['Dominant_Topic', '% Score']
    return topic_details_df

In [23]:
# Run get_topic_details function and check the results
topic_details_df = get_topic_details(ldamodel, corpus)

In [24]:
topic_details_df.head()

Unnamed: 0,Dominant_Topic,% Score
0,0.0,0.826967
1,2.0,0.838321
2,2.0,0.647141
3,3.0,0.993505
4,3.0,0.993381


In [25]:
topic_details_df.tail()


Unnamed: 0,Dominant_Topic,% Score
14221,0.0,0.642619
14222,4.0,0.598067
14223,0.0,0.974289
14224,0.0,0.985787
14225,2.0,0.776149


In [26]:
# Add original text to topic details in a dataframe
contents = pd.DataFrame({'Original text': text_clean})
topic_details = pd.concat([get_topic_details(ldamodel, corpus), contents], axis=1)

In [27]:
topic_details.sort_values(by=['% Score'], ascending=False).head(10).head()

Unnamed: 0,Dominant_Topic,% Score,Original text
13390,2.0,0.99914,"[inline, attachment, follows, scasey, tfsbroke..."
13145,0.0,0.99906,"[start, date, hourahead, hour, hourahead, sche..."
7120,0.0,0.998952,"[dear, buy, giving, valued, customer, year, en..."
13335,2.0,0.99835,"[hey, thought, hope, noted, watching, shoemake..."
13239,2.0,0.998083,"[meet, elevator, original, message, maggi, mik..."


In [28]:
topic_details.sort_values(by=['% Score'], ascending=False).head(10).tail()

Unnamed: 0,Dominant_Topic,% Score,Original text
11964,2.0,0.997998,"[stephanie, based, upon, message, made, necess..."
10545,2.0,0.99779,"[got, ticket, thanks, sending, paid, ou, ticke..."
69,2.0,0.99774,"[late, meet, house, original, message, erin, r..."
58,2.0,0.997703,"[calling, fat, as, serious, restaurant, better..."
13728,4.0,0.997655,"[please, check, master, section, change, s, fo..."


In [29]:
# Create flag for text highest associated with topic 3
topic_details['flag'] = np.where((topic_details['Dominant_Topic'] == 3.0), 1, 0)
topic_details_1 = topic_details[topic_details.flag == 1]
topic_details_1.sort_values(by=['% Score'], ascending=False).head(10)

Unnamed: 0,Dominant_Topic,% Score,Original text,flag
7048,3.0,0.993626,"[anne, walton, sandberg, pine, street, suite, ...",1
7053,3.0,0.993583,"[karsten, mueller, hubbard, st, santa, cruz, c...",1
54,3.0,0.993575,"[diego, baz, e, duarte, rd, san, gabriel, ca, ...",1
67,3.0,0.993557,"[cynthia, peterson, w, n, mt, du, lac, sussex,...",1
13083,3.0,0.993533,"[kathleen, corcoran, stinson, lake, road, rumn...",1
104,3.0,0.993526,"[david, gold, west, th, street, apt, c, new, y...",1
6101,3.0,0.993524,"[tiffiny, tung, th, st, nw, b, washington, dc,...",1
6367,3.0,0.993521,"[donna, cay, tharpe, wisconsin, avenue, nw, wa...",1
6135,3.0,0.99352,"[robert, zimmer, n, la, palmas, ave, los, ange...",1
43,3.0,0.993518,"[leila, salazar, mission, st, suite, san, fran...",1
