Use text data, text mining and topic modeling to detect fraudulent behavior.

In [1]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import numpy as np
from pprint import pprint as pp
import csv
from pathlib import Path
import seaborn as sns
from itertools import product
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.pipeline import Pipeline 

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score, classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve, precision_recall_curve, average_precision_score
from sklearn.metrics import homogeneity_score, silhouette_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import MiniBatchKMeans, DBSCAN

import gensim
from gensim import corpora

In [2]:
pd.set_option('display.max_columns', 700)
pd.set_option('display.max_rows', 400)
pd.set_option('display.min_rows', 10)
pd.set_option('display.expand_frame_repr', True)

data = Path.cwd() / 'data'
ch4 = data / 'dataset_4'
enron_emails_clean_file = ch4 / 'enron_emails_clean.csv'
cleantext_file = ch4 / 'cleantext.pickle'
corpus_file = ch4 / 'corpus.pickle'
dict_file = ch4 / 'dict.pickle'
ldamodel_file = ch4 / 'ldamodel.pickle'

In [3]:
df = pd.read_csv(enron_emails_clean_file)

In [4]:
df['clean_content'].str.contains('money laundering')

0        False
1        False
2        False
3        False
4        False
         ...  
14221    False
14222    False
14223    False
14224    False
14225    False
Name: clean_content, Length: 14226, dtype: object

In [5]:
# Using a string operator to find words
df['clean_content'].str.contains('money laundering')

 # Select data that matches 
df.loc[df['clean_content'].str.contains('money laundering', na=False)]

 # Create a list of words to search for
list_of_words = ['police', 'money laundering']
df.loc[df['clean_content'].str.contains('|'.join(list_of_words), na=False)]

 # Create a fraud flag 
df['flag'] = np.where((df['clean_content'].str.contains('|'.join(list_of_words)) == True), 1, 0)

In [6]:
df = pd.read_csv(enron_emails_clean_file)

In [7]:
mask = df['clean_content'].str.contains('sell enron stock', na=False)

In [8]:
# Select the data from df using the mask
df[mask]



Unnamed: 0,Message-ID,From,To,Date,content,clean_content,Unnamed: 6


In [9]:
# Create a list of terms to search for
searchfor = ['enron stock', 'sell stock', 'stock bonus', 'sell enron stock']

# Filter cleaned emails on searchfor list and select from df 
filtered_emails = df[df.clean_content.str.contains('|'.join(searchfor), na=False)]
filtered_emails.head()

Unnamed: 0,Message-ID,From,To,Date,content,clean_content,Unnamed: 6
0,<8345058.1075840404046.JavaMail.evans@thyme>,('advdfeedback@investools.com'),('advdfeedback@investools.com'),2002/1/29 23:20,INVESTools Advisory\nA Free Digest of Trusted ...,investools advisory free digest trusted invest...,
1,<1512159.1075863666797.JavaMail.evans@thyme>,('richard.sanders@enron.com'),('richard.sanders@enron.com'),2000/9/20 19:07,----- Forwarded by Richard B Sanders/HOU/ECT o...,forwarded richard b sanders hou ect pm justin ...,
2,<26118676.1075862176383.JavaMail.evans@thyme>,('m..love@enron.com'),('m..love@enron.com'),2001/10/30 16:15,hey you are not wearing your target purple shi...,hey wearing target purple shirt today mine wan...,
3,<10369289.1075860831062.JavaMail.evans@thyme>,('leslie.milosevich@kp.org'),('leslie.milosevich@kp.org'),2002/1/30 17:54,Leslie Milosevich\n1042 Santa Clara Avenue\nAl...,leslie milosevich santa clara avenue alameda c...,
4,<26728895.1075860815046.JavaMail.evans@thyme>,('rtwait@graphicaljazz.com'),('rtwait@graphicaljazz.com'),2002/1/30 19:36,"Rini Twait\n1010 E 5th Ave\nLongmont, CO 80501...",rini twait e th ave longmont co rtwait graphic...,


In [10]:
# Create flag variable where the emails match the searchfor terms
df['flag'] = np.where((df['clean_content'].str.contains('|'.join(searchfor)) == True), 1, 0)

# Count the values of the flag variable
count = df['flag'].value_counts()
print(count)

0    13923
1      303
Name: flag, dtype: int64


In [11]:
df["clean_content"]

0        investools advisory free digest trusted invest...
1        forwarded richard b sanders hou ect pm justin ...
2        hey wearing target purple shirt today mine wan...
3        leslie milosevich santa clara avenue alameda c...
4        rini twait e th ave longmont co rtwait graphic...
                               ...                        
14221                 bot apr oct put digital gas x thanks
14222                                                 okay
14223    image image image image image image image imag...
14224    transmission expansion systems transition conf...
14225    sorry nella replied ago directly rob milnthorp...
Name: clean_content, Length: 14226, dtype: object

sift through messy email data and search for the email with suspicious words

Data Preprocessing

In [12]:
from nltk import word_tokenize
from nltk.corpus import stopwords 
import string

# 1. Tokenization
text = df.apply(lambda row: word_tokenize(str(row["clean_content"])), axis=1)
text = df.apply(lambda row: str(row["clean_content"]).lower(), axis=1)
text = df.apply(lambda row: str(row["clean_content"]).rstrip(), axis=1)
###text = text.rstrip()  # remove whitespace
# # replace with lowercase
# # text = re.sub(r'[^a-zA-Z]', ' ', text)
# text = text.str.lower()

 # 2. Remove all stopwords and punctuation
exclude = set(string.punctuation)
stop = set(stopwords.words('english'))
stop_free = " ".join([word for word in text if((word not in stop) and (not word.isdigit()))])
punc_free = ''.join(word for word in stop_free if word not in exclude)

In [13]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# Lemmatize words
lemma = WordNetLemmatizer()
normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())

# Stem words
porter= PorterStemmer()
cleaned_text = " ".join(porter.stem(token) for token in normalized.split())
print (cleaned_text)

['philip','going','street','curious','hear','perspective','may','wish',
'offer','trading','floor','enron','stock','lower','joined','company',
'business','school','imagine','quite','happy','people','day','relate',
'somewhat','stock','around','fact','broke','day','ago','knowing',
'imagine','letting','event','get','much','taken','similar',
'problem','hope','everything','else','going','well','family','knee',
'surgery','yet','give','call','chance','later']

investool advisori free digest trust invest advic unsubscrib free newslett pleas see issu fri sell stock gain month km row januari index confirm bull market aloy small cap advisor earn lbix compound return pine tree pcl undervalu high yield bank put custom first aso word sponsor top wall street watcher ben zack year year gain move best brightest wall street big money machin earn ben zack five year averag annual gain start outperform long term get zack latest stock buylist free day trial http www investool com c go zak mtxtu zakstb investool advisori john brobst investool com fri sell stock lock month km david fri know stock undervalu compani manag buy back share open market latest triumph pocket impress gain three short month sell four buyback stock includ gain auto retail autom incorpor gain digit phone system purveyor inter tel intl fri recent move buy kmart corpor km beleagu discount retail declar bankruptci think k mart go busi fri say take recoveri possibl bought share anoth fri p

['philip',
 'going',
 'street',
 'curious',
 'hear',
 'perspective',
 'may',
 'wish',
 'offer',
 'trading',
 'floor',
 'enron',
 'stock',
 'lower',
 'joined',
 'company',
 'business',
 'school',
 'imagine',
 'quite',
 'happy',
 'people',
 'day',
 'relate',
 'somewhat',
 'stock',
 'around',
 'fact',
 'broke',
 'day',
 'ago',
 'knowing',
 'imagine',
 'letting',
 'event',
 'get',
 'much',
 'taken',
 'similar',
 'problem',
 'hope',
 'everything',
 'else',
 'going',
 'well',
 'family',
 'knee',
 'surgery',
 'yet',
 'give',
 'call',
 'chance',
 'later']

In [14]:
# Define stopwords to exclude
stop = set(stopwords.words('english'))
stop.update(("to", "cc", "subject", "http", "from", "sent", "ect", "u", "fwd", "www", "com", 'html'))

# Define punctuations to exclude and lemmatizer
exclude = set(string.punctuation)


In [15]:
# Import the lemmatizer from nltk
lemma = WordNetLemmatizer()

def clean(text, stop):
    text = str(text).rstrip()
    stop_free = " ".join([i for i in text.lower().split() if((i not in stop) and (not i.isdigit()))])
    punc_free = ''.join(i for i in stop_free if i not in exclude)
    normalized = " ".join(lemma.lemmatize(i) for i in punc_free.split())      
    return normalized

In [16]:
# Clean the emails in df and print results
text_clean=[]
for text in df['clean_content']:
    text_clean.append(clean(text, stop).split())    

In [17]:
text_clean[0][:10]

['investools',
 'advisory',
 'free',
 'digest',
 'trusted',
 'investment',
 'advice',
 'unsubscribe',
 'free',
 'newsletter']

Topic modeling on fraud --- using LDA model

In [18]:


# Define the dictionary
dictionary = corpora.Dictionary(text_clean)

# Define the corpus 
corpus = [dictionary.doc2bow(text) for text in text_clean]


corpus = [dictionary.doc2bow(text) for text in text_clean]

In [19]:
# Define the LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=5)

# Save the topics and top 5 words
topics = ldamodel.print_topics(num_words=5)

# Print the results
for topic in topics:
    print(topic)

(0, '0.059*"enron" + 0.021*"hou" + 0.016*"pm" + 0.012*"message" + 0.009*"thanks"')
(1, '0.024*"image" + 0.021*"td" + 0.017*"net" + 0.015*"money" + 0.014*"tr"')
(2, '0.008*"enron" + 0.005*"market" + 0.005*"please" + 0.005*"would" + 0.004*"trading"')
(3, '0.609*"nan" + 0.000*"mediatrip" + 0.000*"flakin" + 0.000*"telkajdslkajfdlkasjdflkjsd" + 0.000*"yuour"')
(4, '0.053*"enron" + 0.030*"employee" + 0.026*"fund" + 0.024*"company" + 0.021*"stock"')


In [20]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
lda_display = gensimvis.prepare(ldamodel, corpus, dictionary, sort_topics=False)

  from collections import Iterable
  from collections import Mapping


In [21]:
pyLDAvis.display(lda_display)

In [22]:
def get_topic_details(ldamodel, corpus):
    topic_details_df = pd.DataFrame()
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_details_df = topic_details_df.append(pd.Series([topic_num, prop_topic]), ignore_index=True)
    topic_details_df.columns = ['Dominant_Topic', '% Score']
    return topic_details_df


contents = pd.DataFrame({'Original text':text_clean})
topic_details = pd.concat([get_topic_details(ldamodel,
                           corpus), contents], axis=1)
topic_details.head()


Unnamed: 0,Dominant_Topic,% Score,Original text
0,2.0,0.714711,"[investools, advisory, free, digest, trusted, ..."
1,2.0,0.666654,"[forwarded, richard, b, sander, hou, pm, justi..."
2,0.0,0.74148,"[hey, wearing, target, purple, shirt, today, m..."
3,4.0,0.993516,"[leslie, milosevich, santa, clara, avenue, ala..."
4,4.0,0.993409,"[rini, twait, e, th, ave, longmont, co, rtwait..."


In [23]:
def get_topic_details(ldamodel, corpus):
    topic_details_df = pd.DataFrame()
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_details_df = topic_details_df.append(pd.Series([topic_num, prop_topic]), ignore_index=True)
    topic_details_df.columns = ['Dominant_Topic', '% Score']
    return topic_details_df

In [24]:
# Run get_topic_details function and check the results
topic_details_df = get_topic_details(ldamodel, corpus)

In [25]:
topic_details_df.head()

Unnamed: 0,Dominant_Topic,% Score
0,2.0,0.71471
1,2.0,0.666653
2,0.0,0.741479
3,4.0,0.993515
4,4.0,0.993408


In [26]:
topic_details_df.tail()


Unnamed: 0,Dominant_Topic,% Score
14221,2.0,0.657855
14222,0.0,0.599531
14223,1.0,0.90745
14224,2.0,0.994074
14225,0.0,0.556969


In [27]:
# Add original text to topic details in a dataframe
contents = pd.DataFrame({'Original text': text_clean})
topic_details = pd.concat([get_topic_details(ldamodel, corpus), contents], axis=1)

In [28]:
topic_details.sort_values(by=['% Score'], ascending=False).head(10).head()

Unnamed: 0,Dominant_Topic,% Score,Original text
14217,1.0,0.999631,"[unsubscribe, mailing, please, go, money, net,..."
11963,2.0,0.998482,"[december, notice, nymex, division, member, me..."
14064,0.0,0.998481,"[uh, yeah, whole, bunch, single, one, squishy,..."
10649,2.0,0.997923,"[intercontinentalexchange, firm, physical, nat..."
10372,0.0,0.997704,"[problem, best, jeff, glen, ha, enron, enronxg..."


In [29]:
topic_details.sort_values(by=['% Score'], ascending=False).head(10).tail()

Unnamed: 0,Dominant_Topic,% Score,Original text
13728,0.0,0.997668,"[please, check, master, section, change, s, fo..."
13430,2.0,0.997599,"[getting, ready, launch, netcoonline, preparin..."
13572,2.0,0.997587,"[original, message, owner, nyiso, tech, exchan..."
12024,0.0,0.997512,"[fyi, vince, forwarded, vince, j, kaminski, ho..."
13357,2.0,0.997484,"[original, message, shortridge, pat, wednesday..."


In [30]:
# Create flag for text highest associated with topic 3
topic_details['flag'] = np.where((topic_details['Dominant_Topic'] == 3.0), 1, 0)
topic_details_1 = topic_details[topic_details.flag == 1]
topic_details_1.sort_values(by=['% Score'], ascending=False).head(10)

Unnamed: 0,Dominant_Topic,% Score,Original text,flag
107,3.0,0.6,[nan],1
8543,3.0,0.6,[nan],1
8534,3.0,0.6,[nan],1
8535,3.0,0.6,[nan],1
8536,3.0,0.6,[nan],1
8537,3.0,0.6,[nan],1
8538,3.0,0.6,[nan],1
8539,3.0,0.6,[nan],1
8540,3.0,0.6,[nan],1
8541,3.0,0.6,[nan],1
