In [1]:
# Loading required libraries and initializing
from langdetect import DetectorFactory
from pandas import Series, DataFrame
from string import punctuation
from nltk.tokenize import word_tokenize

In [2]:
# Loading custom defined functions
from tokenization import tokenize_sentence_nltk, tokenize_treetagger
from util import read_file, flatten_list_of_list, read_folder#, clean_sentences
from util import pick_first_language, is_english_wp_p, spell_correct_tokens
from util import detect_language, postprocess_sentences, get_redundaunt_info
from util import filter_data, filter_senders, filter_recipients, search_patterns
from pos_tagging import run_treetagger_pos_tag_text
from modeling import apply_bigram_trigram_model, run_aff_prop_with_distances, run_kmeans
from nltk.stem import WordNetLemmatizer
from modeling import run_word2vec_model, run_lda_topic_model, build_logistic_regression
from visualizing import visualize_word2vec_model
from util import get_semantic_similarity, get_character_similarity
from lemmatization import lemmatize_treetagger
from util import run_treetagger, join_tokens, parse_date, is_spelled_correctly, parse_date_fast, process_from_for_date
from json import load
from numpy import isnan

In [3]:
DetectorFactory.seed = 0

in_file = load(open("in_file.cfg", 'r'))
patterns_file = in_file["patterns_file"]
file_folder = in_file["file_folder"]
label = in_file["label"]
column = in_file["column"]
in_type = in_file["in_type"]
in_file = in_file["in_file"]
if file_folder == "file":
    strings = read_file(in_file, in_type = in_type)
    if in_type == "text":
        strings = tokenize_sentence_nltk(strings)
        strings = DataFrame(strings)[0]
    elif in_type == "html_chat":
        timestamp = strings[2]
        meta_data = strings[1]
        strings = strings[0]
        strings[label] = meta_data["Comment"]
        labels = strings[label]
        strings = strings[col]
    else:
        if label in strings.columns:
            labels = strings[label]
        strings = strings[col]
else:
    print(in_type)
    strings = read_folder(folder = in_file, in_type = in_type)

patterns = Series([".*" + x + ".*" for x in open(patterns_file, 'r').readlines()])

# For html_chat:
# strings -> DataFrame; meta_data -> Series; timestamp -> Series

# For csv, html_email, enron_email:
# strings -> DataFrame

# For folder:
# Sames as file

# html_email and enron_email will have columns 'conversation' (string) and 'meta_data' (dictionary)

# html_chat will have columns "itemId", "messageType", "messageDirection", "case", "captureDate", "policyAction", "statusMarkDate", "status", "status_reviewer", "commentDate", "comment", "comment_reviewer", "participants", "timestamp", "language", "sender", "recipients", "subject", "conversation", "num_of_conversation_turns" and "messages"

enron_email
Enron/maildir/allen-p/all_documents/1
Enron/maildir/allen-p/all_documents/10
Enron/maildir/allen-p/all_documents/100
Enron/maildir/allen-p/all_documents/101
Enron/maildir/allen-p/all_documents/102
Enron/maildir/allen-p/all_documents/103
Enron/maildir/allen-p/all_documents/104
Enron/maildir/allen-p/all_documents/105
Enron/maildir/allen-p/all_documents/106
Enron/maildir/allen-p/all_documents/107
Enron/maildir/allen-p/all_documents/108
Enron/maildir/allen-p/all_documents/109
Enron/maildir/allen-p/all_documents/11
Enron/maildir/allen-p/all_documents/110
Enron/maildir/allen-p/all_documents/111
Enron/maildir/allen-p/all_documents/112
Enron/maildir/allen-p/all_documents/113
Enron/maildir/allen-p/all_documents/114
Enron/maildir/allen-p/all_documents/115
Enron/maildir/allen-p/all_documents/116
Enron/maildir/allen-p/all_documents/117
Enron/maildir/allen-p/all_documents/118
Enron/maildir/allen-p/all_documents/119
Enron/maildir/allen-p/all_documents/120
Enron/maildir/allen-p/all_docume

Enron/maildir/allen-p/all_documents/345
Enron/maildir/allen-p/all_documents/346
Enron/maildir/allen-p/all_documents/347
Enron/maildir/allen-p/all_documents/348
Enron/maildir/allen-p/all_documents/349
Enron/maildir/allen-p/all_documents/35
Enron/maildir/allen-p/all_documents/350
Enron/maildir/allen-p/all_documents/351
Enron/maildir/allen-p/all_documents/352
Enron/maildir/allen-p/all_documents/353
Enron/maildir/allen-p/all_documents/354
Enron/maildir/allen-p/all_documents/355
Enron/maildir/allen-p/all_documents/356
Enron/maildir/allen-p/all_documents/357
Enron/maildir/allen-p/all_documents/358
Enron/maildir/allen-p/all_documents/359
Enron/maildir/allen-p/all_documents/36
Enron/maildir/allen-p/all_documents/360
Enron/maildir/allen-p/all_documents/361
Enron/maildir/allen-p/all_documents/362
Enron/maildir/allen-p/all_documents/363
Enron/maildir/allen-p/all_documents/364
Enron/maildir/allen-p/all_documents/365
Enron/maildir/allen-p/all_documents/366
Enron/maildir/allen-p/all_documents/367
En

Enron/maildir/allen-p/all_documents/596
Enron/maildir/allen-p/all_documents/597
Enron/maildir/allen-p/all_documents/598
Enron/maildir/allen-p/all_documents/599
Enron/maildir/allen-p/all_documents/6
Enron/maildir/allen-p/all_documents/60
Enron/maildir/allen-p/all_documents/600
Enron/maildir/allen-p/all_documents/601
Enron/maildir/allen-p/all_documents/602
Enron/maildir/allen-p/all_documents/603
Enron/maildir/allen-p/all_documents/604
Enron/maildir/allen-p/all_documents/605
Enron/maildir/allen-p/all_documents/606
Enron/maildir/allen-p/all_documents/607
Enron/maildir/allen-p/all_documents/608
Enron/maildir/allen-p/all_documents/609
Enron/maildir/allen-p/all_documents/61
Enron/maildir/allen-p/all_documents/610
Enron/maildir/allen-p/all_documents/611
Enron/maildir/allen-p/all_documents/612
Enron/maildir/allen-p/all_documents/613
Enron/maildir/allen-p/all_documents/614
Enron/maildir/allen-p/all_documents/615
Enron/maildir/allen-p/all_documents/616
Enron/maildir/allen-p/all_documents/617
Enro

Enron/maildir/allen-p/deleted_items/364
Enron/maildir/allen-p/deleted_items/365
Enron/maildir/allen-p/deleted_items/366
Enron/maildir/allen-p/deleted_items/368
Enron/maildir/allen-p/deleted_items/369
Enron/maildir/allen-p/deleted_items/37
Enron/maildir/allen-p/deleted_items/370
Enron/maildir/allen-p/deleted_items/371
Enron/maildir/allen-p/deleted_items/372
Enron/maildir/allen-p/deleted_items/373
Enron/maildir/allen-p/deleted_items/374
Enron/maildir/allen-p/deleted_items/375
Enron/maildir/allen-p/deleted_items/376
Enron/maildir/allen-p/deleted_items/377
Enron/maildir/allen-p/deleted_items/379
Enron/maildir/allen-p/deleted_items/38
Enron/maildir/allen-p/deleted_items/380
Enron/maildir/allen-p/deleted_items/381
Enron/maildir/allen-p/deleted_items/382
Enron/maildir/allen-p/deleted_items/383
Enron/maildir/allen-p/deleted_items/384
Enron/maildir/allen-p/deleted_items/385
Enron/maildir/allen-p/deleted_items/386
Enron/maildir/allen-p/deleted_items/387
Enron/maildir/allen-p/deleted_items/388
En

Enron/maildir/allen-p/discussion_threads/164
Enron/maildir/allen-p/discussion_threads/165
Enron/maildir/allen-p/discussion_threads/166
Enron/maildir/allen-p/discussion_threads/167
Enron/maildir/allen-p/discussion_threads/168
Enron/maildir/allen-p/discussion_threads/169
Enron/maildir/allen-p/discussion_threads/17
Enron/maildir/allen-p/discussion_threads/170
Enron/maildir/allen-p/discussion_threads/171
Enron/maildir/allen-p/discussion_threads/172
Enron/maildir/allen-p/discussion_threads/173
Enron/maildir/allen-p/discussion_threads/174
Enron/maildir/allen-p/discussion_threads/175
Enron/maildir/allen-p/discussion_threads/176
Enron/maildir/allen-p/discussion_threads/177
Enron/maildir/allen-p/discussion_threads/178
Enron/maildir/allen-p/discussion_threads/179
Enron/maildir/allen-p/discussion_threads/18
Enron/maildir/allen-p/discussion_threads/180
Enron/maildir/allen-p/discussion_threads/181
Enron/maildir/allen-p/discussion_threads/182
Enron/maildir/allen-p/discussion_threads/183
Enron/maildi

Enron/maildir/allen-p/discussion_threads/531
Enron/maildir/allen-p/discussion_threads/532
Enron/maildir/allen-p/discussion_threads/533
Enron/maildir/allen-p/discussion_threads/534
Enron/maildir/allen-p/discussion_threads/535
Enron/maildir/allen-p/discussion_threads/536
Enron/maildir/allen-p/discussion_threads/537
Enron/maildir/allen-p/discussion_threads/538
Enron/maildir/allen-p/discussion_threads/539
Enron/maildir/allen-p/discussion_threads/54
Enron/maildir/allen-p/discussion_threads/540
Enron/maildir/allen-p/discussion_threads/541
Enron/maildir/allen-p/discussion_threads/542
Enron/maildir/allen-p/discussion_threads/543
Enron/maildir/allen-p/discussion_threads/544
Enron/maildir/allen-p/discussion_threads/545
Enron/maildir/allen-p/discussion_threads/546
Enron/maildir/allen-p/discussion_threads/547
Enron/maildir/allen-p/discussion_threads/548
Enron/maildir/allen-p/discussion_threads/549
Enron/maildir/allen-p/discussion_threads/55
Enron/maildir/allen-p/discussion_threads/550
Enron/maildi

Enron/maildir/allen-p/sent/133
Enron/maildir/allen-p/sent/134
Enron/maildir/allen-p/sent/135
Enron/maildir/allen-p/sent/136
Enron/maildir/allen-p/sent/137
Enron/maildir/allen-p/sent/138
Enron/maildir/allen-p/sent/139
Enron/maildir/allen-p/sent/14
Enron/maildir/allen-p/sent/140
Enron/maildir/allen-p/sent/141
Enron/maildir/allen-p/sent/142
Enron/maildir/allen-p/sent/143
Enron/maildir/allen-p/sent/144
Enron/maildir/allen-p/sent/145
Enron/maildir/allen-p/sent/146
Enron/maildir/allen-p/sent/147
Enron/maildir/allen-p/sent/148
Enron/maildir/allen-p/sent/149
Enron/maildir/allen-p/sent/15
Enron/maildir/allen-p/sent/150
Enron/maildir/allen-p/sent/151
Enron/maildir/allen-p/sent/152
Enron/maildir/allen-p/sent/153
Enron/maildir/allen-p/sent/154
Enron/maildir/allen-p/sent/155
Enron/maildir/allen-p/sent/156
Enron/maildir/allen-p/sent/157
Enron/maildir/allen-p/sent/158
Enron/maildir/allen-p/sent/159
Enron/maildir/allen-p/sent/16
Enron/maildir/allen-p/sent/160
Enron/maildir/allen-p/sent/161
Enron/maild

Enron/maildir/allen-p/sent/394
Enron/maildir/allen-p/sent/395
Enron/maildir/allen-p/sent/396
Enron/maildir/allen-p/sent/397
Enron/maildir/allen-p/sent/398
Enron/maildir/allen-p/sent/399
Enron/maildir/allen-p/sent/4
Enron/maildir/allen-p/sent/40
Enron/maildir/allen-p/sent/400
Enron/maildir/allen-p/sent/401
Enron/maildir/allen-p/sent/402
Enron/maildir/allen-p/sent/403
Enron/maildir/allen-p/sent/404
Enron/maildir/allen-p/sent/405
Enron/maildir/allen-p/sent/406
Enron/maildir/allen-p/sent/407
Enron/maildir/allen-p/sent/408
Enron/maildir/allen-p/sent/409
Enron/maildir/allen-p/sent/41
Enron/maildir/allen-p/sent/410
Enron/maildir/allen-p/sent/411
Enron/maildir/allen-p/sent/412
Enron/maildir/allen-p/sent/413
Enron/maildir/allen-p/sent/414
Enron/maildir/allen-p/sent/415
Enron/maildir/allen-p/sent/416
Enron/maildir/allen-p/sent/417
Enron/maildir/allen-p/sent/418
Enron/maildir/allen-p/sent/419
Enron/maildir/allen-p/sent/42
Enron/maildir/allen-p/sent/420
Enron/maildir/allen-p/sent/421
Enron/maildir

Enron/maildir/allen-p/sent_items/21
Enron/maildir/allen-p/sent_items/210
Enron/maildir/allen-p/sent_items/211
Enron/maildir/allen-p/sent_items/212
Enron/maildir/allen-p/sent_items/213
Enron/maildir/allen-p/sent_items/214
Enron/maildir/allen-p/sent_items/215
Enron/maildir/allen-p/sent_items/216
Enron/maildir/allen-p/sent_items/217
Enron/maildir/allen-p/sent_items/218
Enron/maildir/allen-p/sent_items/219
Enron/maildir/allen-p/sent_items/22
Enron/maildir/allen-p/sent_items/220
Enron/maildir/allen-p/sent_items/221
Enron/maildir/allen-p/sent_items/222
Enron/maildir/allen-p/sent_items/223
Enron/maildir/allen-p/sent_items/224
Enron/maildir/allen-p/sent_items/225
Enron/maildir/allen-p/sent_items/226
Enron/maildir/allen-p/sent_items/227
Enron/maildir/allen-p/sent_items/228
Enron/maildir/allen-p/sent_items/229
Enron/maildir/allen-p/sent_items/23
Enron/maildir/allen-p/sent_items/230
Enron/maildir/allen-p/sent_items/231
Enron/maildir/allen-p/sent_items/232
Enron/maildir/allen-p/sent_items/233
Enro

Enron/maildir/allen-p/_sent_mail/1
Enron/maildir/allen-p/_sent_mail/10
Enron/maildir/allen-p/_sent_mail/100
Enron/maildir/allen-p/_sent_mail/1000
Enron/maildir/allen-p/_sent_mail/1001
Enron/maildir/allen-p/_sent_mail/1002
Enron/maildir/allen-p/_sent_mail/1003
Enron/maildir/allen-p/_sent_mail/1004
Enron/maildir/allen-p/_sent_mail/101
Enron/maildir/allen-p/_sent_mail/102
Enron/maildir/allen-p/_sent_mail/103
Enron/maildir/allen-p/_sent_mail/104
Enron/maildir/allen-p/_sent_mail/105
Enron/maildir/allen-p/_sent_mail/106
Enron/maildir/allen-p/_sent_mail/107
Enron/maildir/allen-p/_sent_mail/108
Enron/maildir/allen-p/_sent_mail/109
Enron/maildir/allen-p/_sent_mail/11
Enron/maildir/allen-p/_sent_mail/110
Enron/maildir/allen-p/_sent_mail/111
Enron/maildir/allen-p/_sent_mail/112
Enron/maildir/allen-p/_sent_mail/113
Enron/maildir/allen-p/_sent_mail/114
Enron/maildir/allen-p/_sent_mail/115
Enron/maildir/allen-p/_sent_mail/116
Enron/maildir/allen-p/_sent_mail/117
Enron/maildir/allen-p/_sent_mail/118


Enron/maildir/allen-p/_sent_mail/360
Enron/maildir/allen-p/_sent_mail/361
Enron/maildir/allen-p/_sent_mail/362
Enron/maildir/allen-p/_sent_mail/363
Enron/maildir/allen-p/_sent_mail/364
Enron/maildir/allen-p/_sent_mail/366
Enron/maildir/allen-p/_sent_mail/368
Enron/maildir/allen-p/_sent_mail/369
Enron/maildir/allen-p/_sent_mail/37
Enron/maildir/allen-p/_sent_mail/370
Enron/maildir/allen-p/_sent_mail/371
Enron/maildir/allen-p/_sent_mail/372
Enron/maildir/allen-p/_sent_mail/373
Enron/maildir/allen-p/_sent_mail/375
Enron/maildir/allen-p/_sent_mail/376
Enron/maildir/allen-p/_sent_mail/377
Enron/maildir/allen-p/_sent_mail/378
Enron/maildir/allen-p/_sent_mail/379
Enron/maildir/allen-p/_sent_mail/38
Enron/maildir/allen-p/_sent_mail/380
Enron/maildir/allen-p/_sent_mail/381
Enron/maildir/allen-p/_sent_mail/382
Enron/maildir/allen-p/_sent_mail/384
Enron/maildir/allen-p/_sent_mail/386
Enron/maildir/allen-p/_sent_mail/387
Enron/maildir/allen-p/_sent_mail/388
Enron/maildir/allen-p/_sent_mail/389
Enr

Enron/maildir/allen-p/_sent_mail/83
Enron/maildir/allen-p/_sent_mail/84
Enron/maildir/allen-p/_sent_mail/85
Enron/maildir/allen-p/_sent_mail/86
Enron/maildir/allen-p/_sent_mail/87
Enron/maildir/allen-p/_sent_mail/88
Enron/maildir/allen-p/_sent_mail/89
Enron/maildir/allen-p/_sent_mail/9
Enron/maildir/allen-p/_sent_mail/90
Enron/maildir/allen-p/_sent_mail/91
Enron/maildir/allen-p/_sent_mail/92
Enron/maildir/allen-p/_sent_mail/93
Enron/maildir/allen-p/_sent_mail/94
Enron/maildir/allen-p/_sent_mail/95
Enron/maildir/allen-p/_sent_mail/96
Enron/maildir/allen-p/_sent_mail/97
Enron/maildir/allen-p/_sent_mail/98
Enron/maildir/allen-p/_sent_mail/99


In [4]:
strings.shape

(3494, 2)

In [5]:
def na_date_filling_possible(x):
    try:
        if type(x) == float:
            return " on " in x['from'] and isnan(x['date'])
        else:
            return " on " in x['from'] and x['date'] is None
    except:
        return False

if in_type == "html_email" or in_type == "enron_email":
    conv = strings["conversation"].apply(tokenize_sentence_nltk)
    strings = strings["meta_data"].apply(Series)
    strings["conversation"] = conv
    strings["sent"] = strings["sent"].apply(parse_date)
    strings["date"] = strings["date"].apply(parse_date) # This is time consuming function, but it is robust
    if in_type == "enron_email":
        condition = strings['date'].apply(lambda x: x is None)
        strings["date"][condition] = strings["sent"][condition].tolist()

condition = strings["date"].apply(lambda x: x is None)
print(condition.sum())

# For html_email, enron_email ->
# 1) tokenize the strings to create list of sentences
# 2) convert meta data from dictionary to columns
# 3) parse "Sent" and "Date" columns from string to datetime.datetime (1900-01-01 00:00:00 is equivalent to NULL)

0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  matches = (new_other == np.array(other))


In [9]:
condition = strings[['from', 'date']].apply(lambda x: na_date_filling_possible(x), axis=1)
if condition.sum() > 0:
    df = DataFrame(strings['from'][condition].apply(lambda x: process_from_for_date(x)).tolist())
    df.columns = ["from", "date"]
    strings['from'][condition] = df['from'].tolist()
    strings['date'][condition] = df['date'].tolist()

In [10]:
def is_nan(x):
    try:
        return isnan(x)
    except:
        return False

strings['date'][strings['date'].apply(lambda x: is_nan(x))] = None
condition = strings['date'].apply(lambda x: x is None)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [11]:
others = strings[strings['date'].apply(lambda x: x is None)]
strings = strings[strings['date'].apply(lambda x: x is not None)]

In [12]:
strings.head()

Unnamed: 0,cc,contenttransferencoding,contenttype,date,from,messageid,mimeversion,sent,sentby,subject,to,xbcc,xcc,xfilename,xfolder,xfrom,xorigin,xto,conversation
0,,7bit,text/plain; charset=us-ascii,2000-12-13 18:41:00,1.11913372.-2@multexinvestornetwork.com,<29790972.1075855665306.JavaMail.evans@thyme>,1.0,NaT,,"December 14, 2000 - Bear Stearns' predictions ...",pallen@enron.com,,,pallen.nsf,Phillip_Allen_Dec2000\Notes Folders\All documents,Multex Investor <1.11913372.-2@multexinvestorn...,Allen-P,<pallen@enron.com>,[In today's Daily Update you'll find free repo...
1,,quoted-printable,text/plain; charset=ANSI_X3.4-1968,2000-12-13 08:35:00,messenger@ecm.bloomberg.com,<21975671.1075855665520.JavaMail.evans@thyme>,1.0,NaT,,Bloomberg Power Lines Report,,,,pallen.nsf,Phillip_Allen_Dec2000\Notes Folders\All documents,"""Bloomberg.com"" <messenger@ecm.bloomberg.com>",Allen-P,(undisclosed-recipients),[Here is today's copy of Bloomberg Power Lines...
2,,7bit,text/plain; charset=us-ascii,2000-10-09 07:16:00,phillip.allen@enron.com,<7452188.1075855667684.JavaMail.evans@thyme>,1.0,NaT,,Consolidated positions: Issues & To Do list,keith.holst@enron.com,,,pallen.nsf,Phillip_Allen_Dec2000\Notes Folders\All documents,Phillip K Allen,Allen-P,Keith Holst,[---------------------- Forwarded by Phillip K...
3,,7bit,text/plain; charset=us-ascii,2000-10-09 07:00:00,phillip.allen@enron.com,<23790115.1075855667708.JavaMail.evans@thyme>,1.0,NaT,,Consolidated positions: Issues & To Do list,keith.holst@enron.com,,,pallen.nsf,Phillip_Allen_Dec2000\Notes Folders\All documents,Phillip K Allen,Allen-P,Keith Holst,[---------------------- Forwarded by Phillip K...
4,,7bit,text/plain; charset=us-ascii,2000-10-05 06:26:00,phillip.allen@enron.com,<5860470.1075855667730.JavaMail.evans@thyme>,1.0,NaT,,,david.delainey@enron.com,,,pallen.nsf,Phillip_Allen_Dec2000\Notes Folders\All documents,Phillip K Allen,Allen-P,David W Delainey,"[Dave, ., Here are the names of the west desk ..."


In [13]:
others.shape

(0, 19)

In [14]:
if in_type == "html_chat":
    redundant = get_redundaunt_info(strings)
    total_redundant = redundant.sum()
    from re import findall
    unnecessary1 = strings['sender'].apply(lambda x: len(findall("^gg.*", x.lower())) > 0)
    unnecessary2 = strings['participants'].apply(lambda x: sum([len(findall("^gg.*|\;gg.*", y)) for y in x]) > 0)
    unnecessary = unnecessary1 | unnecessary2
    total_unnecessary = unnecessary.sum()
    # Deduplication
    strings = filter_senders(strings)
    strings = filter_recipients(strings)
    strings = filter_data(strings)
    strings1 = strings.drop(['messages'], axis = 1).reset_index(drop = True)
    strings = strings['messages'].reset_index(drop = True)
    strings = strings.apply(lambda x: " ".join(x))

# For html_chat ->
# Remove unnecessary chat history: gg.* is the pattern for unwanted senders/participants
# Remove duplicate chats
# strings1 is a DataFrame with all columns except 'messages'
# strings is a Series (string) which is used for downstream NLP

In [15]:
strings.head()

Unnamed: 0,cc,contenttransferencoding,contenttype,date,from,messageid,mimeversion,sent,sentby,subject,to,xbcc,xcc,xfilename,xfolder,xfrom,xorigin,xto,conversation
0,,7bit,text/plain; charset=us-ascii,2000-12-13 18:41:00,1.11913372.-2@multexinvestornetwork.com,<29790972.1075855665306.JavaMail.evans@thyme>,1.0,NaT,,"December 14, 2000 - Bear Stearns' predictions ...",pallen@enron.com,,,pallen.nsf,Phillip_Allen_Dec2000\Notes Folders\All documents,Multex Investor <1.11913372.-2@multexinvestorn...,Allen-P,<pallen@enron.com>,[In today's Daily Update you'll find free repo...
1,,quoted-printable,text/plain; charset=ANSI_X3.4-1968,2000-12-13 08:35:00,messenger@ecm.bloomberg.com,<21975671.1075855665520.JavaMail.evans@thyme>,1.0,NaT,,Bloomberg Power Lines Report,,,,pallen.nsf,Phillip_Allen_Dec2000\Notes Folders\All documents,"""Bloomberg.com"" <messenger@ecm.bloomberg.com>",Allen-P,(undisclosed-recipients),[Here is today's copy of Bloomberg Power Lines...
2,,7bit,text/plain; charset=us-ascii,2000-10-09 07:16:00,phillip.allen@enron.com,<7452188.1075855667684.JavaMail.evans@thyme>,1.0,NaT,,Consolidated positions: Issues & To Do list,keith.holst@enron.com,,,pallen.nsf,Phillip_Allen_Dec2000\Notes Folders\All documents,Phillip K Allen,Allen-P,Keith Holst,[---------------------- Forwarded by Phillip K...
3,,7bit,text/plain; charset=us-ascii,2000-10-09 07:00:00,phillip.allen@enron.com,<23790115.1075855667708.JavaMail.evans@thyme>,1.0,NaT,,Consolidated positions: Issues & To Do list,keith.holst@enron.com,,,pallen.nsf,Phillip_Allen_Dec2000\Notes Folders\All documents,Phillip K Allen,Allen-P,Keith Holst,[---------------------- Forwarded by Phillip K...
4,,7bit,text/plain; charset=us-ascii,2000-10-05 06:26:00,phillip.allen@enron.com,<5860470.1075855667730.JavaMail.evans@thyme>,1.0,NaT,,,david.delainey@enron.com,,,pallen.nsf,Phillip_Allen_Dec2000\Notes Folders\All documents,Phillip K Allen,Allen-P,David W Delainey,"[Dave, ., Here are the names of the west desk ..."


In [16]:
def get_Dt(x):
    try:
        return parse_date(str(x.year) + "/" + str(x.month) + "/" + str(x.day))
    except:
        return None

strings["Dt"] = strings["date"].apply(lambda x: get_Dt(x))

In [17]:
user_conversations = strings.groupby(["from", "Dt"]).apply(lambda x: [y for y in x["conversation"].tolist() if len(y)>0]).reset_index(drop = False)
user_conversations.columns = ["from", "Dt", "conversations"]
user_conversations.head()

Unnamed: 0,from,Dt,conversations
0,"""Darrell Jack"" <djack@keyad.com <mail",2001-11-29,"[[Hey Phillip, ., I have gone into travel plan..."
1,"""Darrell Jack"" <djack@keyad.com>@ENRON",2001-11-29,"[[Hey Phillip, ., I have gone into travel plan..."
2,"""Greg Thorse"" <gthorse@about-cis.com <mail",2001-12-10,"[[Phillip; ., These are just what I started wi..."
3,"""Greg Thorse"" <gthorse@keyad.com>@ENRON",2001-10-29,"[[Phillip, ., I need to get the contract for G..."
4,"""Greg Thorse"" <gthorse@keyad.com>@ENRON [mail",2001-06-19,"[[See Attachement ., - Phillip & Kieth Lender..."


In [22]:
user_conversations['conversations'].head(2)[0]

[['Hey Phillip, .',
  'I have gone into travel planning mode and wanted to invite both you and .',
  'Keith on a scuba expedition .',
  'Greg, our friend Larry Hudler and I are planning a trip to Fiji January 24th .',
  'to Feb. 2nd .',
  'All in, the trip should be about $2,000 .',
  'This includes .',
  'Airfare, Condo, Diving, Food, and Drink, and maybe a little more drink .',
  'Let me know if either of you can attend .',
  'Darrell .']]

In [23]:
user_conversations['conversations'].head(2)[1]

[['Hey Phillip, .',
  'I have gone into travel planning mode and wanted to invite both you and .',
  'Keith on a scuba expedition .',
  'Greg, our friend Larry Hudler and I are planning a trip to Fiji January 24th .',
  'to Feb. 2nd .',
  'All in, the trip should be about $2,000 .',
  'This includes .',
  'Airfare, Condo, Diving, Food, and Drink, and maybe a little more drink .',
  'Let me know if either of you can attend .',
  'Darrell .']]

In [None]:
try:
    strings = strings["conversation"]
except:
    pass

strings = strings.apply(postprocess_sentences)

# For html_email and enron_email -> pick conversation column for downstream NLP

In [None]:
from re import sub
strings = strings.apply(lambda string: sub(pattern = "[\ ]{2,}", repl = " ", string = string))
strings = strings.apply(lambda string: sub(pattern = "[\ ]{1,}[\.]", repl = " .", string = string))
strings = strings.apply(lambda string: sub(pattern = "[\ \.]*[\.]", repl = " .", string = string))

In [None]:
strings.head()

In [None]:
# Identifying keyword hits
hits = strings.apply(lambda x: search_patterns(x, patterns))
hits['any'] = hits.apply(sum, axis = 0)
hit_strings = strings[hits['any'] > 0]

# Pattern matching -> store in 'hits' if any of the string patterns match

In [None]:
# Language filter for downstream NLP
languages = strings.apply(detect_language)

# Picking the language with highest probability
first_language = languages.apply(pick_first_language)

# Keeping only English text
english_only = first_language.apply(is_english_wp_p)
strings = strings[english_only]
# labels = labels[english_only].tolist()

# Retain only English text in strings
# labels is for supervised learning (future use)

In [None]:
# Processing English sentences:
# 1) Tokenization
sentences = strings.apply(tokenize_treetagger)
lengths = sentences.apply(len)

# new_labels = []
# for i in range(len(lengths)):
#     for j in range(lengths[i]):
#         new_labels.append(labels[i])
# new_labels = Series(new_labels)
# sentences1 = flatten_list_of_list(sentences)

# sentences -> list of (list for tokens) -> outer list: one list per sentence

In [None]:
sentences = sentences[lengths > 0]

# lengths -> used to remove sentences that are not tokenized

In [None]:
# 2) Run part-of-speech tagging on clean sentences
sentences1 = sentences.apply(lambda x: join_tokens(x))
pos = sentences1.apply(run_treetagger_pos_tag_text).apply(DataFrame)

# sentences1 -> tokens joined back into single string with '. ' separating sentences
# pos -> DataFrame with part-of-speech of sentences1

In [None]:
pos_to_keep = {'JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NP', 'NPS', 'RB', 'RBR', 'RBS', 'RP', 'VV', 'VVD', 'VVG', 'VVN', 'VVP', 'VVZ'}
pos_new = pos.apply(lambda x: " ".join(x[x[1].apply(lambda y: y in pos_to_keep)][0]))
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer='word', min_df=1, stop_words='english', lowercase=True, token_pattern='[a-zA-Z0-9\-_]{2,}')
data_vectorized = vectorizer.fit_transform(pos_new)
from sklearn.decomposition import LatentDirichletAllocation
lda_model = LatentDirichletAllocation(n_topics=20, max_iter=10, learning_method='online', random_state=1, batch_size=16, evaluate_every=-1, n_jobs=-1)
lda_model.fit(data_vectorized)
from sklearn.model_selection import GridSearchCV
n_topics = [i+1 for i in range(9)] + [10, 15, 20, 25, 30, 40, 50, 75, 100]
search_params = {'n_components': n_topics, 'learning_decay': [.5, .7, .9]}
lda = LatentDirichletAllocation()
model = GridSearchCV(lda, param_grid=search_params)
model.fit(data_vectorized)
log_likelyhoods_5 = [round(gscore.mean_validation_score) for gscore in model.grid_scores_ if gscore.parameters['learning_decay']==0.5]
log_likelyhoods_7 = [round(gscore.mean_validation_score) for gscore in model.grid_scores_ if gscore.parameters['learning_decay']==0.7]
log_likelyhoods_9 = [round(gscore.mean_validation_score) for gscore in model.grid_scores_ if gscore.parameters['learning_decay']==0.9]
import matplotlib.pyplot as plt
plt.figure(figsize=(12, 8))
plt.plot(n_topics, log_likelyhoods_5, label='0.5')
plt.plot(n_topics, log_likelyhoods_7, label='0.7')
plt.plot(n_topics, log_likelyhoods_9, label='0.9')
plt.title("Choosing Optimal LDA Model")
plt.xlabel("Num Topics")
plt.ylabel("Log Likelyhood Scores")
plt.legend(title='Learning decay', loc='best')
plt.show()

In [None]:
n_topics = [i+1 for i in range(9)] + [i for i in [10, 15, 20, 25, 30, 40, 50, 75, 100] if i < strings.shape[0]]

In [None]:
plt.figure(figsize=(12, 8))
plt.plot(n_topics, log_likelyhoods_5, label='0.5')
plt.plot(n_topics, log_likelyhoods_7, label='0.7')
plt.plot(n_topics, log_likelyhoods_9, label='0.9')
plt.title("Choosing Optimal LDA Model")
plt.xlabel("Num Topics")
plt.ylabel("Log Likelyhood Scores")
plt.legend(title='Learning decay', loc='best')
plt.show()

In [None]:
best_lda_model = model.best_estimator_
lda_output = best_lda_model.transform(data_vectorized)
from pandas import DataFrame
from numpy import round, argmax
# topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_topics)]
df_document_topic = DataFrame(round(lda_output, 2))
dominant_topic = argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
import pyLDAvis
pyLDAvis.enable_notebook()
import pyLDAvis.sklearn
panel = pyLDAvis.sklearn.prepare(best_lda_model, data_vectorized, vectorizer, mds='tsne')
panel

In [None]:
# Topic-Keyword Matrix
df_topic_keywords = DataFrame(best_lda_model.components_)

# Assign Column and Index
df_topic_keywords.columns = vectorizer.get_feature_names()

# View
df_topic_keywords.head()

In [None]:
from numpy import array
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    keywords = array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=15)

In [None]:
df_topic_keywords = DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

In [None]:
# 3) Spell correct - currently correct only disjoint words
lengths = pos.apply(len)
inc_sentences = sentences1[lengths == 0]
# inc_labels = new_labels[lengths == 0]
pos = pos[lengths > 0]
# labels = new_labels[lengths > 0]
sentence_tokens = pos.apply(spell_correct_tokens)

# lengths -> used to remove sentences for which pos is not tagged
# sentences_tokens -> combine words in pos if: a) word and adjacent word are incorrectly spelled, b) combination of words is correctly spelled

In [None]:
# 4) Combine tokens to form bigrams and trigrams
# sentence_tokens = sentences.apply(tokenize_treetagger)
trigrams = apply_bigram_trigram_model(sentence_tokens)

# trigrams -> Series of lists: unigrams, bigram phrases (word1_word2) and trigram phrases (word1_word2_word3)

In [None]:
# 5) Form the sentence back from tokens
sentences1 = ["".join([" "+lemmatize_treetagger(run_treetagger(i.lower())) if not i.startswith("'") and i not in punctuation
                       else i for i in tokens]).strip() for tokens in trigrams]
# labels = labels.tolist() + inc_labels.tolist()

# Join the sentences back
# sentences1 -> list of strings

In [None]:
# 6) Write clean text to text file - one line per sentence
out_file = open("sample.txt", "w")
for sent in sentences1:
    out_file.write(sent.lower().replace("( ", "(").replace(" )", ")").replace("replaced-dns ", "").replace("replaced-dns", "")+"\n")

out_file.close()

# sentences1 written to text file "sample.txt" for word2vec and LDA models

In [None]:
# 7) Run word2vec model and store word representations
model = run_word2vec_model("sample.txt")
model.wv.save_word2vec_format("big.w2v")

# Build word2vec model on "sample.txt" and save model to "big.w2v"

In [None]:
# 8) Visualizing the word2vec model
visualize_word2vec_model(model)

# Visualize word2vec model in 2d using t-SNE

In [None]:
# 9) Setting up the data for building logistic regression model
#df = zeros((len(sentences1), 100))
#for i, words in enumerate(trigrams):
#    for word in words:
#        try:
#            df[i] = df[i] + model[word]
#        except:
#            continue

#while(i<len(sentences1)):
#    i += 1
#
#df = DataFrame(df)
#df[label] = labels

# This is for building supervised learning model using sentence embeddings as predictors

In [None]:
# 10) Building and saving the logistic regression model with L1 penalty
#lr_model = build_logistic_regression(df, label)
#dump(lr_model, open("logistic_model.pkl", 'wb'))

# Build and dump the model

In [None]:
# 11) Topic modeling (Optional)
#lda_model = run_lda_topic_model(text_file = "sample_cleaned.txt")
# Sample topic modeling output - Topic 1
#lda_model.print_topic(1)

In [None]:
sim1 = get_semantic_similarity(model)

# Get m x m matrix of semantic (cosine) similarity between words in vocabulary

In [None]:
vocab = list(model.wv.vocab)

# Get all m words in vocabulary

In [None]:
# ratio = get_character_similarity(vocab, 'ratio')

# Compute m x m character-based similarity metric between words in vocabulary

In [None]:
# partial_ratio = get_character_similarity(vocab, "partial_ratio")

In [None]:
# token_sort_ratio = get_character_similarity(vocab, "token_sort_ratio")

In [None]:
# token_set_ratio = get_character_similarity(vocab, "token_set_ratio")

In [None]:
# sim1 = sim1[ratio.columns]
# sim1 = sim1.loc[ratio.columns]

# sim1 is m x m matrix

In [None]:
# semantic_weight = 0.5
# ratio_weight = 0.4
# partial_ratio_weight = 0.4
# token_sort_ratio_weight = 0.1
# sim = semantic_weight*sim1 + (ratio_weight*ratio + partial_ratio_weight*partial_ratio + token_sort_ratio_weight*token_sort_ratio + (1-ratio_weight-partial_ratio_weight-token_sort_ratio_weight)*token_set_ratio)*(1-semantic_weight)
# dist = 1 - sim

# Currently heuristic weights are assigned for different similarities
# dist is m x m matrix

In [None]:
# aff = run_aff_prop_with_distances(dist)

# Run affinity propagation model on computed composite distance matrix and store in aff

In [None]:
# visualize_word2vec_model(model, aff.labels_)

# Visualize word2vec model with labels as per affinity propagation clusters

In [None]:
# columns = sim.columns
# condition = Series(sim.columns).apply(is_spelled_correctly)
# sim = sim.loc[columns[~condition]]
# sim = sim[columns[condition]]

# Identify mis-spelled words and find nearest words; words with '_' are also considered correct (this needs some thought)

In [None]:
# sim.apply(lambda x: x.sort_values().index[0], axis = 1)

In [None]:
df = DataFrame(model.wv[model.wv.vocab])
df.columns = ['c' + str(i) for i in range(df.shape[1])]
km = run_kmeans(model.wv[model.wv.vocab])

# Run kmeans model and store in km object

In [None]:
visualize_word2vec_model(model, km.cluster)

# Visualize word2vec with kmeans cluster labels

In [None]:
lda_model, corpus, dictionary = run_lda_topic_model(text_file = "sample.txt")

# Run gensim LDA model on "sample.txt" and return model (lda_model), corpus (corpus) and dictionary (dictionary)

In [None]:
lda_model.print_topics(-1)

# Prints all topics

In [None]:
from util import get_word_lda_topics
Series(list(lda_model.id2word.values())).apply(lambda x: get_word_lda_topics(lda_model, x))

# Supposed to get list of all topics for each word in vocab -> not working for some reason (returns None)

In [None]:
from gensim.models.ldamodel import LdaModel


In [None]:
from visualizing import visualize_lda_topics
visualize_lda_topics(lda_model, corpus, dictionary)

# Visualize LDA model using MDS

In [None]:
from sentiment_analysis import tag_sentiment_nltk
from util import get_sentiment_with_highest_score

In [None]:
Series(sentences1).apply(tag_sentiment_nltk).apply(get_sentiment_with_highest_score)

# Tag sentiments and return sentiment with highest probability

In [None]:
from sentiment_analysis import tag_sentiment_stanfordcorenlp
tag_sentiment_stanfordcorenlp(sentences1[0])

In [None]:
Series(sentences1).apply(tag_sentiment_stanfordcorenlp)

In [None]:
sentences2 = sentences.apply(lambda x: " ".join(x))
sentences2.apply(tag_sentiment_stanfordcorenlp)