## Neural Network Method

In [97]:
import random
import operator
import pandas as pd
import numpy as np
import nltk
from collections import Counter
from matplotlib import pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.corpus import names
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import math
import time
import sys
from scipy import sparse
import string
from normalization import normalize_corpus
from utils import build_feature_matrix
import scipy.sparse as sp
%matplotlib inline
import warnings
from scipy.sparse import coo_matrix
from scipy.sparse import csr_matrix
from scipy.sparse import csc_matrix
warnings.filterwarnings('ignore')
from time import time
from __future__ import division
from operator import le, lt, ge, gt
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from collections import defaultdict

#### Some useful functions

In [98]:
def log_process(cursor, finish_cursor, start_time = None):
    percentage = float(cursor + 1)/finish_cursor
    now_time = time()
    time_to_finish = ((now_time - start_time)/percentage) - (now_time - start_time)
    mn, sc = int(time_to_finish//60), int((time_to_finish/60 - time_to_finish//60)*60)
    if start_time:
        sys.stdout.write("\r%.2f%% ----- Temps restant estimé: %d min %d sec -----" %(100*percentage, mn, sc))
        sys.stdout.flush()
    else:
        sys.stdout.write("\r%.2f%%" %(100*percentage))
        sys.stdout.flush()

def get_true_recipient(mid):
    return training_info[training_info.mid == mid].recipients_valid.tolist()[0]
        
def simplify_date(date):
    return re.findall("[0-9]{4}-[0-9]{2}-[0-9]{2}", date)[0]

def is_email_valid(email):
    return True if re.match(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)", email) else False

def fix_mail_list(recipients):
    recipients = recipients.split(' ')
    return [email for email in recipients if is_email_valid(email)]

def apk(actual, predicted, k=10):
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def compute_corpus_term_idfs(corpus_features, norm_corpus):
    
    dfs = np.diff(sp.csc_matrix(corpus_features, copy=True).indptr)
    dfs = 1 + dfs # to smoothen idf later
    total_docs = 1 + len(norm_corpus)
    idfs = 1.0 + np.log(float(total_docs) / dfs)
    return idfs


def compute_bm25_similarity(doc_features, corpus_features,
                            corpus_doc_lengths, avg_doc_length,
                            term_idfs, k1=1.5, b=0.75, top_n=3):
    doc_features.data = np.ones(len(doc_features.data))
    
    doc_features = doc_features.tocsr()
    term_idfs_sparse = csr_matrix(term_idfs)
    doc_idfs = doc_features.multiply(term_idfs_sparse)   
        
    multiplicator = csr_matrix([k1 + 1])
    numerator_coeff = corpus_features.multiply(multiplicator)
    numerator = numerator_coeff.multiply(doc_idfs)
        
    denominator_coeff =  k1 * (1 - b + (b * (corpus_doc_lengths / avg_doc_length)))
    denominator_coeff = csr_matrix(np.vstack(denominator_coeff).T)
    
    doc_idfs_copy = doc_idfs[:]
    doc_idfs_copy.data = np.ones(len(doc_idfs_copy.data))
    doc_idfs_ones = csc_matrix(doc_idfs_copy)
        
    f_q_D = corpus_features.multiply(doc_idfs_ones)
    
    doc_idfs_ones = doc_idfs_ones.transpose()
    denominator_coeff = doc_idfs_ones.multiply(denominator_coeff).transpose()

    denominator = f_q_D + denominator_coeff
    denominator.data = 1./denominator.data
    
    divide = numerator.multiply(denominator)
    bm25_scores = np.array(divide.sum(axis=1).flatten())[0]
            
    top_docs = bm25_scores.argsort()[::-1][:top_n]
    top_docs_with_score = [(index, round(bm25_scores[index], 3))
                            for index in top_docs]
    
    return top_docs_with_score

def get_names(mail_adress):
    match = re.findall(r'([a-z]+)\.([a-z]+)@[a-z]+\.[a-z]+', mail_adress)
    if match:
        return match[0]
    return None

def is_name_in_mail(mail, names):
    if not names:
        return -1
    mail_intro = mail.split(' ')[:10]
    return 1 if (names[0] in mail or names[1] in mail_intro) else -1

#### To preprocess the body

In [99]:
from contractions import CONTRACTION_MAP
import re
import nltk
import string
from nltk.stem import WordNetLemmatizer
from HTMLParser import HTMLParser
import unicodedata

stopword_list = nltk.corpus.stopwords.words('english')
stopword_list = stopword_list + ['mr', 'mrs', 'come', 'go', 'get',
                                 'tell', 'listen', 'one', 'two', 'three',
                                 'four', 'five', 'six', 'seven', 'eight',
                                 'nine', 'zero', 'join', 'find', 'make',
                                 'say', 'ask', 'tell', 'see', 'try', 'back',
                                 'also']
wnl = WordNetLemmatizer()
html_parser = HTMLParser()

def tokenize_text(text):
    tokens = nltk.word_tokenize(text) 
    tokens = [token.strip() for token in tokens]
    return tokens

def expand_contractions(text, contraction_mapping):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text
    
    
from pattern.en import tag
from nltk.corpus import wordnet as wn

# Annotate text tokens with POS tags
def pos_tag_text(text):
    
    def penn_to_wn_tags(pos_tag):
        if pos_tag.startswith('J'):
            return wn.ADJ
        elif pos_tag.startswith('V'):
            return wn.VERB
        elif pos_tag.startswith('N'):
            return wn.NOUN
        elif pos_tag.startswith('R'):
            return wn.ADV
        else:
            return None
    
    tagged_text = tag(text)
    tagged_lower_text = [(word.lower(), penn_to_wn_tags(pos_tag))
                         for word, pos_tag in
                         tagged_text]
    return tagged_lower_text
    
# lemmatize text based on POS tags    
def lemmatize_text(text):
    
    pos_tagged_text = pos_tag_text(text)
    lemmatized_tokens = [wnl.lemmatize(word, pos_tag) if pos_tag
                         else word                     
                         for word, pos_tag in pos_tagged_text]
    lemmatized_text = ' '.join(lemmatized_tokens)
    return lemmatized_text
    

def remove_special_characters(text):
    tokens = tokenize_text(text)
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    filtered_tokens = filter(None, [pattern.sub(' ', token) for token in tokens])
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
    
    
def remove_stopwords(text):
    tokens = tokenize_text(text)
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

def keep_text_characters(text):
    filtered_tokens = []
    tokens = tokenize_text(text)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

def unescape_html(parser, text):
    
    return parser.unescape(text)

def normalize_corpus(corpus, lemmatize=True, 
                     only_text_chars=False,
                     tokenize=False):
    
    ## Log the process
    start_time = time()
    finish_cursor = len(corpus)

    normalized_corpus = []    
    for idx, text in enumerate(corpus):
        if idx % 100 == 0 or idx == finish_cursor-1:
            log_process(cursor=idx, finish_cursor=finish_cursor, start_time=start_time)
        text = html_parser.unescape(text)
        text = expand_contractions(text, CONTRACTION_MAP)
        if lemmatize:
            text = lemmatize_text(text)
        else:
            text = text.lower()
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        if only_text_chars:
            text = keep_text_characters(text)
        
        if tokenize:
            text = tokenize_text(text)
            normalized_corpus.append(text)
        else:
            normalized_corpus.append(text)
            
    return normalized_corpus


def parse_document(document):
    document = re.sub('\n', ' ', document)
    if isinstance(document, str):
        document = document
    elif isinstance(document, unicode):
        return unicodedata.normalize('NFKD', document).encode('ascii', 'ignore')
    else:
        raise ValueError('Document is not string or unicode!')
    document = document.strip()
    sentences = nltk.sent_tokenize(document)
    sentences = [sentence.strip() for sentence in sentences]
    return sentences

In [100]:
path_to_data = './'

In [101]:
##########################
# load some of the files #                           
##########################

training = pd.read_csv(path_to_data + 'training_set.csv', sep=',', header=0)
training_info = pd.read_csv(path_to_data + 'training_info.csv', sep=',', header=0)
test = pd.read_csv(path_to_data + 'test_set.csv', sep=',', header=0)
test_info = pd.read_csv(path_to_data + 'test_info.csv', sep=',', header=0)

In [102]:
training.head()

Unnamed: 0,sender,mids
0,karen.buckley@enron.com,158713 158697 200301 158679 278595 298162 2002...
1,amr.ibrahim@enron.com,215241 3437 215640 3506 191790 3517 3520 3562 ...
2,andrea.ring@enron.com,270705 270706 270707 270708 270709 270710 2707...
3,sylvia.hu@enron.com,111444 111422 183084 111412 111347 110883 1105...
4,phillip.platter@enron.com,327074 327384 327385 264443 274124 274125 2741...


In [103]:
training_info.head()

Unnamed: 0,mid,date,body,recipients
0,60,2000-07-25 08:14:00,Legal has been assessing the risks of doing bl...,robert.badeer@enron.com murray.o neil@enron.co...
1,66,2000-08-03 02:56:00,Attached is a spreadsheet to estimate export f...,kim.ward@enron.com robert.badeer@enron.com mur...
2,74,2000-08-15 05:37:00,Kevin/Bob: Here is a quick rundown on the cons...,robert.badeer@enron.com john.massey@enron.com ...
3,80,2000-08-20 14:12:00,check this out and let everyone know what s up...,robert.badeer@enron.com jeff.richter@enron.com
4,83,2000-08-22 08:17:00,Further to your letter to us (addressed to Mr....,pgillman@schiffhardin.com kamarlantes@calpx.co...


In [104]:
test.head()

Unnamed: 0,sender,mids
0,karen.buckley@enron.com,298389 332383 298390 284071 366982 81773 81791...
1,amr.ibrahim@enron.com,48260 48465 50344 48268 50330 48237 189979 189...
2,andrea.ring@enron.com,366364 271168 271172 271167 271189
3,sylvia.hu@enron.com,134931 134856 233549 233517 134895 233584 3736...
4,phillip.platter@enron.com,274220 274225 274215 274223 274214 274207 2742...


In [105]:
test_info.head()

Unnamed: 0,mid,date,body
0,1577,2001-11-19 06:59:51,Note: Stocks of heating oil are very high for...
1,1750,2002-03-05 08:46:57,"Kevin Hyatt and I are going for ""sghetti"" at S..."
2,1916,2002-02-13 14:17:39,This was forwarded to me and it is funny. - Wi...
3,2094,2002-01-22 11:33:56,I will be in to and happy to assist too. I ma...
4,2205,2002-01-11 07:12:19,Thanks. I needed a morning chuckle.


In [108]:
def get_messages_written_by_u_to_c(u=None, 
                                   c=None, 
                                   t_init=None, 
                                   t_final=None, 
                                   t_init_include=True,
                                   t_final_include=True,
                                   set_=training, 
                                   set_info=training_info):
    """
    This function makes it possible to retrieve all the ids of the messages sent BY u to c, where u and c are mail addresses (String)
    If u = None, then all messages sent to c, regardless of the sender
    If c = None, then we get all messages sent PAR u, whatever the receiver
    If u = None and c = None, we get all messages ...
    One can choose a time filter by imposing a minimum date (t_init) and a maximum date (t_final)
    You can optionally include or exclude these dates via t_init_include, t_final_include.
    Finally, we can work with simplified dates (YYYY-MM-DD) or complete (YYYY-MM-DD hh: mm: ss)
    Example:
    --------
    Get_messages_written_by_u_to_c (u='andrea.ring@enron.com ',
                                   C='amr.ibrahim@enron.com ',
                                   T_init = '1990-04-01',
                                   T_final = '2000-02-13 23:12:00')
    Returns all message identifiers sent by andrea.ring@enron.com to amr.ibrahim@enron.com between the
    1 April 1990 (included - by default) and 13 February 2000, 23h12min00sec (included - by default).    
    """
    try: 
        # mids of messages sent by u
        mids = list(map(int, set_[set_['sender'] == u]['mids'].values[0].split(' '))) if u else None
        # Filter messages by mid
        set_info_filtered = set_info[set_info['mid'].isin(mids)] if u else set_info
        # Filter messages by time (t_init)
        set_info_filtered = set_info_filtered[(ge if t_init_include else gt)(set_info_filtered['date_simplified' if len(t_init)==10 else 'date'], t_init)] if t_init else set_info_filtered
        # Filter messages by time (t_final)
        set_info_filtered = set_info_filtered[(le if t_final_include else lt)(set_info_filtered['date_simplified' if len(t_final)==10 else 'date'], t_final)] if t_final else set_info_filtered
        # Filter by recipient
        set_info_filtered = set_info_filtered[set_info_filtered['recipients_valid'].map(lambda recipients: c in recipients)] if c else set_info_filtered 
        # Return mids of relevant messages
        return set_info_filtered.mid.values
    except: # If an exeption is raised, it's because no message was found, in this case, we just return an empty array.
        return []

In [109]:
# Carnet d'adresse des senders pour les dates indiquées
def create_handbook(set_=training, set_info=training_info, t_final=None, t_init=None, t_init_include=True, t_final_include=False):
    all_senders = set_.sender.values
    d = {}
    for sender in all_senders:
        sent_by_sender = get_messages_written_by_u_to_c(u=sender, t_init=t_init, t_final=t_final, t_init_include=t_init_include, t_final_include=t_final_include)
        sent_to_sender = get_messages_written_by_u_to_c(c=sender, t_init=t_init, t_final=t_final, t_init_include=t_init_include, t_final_include=t_final_include)
        recipients = []
        # Emails de ceux qui ont reçu un mail de sender
        for rec_list in set_info[set_info.mid.isin(map(int, sent_by_sender))].recipients_valid:
            for rec in rec_list:
                recipients.append(rec)
        # Emails de ceux qui ont envoyé un mail à sender
        for mid in map(str, sent_to_sender):
            recipients.append(map_mid_to_sender[int(mid)])
        recipients = set(recipients)
        d[sender] = recipients
    return d

#### We add date_simplified and we correct date with bad format

In [110]:
training_info['date_simplified'] = training_info.date.map(simplify_date)
test_info['date_simplified'] = test_info.date.map(simplify_date)

In [111]:
training_info.date_simplified = training_info.date_simplified.map(lambda x: x if x[0] != '0' else '2' + x[1:])
training_info.date = training_info.date.map(lambda x: x if x[0] != '0' else '2' + x[1:])
test_info.date_simplified = test_info.date_simplified.map(lambda x: x if x[0] != '0' else '2' + x[1:])
test_info.date = test_info.date.map(lambda x: x if x[0] != '0' else '2' + x[1:])

#### We sort training_info and test_info by date

In [112]:
training_info.sort_values('date_simplified', inplace=True)
test_info.sort_values('date_simplified', inplace=True)

#### We correct email with bad format

In [113]:
training_info['recipients_valid'] = training_info.recipients.map(fix_mail_list)
training.sender = training.sender.map(fix_mail_list).map(lambda sender_list: sender_list[0])
test.sender = test.sender.map(fix_mail_list).map(lambda sender_list: sender_list[0])

In [114]:
recipients = []
for recipient_list in training_info['recipients_valid']:
    for recipient in recipient_list:
        recipients.append(recipient)
unique_recipients = set(recipients)

#### Some useful dictionaries

In [115]:
map_mid_to_date = {}
for row in training_info.iterrows():
    map_mid_to_date[row[1].mid] = row[1].date
    
for row in test_info.iterrows():
    map_mid_to_date[row[1].mid] = row[1].date

map_mid_to_sender = {}
for d in training.apply(lambda x: {int(mid): x.sender for mid in x.mids.split(' ')}, axis=1).values:
    map_mid_to_sender.update(d)
    
map_mid_to_receiver = {}
for row in training_info.iterrows():
    map_mid_to_receiver[row[1].mid] = row[1].recipients_valid

map_sender_to_mid = {}
for sender in training.sender:
    map_sender_to_mid[sender] = training[training.sender==sender].mids.values[0].split(' ')
    
map_sender_to_mid_w_date = defaultdict(list)
for sender in training.sender:
    for mid in map_sender_to_mid[sender]:
        map_sender_to_mid_w_date[sender].append((mid, map_mid_to_date[int(mid)]))
        
for recipient in map_sender_to_mid_w_date.keys():
    map_sender_to_mid_w_date[recipient] = sorted(map_sender_to_mid_w_date[recipient], key=lambda x: x[1])

In [116]:
NB_SPLIT = 5
mids_train_test = []
for i in range(NB_SPLIT):
    new_split = { 'train': [], 'test': [] }
    for recipient in map_sender_to_mid_w_date.keys():
        idx_lim_inf = (NB_SPLIT - i)*10
        
        idx_lim_sup = (i - (NB_SPLIT-1))*10 if ((NB_SPLIT-1) - i)*10 != 0 else None
        new_split['train'] += [x[0] for x in map_sender_to_mid_w_date[recipient][:-idx_lim_inf]]
        new_split['test'] += [x[0] for x in map_sender_to_mid_w_date[recipient][-idx_lim_inf:idx_lim_sup]]
    mids_train_test.append(new_split)

#### Body preprocessing

In [117]:
training_info['body_normalized'] = np.array(normalize_corpus(training_info.body, lemmatize=True))

In [118]:
test_info['body_normalized'] = np.array(normalize_corpus(test_info.body, lemmatize=True))

In [119]:
training_info.to_csv('./training_info_df.csv', encoding='utf-8')
test_info.to_csv('./test_info_df.csv', encoding='utf-8')

In [120]:
#training_info = pd.read_csv('./training_info_df.csv', encoding='utf-8')
#training_info.drop(['Unnamed: 0'], axis=1, inplace=True)
#training_info.mid = training_info.mid.map(lambda x: int(x))
#training_info.body_normalized = training_info.body_normalized.map(lambda x: '' if type(x)==float else x)
#training_info['recipients_valid'] = training_info.recipients.map(fix_mail_list)
#test_info = pd.read_csv('./test_info_df.csv', encoding='utf-8')
#test_info.drop(['Unnamed: 0'], axis=1, inplace=True)
#test_info.mid = test_info.mid.map(lambda x: int(x))
#test_info.body_normalized = test_info.body_normalized.map(lambda x: '' if type(x)==float else x)

In [121]:
test_info.head()

Unnamed: 0,mid,date,body,date_simplified,body_normalized
0,204747,2001-11-02 07:47:19,GWF ???,2001-11-02,gwf
1,82354,2001-11-02 06:17:44,WOW.... I am positive that your beautiful wife...,2001-11-02,wow positive beautiful wife sign haul rug rat ...
2,101099,2001-11-02 14:31:15,The following name overlay was completed in GC...,2001-11-02,following name overlay complete gcp today cp i...
3,160740,2001-11-02 10:44:50,"-----Original Message-----From: Legler, Micha...",2001-11-02,original message legler michael j send tuesday...
4,200363,2001-11-02 11:12:37,"-----Original Message-----From: \tVan houten,...",2001-11-02,original message van houten maria send friday ...


#### Pleins de trucs qui servent à calculer BM25 et construire les features textuels

In [122]:
tfidf_vectorizer, tfidf_features = build_feature_matrix(training_info['body_normalized'],
                                                        feature_type='tfidf',
                                                        ngram_range=(1, 1), 
                                                        min_df=3, max_df=1.0)
query_docs_tfidf = tfidf_vectorizer.transform(test_info['body_normalized'])
vectorizer, corpus_features = build_feature_matrix(training_info['body_normalized'],
                                                   feature_type='frequency')
query_docs_features = vectorizer.transform(test_info['body_normalized'])
doc_lengths = [len(doc.split()) for doc in training_info['body_normalized']]   
avg_dl = np.average(doc_lengths)
corpus_term_idfs = compute_corpus_term_idfs(corpus_features, training_info['body_normalized'])

#### Address book creation

In [123]:
handbook_train = create_handbook(set_=training, set_info=training_info)

#### Features temporels

In [124]:
def get_last_message_from_u_to_c(u, c, t):
    previous_mails = get_messages_written_by_u_to_c(u=u, c=c, t_final=t, t_final_include=False)
    return map_mid_to_date[previous_mails[-1]] if list(previous_mails) else None

In [125]:
def outgoing_message_percentage(u, c, t):
    try:
        return len(get_messages_written_by_u_to_c(u=u, c=c, t_init=None, t_final=t, t_final_include=False))/\
               len(get_messages_written_by_u_to_c(u=u, c=None, t_init=None, t_final=t, t_final_include=False))
    except:
        return 0

In [126]:
def incoming_message_percentage(u, c, t):
    try:
        return len(get_messages_written_by_u_to_c(u=c, c=u, t_init=None, t_final=t, t_final_include=False))/\
               len(get_messages_written_by_u_to_c(u=None, c=u, t_init=None, t_final=t, t_final_include=False))
    except:
        return 0

In [127]:
def more_recent_outgoing_percentage(u, c, t, alpha=2):
    t_init = get_last_message_from_u_to_c(u=u, c=c, t=t)
    try: 
        return (1./alpha)*len(get_messages_written_by_u_to_c(u=u, c=None, t_init=t_init, t_final=t, t_final_include=False))/\
                          len(get_messages_written_by_u_to_c(u=u, c=None, t_init=0, t_final=t, t_final_include=False))
    except:
        return 1

In [128]:
def more_recent_incoming_percentage(u, c, t, alpha=2):
    t_init = get_last_message_from_u_to_c(u=c, c=u, t=t)
    try:
        return (1./alpha)*len(get_messages_written_by_u_to_c(u=None, c=u, t_init=t_init, t_final=t, t_final_include=False))/\
                          len(get_messages_written_by_u_to_c(u=None, c=u, t_init=0, t_final=t, t_final_include=False))
    except:
        return 1

#### We look for most similar email body for each email in test and training set

In [129]:
similar_docs = []

# Log the process
start_time = time()
finish_cursor = len(test_info['body_normalized'])

for index, doc in enumerate(test_info['body_normalized']):
    
    doc_features = query_docs_features[index]
    top_similar_docs = compute_bm25_similarity(doc_features,
                                               corpus_features,
                                               doc_lengths,
                                               avg_dl,
                                               corpus_term_idfs,
                                               k1=1.5, b=0.75,
                                               top_n=30)
        
    log_process(cursor=index, finish_cursor=finish_cursor, start_time=start_time)

    similar_docs.append(top_similar_docs)
    
    if index % 10 == 0 or index == finish_cursor - 1:
        similarity = pd.DataFrame(similar_docs)
        similarity.to_csv('./similarity_test_indo.csv', sep='\t')

In [130]:
query_docs_tfidf_train = tfidf_vectorizer.transform(training_info['body_normalized'])
query_docs_features_train = vectorizer.transform(training_info['body_normalized'])

In [131]:
similar_docs_train = []
# Log the process
start_time = time()
finish_cursor = len(training_info['body_normalized'])
for index, doc in enumerate(training_info['body_normalized']):
    
    
    doc_features = query_docs_features_train[index]
    top_similar_docs = compute_bm25_similarity(doc_features,
                                               corpus_features,
                                               doc_lengths,
                                               avg_dl,
                                               corpus_term_idfs,
                                               k1=1.5, b=0.75,
                                               top_n=31)
        
    log_process(cursor=index, finish_cursor=finish_cursor, start_time=start_time)

    similar_docs_train.append(top_similar_docs)
    
    if index % 500 == 0 or index == finish_cursor - 1:
        similarity_train = pd.DataFrame(similar_docs_train)
        similarity_train.to_csv('./similarity_train_indo.csv', sep='\t')

In [132]:
similarity_train.drop([0], axis=1, inplace=True)

In [133]:
#similarity_train = pd.read_csv('./similarity_train_indo.csv', sep='\t')
#similarity_train.drop(['0', 'Unnamed: 0'], axis=1, inplace=True)
#similarity_train = similarity_train.applymap(lambda a: tuple(int(x) if idx==0 else float(x) for idx, x in enumerate(a[1:-1].split(', '))))
#similarity = pd.read_csv('./similarity_test_indo.csv', sep='\t')
#similarity.drop(['Unnamed: 0'], axis=1, inplace=True)
#similarity = similarity.applymap(lambda a: tuple(int(x) if idx==0 else float(x) for idx, x in enumerate(a[1:-1].split(', '))))

In [134]:
similarity_train.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,21,22,23,24,25,26,27,28,29,30
0,"(14009, 92.424)","(13663, 85.452)","(4883, 81.997)","(4157, 79.816)","(5544, 79.572)","(1859, 78.309)","(4882, 77.303)","(2364, 76.558)","(1272, 75.967)","(26562, 75.124)",...,"(13844, 69.783)","(3833, 69.748)","(5177, 69.363)","(11049, 69.116)","(9419, 68.837)","(10850, 68.252)","(13829, 68.236)","(3995, 68.149)","(9467, 67.809)","(4200, 67.661)"
1,"(20819, 95.252)","(11766, 95.238)","(11749, 95.238)","(33675, 95.235)","(24010, 94.556)","(24021, 94.556)","(29305, 92.853)","(29276, 92.853)","(35980, 91.968)","(35974, 91.968)",...,"(27220, 88.295)","(36537, 87.929)","(36664, 87.929)","(36583, 87.929)","(36498, 87.929)","(2544, 87.303)","(26415, 87.25)","(19550, 87.227)","(17111, 86.448)","(41039, 86.423)"
2,"(351, 128.683)","(4513, 126.78)","(17724, 125.077)","(17787, 125.077)","(615, 122.979)","(17522, 121.551)","(17500, 121.551)","(3872, 119.23)","(3875, 119.23)","(613, 116.276)",...,"(33380, 105.929)","(33457, 105.929)","(16835, 103.087)","(33393, 101.171)","(33442, 101.171)","(34651, 99.952)","(34643, 99.952)","(16206, 99.097)","(16289, 99.097)","(731, 98.961)"
3,"(4, 391.578)","(5689, 164.238)","(5676, 164.238)","(4189, 115.327)","(4161, 115.327)","(435, 102.455)","(85, 97.22)","(84, 97.22)","(18955, 89.227)","(28653, 87.428)",...,"(3087, 61.505)","(4958, 59.211)","(750, 55.763)","(753, 55.763)","(16289, 53.046)","(16206, 53.046)","(88, 52.441)","(89, 52.441)","(5230, 50.344)","(17500, 50.14)"
4,"(4, 391.578)","(5689, 164.238)","(5676, 164.238)","(4189, 115.327)","(4161, 115.327)","(435, 102.455)","(85, 97.22)","(84, 97.22)","(18955, 89.227)","(28653, 87.428)",...,"(3087, 61.505)","(4958, 59.211)","(750, 55.763)","(753, 55.763)","(16289, 53.046)","(16206, 53.046)","(88, 52.441)","(89, 52.441)","(5230, 50.344)","(17500, 50.14)"


In [135]:
map_mid_to_sim_train = {}
for index, mid in enumerate(training_info['mid']):
    map_mid_to_sim_train[mid] = filter(lambda x: x[1] > 0, similarity_train.loc[index].values[:30])

In [136]:
map_mid_to_sim = {}
for index, mid in enumerate(test_info['mid']):
    map_mid_to_sim[mid] = filter(lambda x: x[1] > 0, similarity.loc[index].values[:30])

#### Features textuels

In [188]:
def outgoing_textual_similarity(mid, c):
    most_sim_ids = map(lambda x: x[0], map_mid_to_sim[int(mid)])
    for idx_mail in most_sim_ids:
        most_sim_id = training_info.iloc[idx_mail].mid
        if c in map_mid_to_receiver[int(most_sim_id)]:
            return 1
    return -1

In [189]:
def incoming_textual_similarity(mid, c):
    most_sim_ids = map(lambda x: x[0], map_mid_to_sim[int(mid)])
    for idx_mail in most_sim_ids:
        most_sim_id = training_info.iloc[idx_mail].mid
        if c == map_mid_to_sender[int(most_sim_id)]:
            return 1
    return -1

In [140]:
def outgoing_textual_similarity_train(mid, c):
    most_sim_ids = map(lambda x: x[0], map_mid_to_sim_train[int(mid)])
    for idx_mail in most_sim_ids:
        most_sim_id = training_info.iloc[idx_mail].mid
        if c in map_mid_to_receiver[int(most_sim_id)]:
            return 1
    return -1

In [141]:
def incoming_textual_similarity_train(mid, c):
    most_sim_ids = map(lambda x: x[0], map_mid_to_sim_train[int(mid)])
    for idx_mail in most_sim_ids:
        most_sim_id = training_info.iloc[idx_mail].mid
        if c == map_mid_to_sender[int(most_sim_id)]:
            return 1
    return -1

In [142]:
def get_names(mail_address):
    match = re.findall(r'(?:([a-z]+)\.([a-z]+)|([a-z]+)\.\.([a-z]+)|([a-z]+))@[a-z]+\.[a-z]+', mail_address)
    if match:
        return [elmt.lower() for elmt in match[0] if elmt and len(elmt) > 2] 
    return None
    
def is_name_in_mail(mail, names):
    if not names:
        return -1
    return 1 if sum([name in mail for name in names]) else -1

def is_name_in_mail_short(mail, names):
    if not names:
        return -1
    return 1 if sum([name in mail[:30] for name in names]) else -1

In [46]:
counter_email_per_u = defaultdict(int)
counter_from_u_to_c = defaultdict(lambda: defaultdict(int))
for sender in training.sender:
    mids_sender = training[training.sender==sender].mids.values[0].split(' ')
    for mid in mids_sender:
        counter_email_per_u[sender] += 1
        for recipient in map_mid_to_receiver[int(mid)]:
            counter_from_u_to_c[sender][recipient] += 1

In [47]:
freq_from_u_to_c = defaultdict(lambda: defaultdict(float))
for sender in training.sender:
    for recipient in counter_from_u_to_c[sender].keys():
        freq_from_u_to_c[sender][recipient] = counter_from_u_to_c[sender][recipient]/counter_email_per_u[sender]

In [162]:
#train_df_w_features = pd.read_csv('./train___.csv', sep='\t')
#test_df_w_features  = pd.read_csv('./test___.csv', sep='\t')

In [163]:
#train_df_w_features = train_df_w_features.drop(['Unnamed: 0'], axis=1)
#test_df_w_features = test_df_w_features.drop(['Unnamed: 0'], axis=1)
train_df_w_features.mid = train_df_w_features.mid.map(lambda mid: int(mid))
test_df_w_features.mid = test_df_w_features.mid.map(lambda mid: int(mid))

In [164]:
train_df_w_features.head()

Unnamed: 0,outgoing_message_percentage,incoming_message_percentage,more_recent_outgoing_percentage,more_recent_incoming_percentage,outgoing_textual_similarity,incoming_textual_similarity,is_name_in_mail,is_name_in_mail_short_,recipient,mid,sender,result
0,0.0,0.0,1.0,0.5,1.0,-1.0,1.0,1.0,jason.wolfe@enron.com,158713,karen.buckley@enron.com,1.0
1,0.0,0.0,1.0,0.5,-1.0,-1.0,-1.0,-1.0,hicham.benjelloun@enron.com,158713,karen.buckley@enron.com,-1.0
2,0.0,0.0,0.5,0.5,1.0,-1.0,-1.0,-1.0,elizabeth.shim@enron.com,158697,karen.buckley@enron.com,1.0
3,0.0,0.0,0.5,0.5,-1.0,-1.0,-1.0,-1.0,elizabeth.johnston@enron.com,158697,karen.buckley@enron.com,-1.0
4,0.0,0.0,0.5,0.5,1.0,-1.0,-1.0,-1.0,russell.ballato@enron.com,158697,karen.buckley@enron.com,1.0


In [183]:
test_df_w_features.head()

Unnamed: 0,outgoing_message_percentage,incoming_message_percentage,more_recent_outgoing_percentage,more_recent_incoming_percentage,outgoing_textual_similarity,incoming_textual_similarity,is_name_in_mail,is_name_in_mail_short_,recipient,mid,sender
0,0.00641,0.0,0.471154,0.5,-1.0,-1.0,1.0,-1.0,karen.herrmann@enron.com,298389,karen.buckley@enron.com
1,0.00641,0.0,0.471154,0.5,-1.0,-1.0,-1.0,-1.0,jennifer.mcquade@enron.com,298389,karen.buckley@enron.com
2,0.00641,0.0,0.471154,0.5,-1.0,-1.0,1.0,1.0,john.hudson@enron.com,298389,karen.buckley@enron.com
3,0.00641,0.0,0.471154,0.5,-1.0,-1.0,-1.0,-1.0,reginald.smith@enron.com,298389,karen.buckley@enron.com
4,0.00641,0.0,0.471154,0.5,-1.0,-1.0,-1.0,-1.0,malley@enron.com,298389,karen.buckley@enron.com


#### Features matrix creation for training set

In [148]:
def generate_negative(recipient_list, sender):
    recipient_set = set(recipient_list)
    candidates_negative = handbook_train[sender].difference(recipient_set)
    return random.sample(candidates_negative, min(len(recipient_set), len(candidates_negative)))

def create_data_train(row, sender):
    out = []
    recipient_list = row.recipients_valid
    negative_recipients_list = generate_negative(recipient_list, sender)
    for idx, recipient in enumerate(recipient_list):
        # Positive
        outgoing_message_percentage_ = outgoing_message_percentage(u=sender, c=recipient, t=row.date)
        incoming_message_percentage_ = incoming_message_percentage(u=sender, c=recipient, t=row.date)
        more_recent_outgoing_percentage_ = more_recent_outgoing_percentage(u=sender, c=recipient, t=row.date)
        more_recent_incoming_percentage_ = more_recent_incoming_percentage(u=sender, c=recipient, t=row.date)
        outgoing_textual_similarity_ = outgoing_textual_similarity_train(c=recipient, mid=row.mid) 
        incoming_textual_similarity_ = incoming_textual_similarity_train(c=recipient, mid=row.mid) 
        is_name_in_mail_short_ = is_name_in_mail_short(row.body_normalized, get_names(recipient))
        
        result = 1
        
        out.append([
            outgoing_message_percentage_,
            incoming_message_percentage_,
            more_recent_outgoing_percentage_,
            more_recent_incoming_percentage_,
            outgoing_textual_similarity_,
            incoming_textual_similarity_,
            is_name_in_mail_short_,
            recipient,
            row.mid,
            sender,
            result
        ])
        if idx < len(negative_recipients_list):
            # Negative
            recipient = negative_recipients_list[idx]
            outgoing_message_percentage_ = outgoing_message_percentage(u=sender, c=recipient, t=row.date)
            incoming_message_percentage_ = incoming_message_percentage(u=sender, c=recipient, t=row.date)
            more_recent_outgoing_percentage_ = more_recent_outgoing_percentage(u=sender, c=recipient, t=row.date)
            more_recent_incoming_percentage_ = more_recent_incoming_percentage(u=sender, c=recipient, t=row.date)
            outgoing_textual_similarity_ = outgoing_textual_similarity_train(c=recipient, mid=row.mid) 
            incoming_textual_similarity_ = incoming_textual_similarity_train(c=recipient, mid=row.mid) 
            is_name_in_mail_short_ = is_name_in_mail_short(row.body_normalized, get_names(recipient))
            
            result = -1

            out.append([
                outgoing_message_percentage_,
                incoming_message_percentage_,
                more_recent_outgoing_percentage_,
                more_recent_incoming_percentage_,
                outgoing_textual_similarity_,
                incoming_textual_similarity_,
                is_name_in_mail_short_,
                recipient,
                row.mid,
                sender,
                result
            ])
            
    return out

In [149]:
train_df_w_features = pd.DataFrame([], columns=['outgoing_message_percentage', 'incoming_message_percentage', 'more_recent_outgoing_percentage', 'more_recent_incoming_percentage', 'outgoing_textual_similarity', 'incoming_textual_similarity', 'is_name_in_mail', 'recipient', 'mid', 'sender', 'result'])
for idx_sender in range(training.shape[0]):
    print("%d - On considère: %s" %(idx_sender, sender))
    sender = training.sender[idx_sender]
    print("On considère: %s" %sender)
    mids_sent = get_messages_written_by_u_to_c(u=sender, set_=training, set_info=training_info)
    res = []
    start_time = time()
    finish_cursor = training_info[training_info.mid.isin(mids_sent)].shape[0]
    for idx, row in enumerate(training_info[training_info.mid.isin(mids_sent)].iterrows()):
        res += create_data_train(row[1], sender)
        log_process(cursor=idx, finish_cursor=finish_cursor, start_time=start_time)
    train_df_w_features = pd.concat((train_df_w_features, pd.DataFrame(res, columns=['outgoing_message_percentage', 'incoming_message_percentage', 'more_recent_outgoing_percentage', 'more_recent_incoming_percentage', 'outgoing_textual_similarity', 'incoming_textual_similarity', 'is_name_in_mail', 'recipient', 'mid', 'sender', 'result']))) 
    train_df_w_features.to_csv('train_df_w_features_FINAL', sep='\t')
    print

0 - On considère: karen.buckley@enron.com
On considère: karen.buckley@enron.com
100.00% ----- Temps restant estimé: 0 min 0 sec -----
1 - On considère: karen.buckley@enron.com
On considère: amr.ibrahim@enron.com
100.00% ----- Temps restant estimé: 0 min 0 sec -----
2 - On considère: amr.ibrahim@enron.com
On considère: andrea.ring@enron.com
100.00% ----- Temps restant estimé: 0 min 0 sec -----
3 - On considère: andrea.ring@enron.com
On considère: sylvia.hu@enron.com
100.00% ----- Temps restant estimé: 0 min 0 sec -----
4 - On considère: sylvia.hu@enron.com
On considère: phillip.platter@enron.com
100.00% ----- Temps restant estimé: 0 min 0 sec -----
5 - On considère: phillip.platter@enron.com
On considère: richard.shapiro@enron.com
100.00% ----- Temps restant estimé: 0 min 0 sec -----
6 - On considère: richard.shapiro@enron.com
On considère: megan.parker@enron.com
100.00% ----- Temps restant estimé: 0 min 0 sec -----
7 - On considère: megan.parker@enron.com
On considère: david.forster@en

#### Features matrix creation for test set

In [237]:
def create_data_test(row, sender):
    out = []
    for recipient in handbook_train[sender]:
        outgoing_message_percentage_ = outgoing_message_percentage(u=sender, c=recipient, t=row.date)
        incoming_message_percentage_ = incoming_message_percentage(u=sender, c=recipient, t=row.date)
        more_recent_outgoing_percentage_ = more_recent_outgoing_percentage(u=sender, c=recipient, t=row.date)
        more_recent_incoming_percentage_ = more_recent_incoming_percentage(u=sender, c=recipient, t=row.date)
        outgoing_textual_similarity_ = outgoing_textual_similarity(c=recipient, mid=row.mid) 
        incoming_textual_similarity_ = incoming_textual_similarity(c=recipient, mid=row.mid) 
        is_name_in_mail_ = is_name_in_mail_short(row.body_normalized, get_names(recipient))
        
        out.append([
            outgoing_message_percentage_,
            incoming_message_percentage_,
            more_recent_outgoing_percentage_,
            more_recent_incoming_percentage_,
            outgoing_textual_similarity_,
            incoming_textual_similarity_,
            is_name_in_mail_,
            recipient,
            row.mid,
            sender
        ])

    return out

In [None]:
test_df_w_features = pd.DataFrame([], columns=['outgoing_message_percentage', 'incoming_message_percentage', 'more_recent_outgoing_percentage', 'more_recent_incoming_percentage', 'outgoing_message_percentage', 'incoming_message_percentage', 'more_recent_outgoing_percentage', 'more_recent_incoming_percentage', 'outgoing_textual_similarity', 'incoming_textual_similarity', 'is_name_in_mail', 'recipient', 'mid', 'sender'])
for idx_sender in range(test.shape[0]):
    sender = test.sender[idx_sender]
    print("%d - On considère: %s" %(idx_sender, sender))
    mids_sent = get_messages_written_by_u_to_c(u=sender, set_=test, set_info=test_info)
    res = []
    start_time = time()
    finish_cursor = test_info[test_info.mid.isin(mids_sent)].shape[0]
    for idx, row in enumerate(test_info[test_info.mid.isin(mids_sent)].iterrows()):
        res += create_data_test(row[1], sender)
        log_process(cursor=idx, finish_cursor=finish_cursor, start_time=start_time)
    test_df_w_features = pd.concat((test_df_w_features, pd.DataFrame(res, columns=['outgoing_message_percentage', 'incoming_message_percentage', 'more_recent_outgoing_percentage', 'more_recent_incoming_percentage', 'outgoing_message_percentage', 'incoming_message_percentage', 'more_recent_outgoing_percentage', 'more_recent_incoming_percentage', 'outgoing_textual_similarity', 'incoming_textual_similarity', 'is_name_in_mail', 'recipient', 'mid', 'sender']))) 
    test_df_w_features.to_csv('test_df_w_features_FINAL', sep='\t')
    print

#### We train the model

In [199]:
rf = MLPClassifier(hidden_layer_sizes=(100,), alpha=.0001)

data_train = pd.get_dummies(train_df_w_features, columns=['sender'])
data_test = pd.get_dummies(test_df_w_features, columns=['sender'])

X = data_train.drop(['recipient', 'mid', 'result'], axis=1).values
y = data_train[['result']].values.flatten()
info = data_train[['recipient', 'mid', 'result']].values

rf.fit(X, y)

X_test = data_test.drop(['recipient', 'mid'], axis=1).values
info_test = data_test[['recipient', 'mid']].values

best_recipients = {}
mids = set(map(int, info_test[:,1]))
conc = np.concatenate((info_test, rf.predict_proba(X_test)[:,1,np.newaxis]), axis=1)
conc_df = pd.DataFrame(conc, columns=['recipient', 'mid', 'proba'])
for mid in mids:
    best_recipients[mid] = conc_df[conc_df.mid==mid].sort_values('proba', ascending=False).recipient[:10].tolist()

In [200]:
path_to_results = './'
with open(path_to_results + 'predictions.txt', 'w') as my_file:
    my_file.write('mid,recipients' + '\n')
    for mid, my_preds in best_recipients.items():
        my_file.write(str(mid) + ',' + ' '.join(my_preds) + '\n')