In [1]:
import random
import operator
import pandas as pd
from collections import Counter

Vendor:  Continuum Analytics, Inc.
Package: mkl
Message: trial mode expires in 30 days


In [2]:
path_to_data = '/Users/patrickbordes/Desktop/MVA/ALTEGRAD/projet/';

##########################
# load some of the files #                           
##########################

training = pd.read_csv(path_to_data + 'training_set.csv', sep=',', header=0)

training_info = pd.read_csv(path_to_data + 'training_info.csv', sep=',', header=0)

test = pd.read_csv(path_to_data + 'test_set.csv', sep=',', header=0)

In [3]:
################################
# create some handy structures #                    
################################

# convert training set to dictionary
emails_ids_per_sender = {}
for index, series in training.iterrows():
    row = series.tolist()
    sender = row[0]
    ids = row[1:][0].split(' ')
    emails_ids_per_sender[sender] = ids

# save all unique sender names
all_senders = emails_ids_per_sender.keys()

In [4]:
emails_ids_per_sender

{'alan.aronowitz@enron.com': ['183093',
  '31100',
  '31101',
  '31102',
  '31103',
  '31104',
  '31105',
  '31106',
  '31107',
  '17901',
  '323367',
  '12415',
  '12416',
  '12419',
  '12391',
  '12491',
  '12387',
  '17762',
  '12348',
  '106179',
  '369332',
  '12072',
  '184580',
  '186525',
  '12046',
  '12026',
  '12015',
  '11993',
  '11997',
  '11990',
  '11978',
  '21175',
  '11909',
  '21119',
  '11868',
  '21084',
  '184766',
  '184780',
  '11618',
  '11578',
  '11589',
  '20901',
  '20909',
  '110686',
  '11516',
  '11518',
  '11521',
  '20862',
  '20863',
  '20866',
  '11496',
  '20848',
  '110519',
  '110486',
  '11425',
  '11426',
  '11427',
  '20793',
  '20794',
  '20795',
  '11416',
  '11418',
  '20785',
  '20787',
  '110337',
  '110291',
  '110234',
  '110186',
  '11175',
  '11192',
  '11193',
  '20580',
  '20595',
  '20596',
  '185158',
  '185161',
  '110012',
  '109943',
  '174595',
  '185320',
  '185322',
  '185323',
  '185410',
  '185416',
  '11056',
  '20488',
 

In [5]:
all_senders

dict_keys(['mike.grigsby@enron.com', 'w..cantrell@enron.com', 'michael.tribolet@enron.com', 'christina.valdez@enron.com', 'mark.mcconnell@enron.com', 'hunter.s.shively@enron.com', 'susan.scott@enron.com', 'andy.zipper@enron.com', 'phillip.platter@enron.com', 'darrell.schoolcraft@enron.com', 'mike.carson@enron.com', 'richard.shapiro@enron.com', 'jim.schwieger@enron.com', 'peter.keohane@enron.com', 'mike.maggi@enron.com', 'alan.comnes@enron.com', 'monika.causholli@enron.com', 'vkaminski@aol.com', 'rick.buy@enron.com', 'jonathan.mckay@enron.com', 'amr.ibrahim@enron.com', 'scott.neal@enron.com', 'm..schmidt@enron.com', 'sandra.f.brawner@enron.com', 'larry.f.campbell@enron.com', 'rahil.jafry@enron.com', 'paul.d.thomas@enron.com', 'joannie.williamson@enron.com', 'lorna.brennan@enron.com', 'david.port@enron.com', 'brad.mckay@enron.com', 'jason.williams@enron.com', 'keegan.farrell@enron.com', 'dutch.quigley@enron.com', 'greg.piper@enron.com', 'grace.rodriguez@enron.com', 'karen.denne@enron.com

In [6]:
# create address book with frequency information for each user
address_books = {}
i = 0

for sender, ids in emails_ids_per_sender.items():
    recs_temp = []
    for my_id in ids:
        recipients = training_info[training_info['mid']==int(my_id)]['recipients'].tolist()
        recipients = recipients[0].split(' ')
        # keep only legitimate email addresses
        recipients = [rec for rec in recipients if '@' in rec]
        recs_temp.append(recipients)
    # flatten    
    recs_temp = [elt for sublist in recs_temp for elt in sublist]
    # compute recipient counts
    rec_occ = dict(Counter(recs_temp))
    # order by frequency
    sorted_rec_occ = sorted(rec_occ.items(), key=operator.itemgetter(1), reverse = True)
    # save
    address_books[sender] = sorted_rec_occ
    
    if i % 10 == 0:
        print(i);
    i += 1

# save all unique recipient names    
all_recs = list(set([elt[0] for sublist in address_books.values() for elt in sublist]))

# save all unique user names 
all_users = []
all_users.extend(all_senders)
all_users.extend(all_recs)
all_users = list(set(all_users))

#############
# baselines #                           
#############

# will contain email ids, predictions for random baseline, and predictions for frequency baseline
predictions_per_sender = {}

# number of recipients to predict
k = 10

for index, row in test.iterrows():
    name_ids = row.tolist()
    sender = name_ids[0]
    # get IDs of the emails for which recipient prediction is needed
    ids_predict = name_ids[1].split(' ')
    ids_predict = [int(my_id) for my_id in ids_predict]
    random_preds = []
    freq_preds = []
    # select k most frequent recipients for the user
    k_most = [elt[0] for elt in address_books[sender][:k]]
    for id_predict in ids_predict:
        # select k users at random
        random_preds.append(random.sample(all_users, k))
        # for the frequency baseline, the predictions are always the same
        freq_preds.append(k_most)
    predictions_per_sender[sender] = [ids_predict,random_preds,freq_preds]	

0
10
20
30
40
50
60
70
80
90
100
110
120


In [7]:
ids_predict

[79366,
 93440,
 392058,
 79469,
 391878,
 27394,
 79299,
 79487,
 50379,
 50386,
 31924,
 31928,
 394431,
 394230,
 382224,
 48500,
 27375,
 50334,
 79227,
 394198]

In [8]:
predictions_per_sender

{'alan.aronowitz@enron.com': [[235059,
   171916,
   177414,
   326026,
   176903,
   117898,
   172208,
   176882,
   37361,
   178340,
   176866,
   101059,
   176935,
   176655,
   176889,
   176877,
   377985,
   326070,
   117786,
   172288],
  [['rose.m.prevott@williams.com',
    'adam.turner@enron.com',
    'davef@abag.ca.gov',
    'jackie.young@enron.com',
    'faith.reid@enron.com',
    'ray.alvarez@enron.com',
    'christina.finelli@enron.com',
    'scarey@isda.org',
    'jackie.gentle@enron.com',
    'jay.hawthorn@enron.com'],
   ['ews.report@enron.com',
    'mlozano@enron.com',
    'mark.greenberg@enron.com',
    'carolyn.perry@enron.com',
    'hudgins1@worldnet.att.net',
    'cheryl.kuehl@enron.com',
    'dana.gibbs@enron.com',
    'rmwongozi@hotmail.com',
    'kristen.oland@enform.com',
    'loretta.brelsford@enron.com'],
   ['brandi.wachtendorf@enron.com',
    'cferguson@pcm.net',
    'pkdaigle@neosoft.com',
    'tim.johanson@enron.com',
    'mac.mcclelland@enron.com',
 

In [9]:
all_recs

['chrisr@synergyog.com',
 'emmacaplan@bootsnall.net',
 'reyna.cabrera@enron.com',
 'ann.vaughn@enron.com',
 'ajafry@hotmail.com',
 'jhinson@austinrr.com',
 'm..landwehr@enron.com',
 'robin@risk.co.uk',
 'tim.f.riordan@conoco.com',
 'richard.jordan@swgas.com',
 'tregtremont@dwt.com',
 'howard.carter@enron.com',
 'pgg4@pge.com',
 'jwelder@kleberg.com',
 'bhansen@lhom.com',
 'amcmullen@canscot.com',
 'lwolf@csc.com',
 'soblander@carrfut.com',
 'jeffrey.hodge@enron.com',
 'jeremy.mcfaddin@enron.com',
 'kelli.stevens@hotmail.com',
 'pam.benson@enron.com',
 'kevin.delafield@enron.com',
 'sophie.patel@enron.com',
 'john.zurita@enron.com',
 'tara.sweitzer@enron.com',
 'chris_prejean@perrygas.com',
 'kelly.carrington@enron.com',
 'jean.ryall@enron.com',
 'john.norden@enron.com',
 'phil.clifford@enron.com',
 'chouda@nmh.org',
 'todd.perry@enron.com',
 'bkanatzar@mail.utexas.edu',
 'mary.coombe@enron.com',
 'mac@ifeminists.com',
 'sunita.cooke@nhmccd.edu',
 'bafjjfjefmbfzef@cs.com',
 'mary.rucci@

In [10]:
address_books

{'alan.aronowitz@enron.com': [('mark.e.taylor@enron.com', 81),
  ('sara.shackleton@enron.com', 40),
  ('e..haedicke@enron.com', 32),
  ('john.viverito@enron.com', 23),
  ('tana.jones@enron.com', 22),
  ('jane.mcbride@enron.com', 19),
  ('david.minns@enron.com', 14),
  ('harry.collins@enron.com', 13),
  ('richard.b.sanders@enron.com', 11),
  ('matthias.lee@enron.com', 11),
  ('marcus.nettelton@enron.com', 11),
  ('jeffrey.hodge@enron.com', 10),
  ('wayne.gresham@enron.com', 10),
  ('justin.boyd@enron.com', 10),
  ('daniel.rogers@enron.com', 10),
  ('taylor@enron.com', 10),
  ('nita.garcia@enron.com', 9),
  ('jeffrey.a.shankman@enron.com', 9),
  ('larry.lawyer@enron.com', 9),
  ('mike.mcconnell@enron.com', 9),
  ('sheila.glover@enron.com', 8),
  ('travis.mccullough@enron.com', 8),
  ('janette.elbertson@enron.com', 8),
  ('nony.flores@enron.com', 8),
  ('greg.whalley@enron.com', 7),
  ('brent.hendry@enron.com', 7),
  ('laurie.mayer@enron.com', 7),
  ('john.suttle@enron.com', 7),
  ('edmun

In [11]:
all_recs

['chrisr@synergyog.com',
 'emmacaplan@bootsnall.net',
 'reyna.cabrera@enron.com',
 'ann.vaughn@enron.com',
 'ajafry@hotmail.com',
 'jhinson@austinrr.com',
 'm..landwehr@enron.com',
 'robin@risk.co.uk',
 'tim.f.riordan@conoco.com',
 'richard.jordan@swgas.com',
 'tregtremont@dwt.com',
 'howard.carter@enron.com',
 'pgg4@pge.com',
 'jwelder@kleberg.com',
 'bhansen@lhom.com',
 'amcmullen@canscot.com',
 'lwolf@csc.com',
 'soblander@carrfut.com',
 'jeffrey.hodge@enron.com',
 'jeremy.mcfaddin@enron.com',
 'kelli.stevens@hotmail.com',
 'pam.benson@enron.com',
 'kevin.delafield@enron.com',
 'sophie.patel@enron.com',
 'john.zurita@enron.com',
 'tara.sweitzer@enron.com',
 'chris_prejean@perrygas.com',
 'kelly.carrington@enron.com',
 'jean.ryall@enron.com',
 'john.norden@enron.com',
 'phil.clifford@enron.com',
 'chouda@nmh.org',
 'todd.perry@enron.com',
 'bkanatzar@mail.utexas.edu',
 'mary.coombe@enron.com',
 'mac@ifeminists.com',
 'sunita.cooke@nhmccd.edu',
 'bafjjfjefmbfzef@cs.com',
 'mary.rucci@

In [12]:
all_users

['chrisr@synergyog.com',
 'emmacaplan@bootsnall.net',
 'reyna.cabrera@enron.com',
 'ann.vaughn@enron.com',
 'ajafry@hotmail.com',
 'jhinson@austinrr.com',
 'm..landwehr@enron.com',
 'robin@risk.co.uk',
 'tim.f.riordan@conoco.com',
 'richard.jordan@swgas.com',
 'tregtremont@dwt.com',
 'howard.carter@enron.com',
 'pgg4@pge.com',
 'jwelder@kleberg.com',
 'bhansen@lhom.com',
 'amcmullen@canscot.com',
 'lwolf@csc.com',
 'soblander@carrfut.com',
 'jeffrey.hodge@enron.com',
 'jeremy.mcfaddin@enron.com',
 'kelli.stevens@hotmail.com',
 'pam.benson@enron.com',
 'kevin.delafield@enron.com',
 'sophie.patel@enron.com',
 'john.zurita@enron.com',
 'tara.sweitzer@enron.com',
 'chris_prejean@perrygas.com',
 'kelly.carrington@enron.com',
 'jean.ryall@enron.com',
 'john.norden@enron.com',
 'phil.clifford@enron.com',
 'chouda@nmh.org',
 'todd.perry@enron.com',
 'bkanatzar@mail.utexas.edu',
 'mary.coombe@enron.com',
 'mac@ifeminists.com',
 'sunita.cooke@nhmccd.edu',
 'bafjjfjefmbfzef@cs.com',
 'mary.rucci@

In [13]:
#################################################
# write predictions in proper format for Kaggle #
#################################################

path_to_results = '/Users/patrickbordes/Desktop/MVA/ALTEGRAD/projet/';

with open(path_to_results + 'predictions_random.txt', 'w') as my_file:
    my_file.write('mid,recipients' + '\n')
    for sender, preds in predictions_per_sender.items():
        ids = preds[0]
        random_preds = preds[1]
        for index, my_preds in enumerate(random_preds):
            my_file.write(str(ids[index]) + ',' + ' '.join(my_preds) + '\n')

with open(path_to_results + 'predictions_frequency.txt', 'w') as my_file:
    my_file.write('mid,recipients' + '\n')
    for sender, preds in predictions_per_sender.items():
        ids = preds[0]
        freq_preds = preds[2]
        for index, my_preds in enumerate(freq_preds):
            my_file.write(str(ids[index]) + ',' + ' '.join(my_preds) + '\n')