In [1]:
import pandas as pd

In [2]:
# Locations of the six challenge CSV files, all relative to one data folder.
data_folder_path = "event-recommendation-engine-challenge"
event_attendees_file_name = f"{data_folder_path}/event_attendees.csv"
events_file_name = f"{data_folder_path}/events.csv"
test_file_name = f"{data_folder_path}/test.csv"
train_file_name = f"{data_folder_path}/train.csv"
users_file_name = f"{data_folder_path}/users.csv"
user_friends_file_name = f"{data_folder_path}/user_friends.csv"

In [3]:
# read data
# Only the first ROWS_TO_READ rows of every file are loaded, which keeps the
# exploration cells below fast. Set it to None to read the files in full.
ROWS_TO_READ = 1000

# Order matters: the loaded frames are unpacked positionally further down.
files_to_read = [
    event_attendees_file_name,
    events_file_name,
    test_file_name,
    train_file_name,
    users_file_name,
    user_friends_file_name,
]

print("Reading files ...")
try:
    loaded_frames = []
    for file_number, file_name in enumerate(files_to_read, start=1):
        print(file_name + " ... ", end = "")
        loaded_frames.append(pd.read_csv(file_name, nrows=ROWS_TO_READ))
        print("finished (" + str(file_number) + "/" + str(len(files_to_read)) + ")")
    event_attendees, events, test, train, users, user_friends = loaded_frames
    print("File loading completed!")
except FileNotFoundError as fnf_error:
    # Show which file is missing, then stop here: every later cell needs all
    # six frames, so continuing would only fail later with a NameError.
    print(fnf_error)
    raise

Reading files ...
event-recommendation-engine-challenge/event_attendees.csv ... finished (1/6)
event-recommendation-engine-challenge/events.csv ... finished (2/6)
event-recommendation-engine-challenge/test.csv ... finished (3/6)
event-recommendation-engine-challenge/train.csv ... finished (4/6)
event-recommendation-engine-challenge/users.csv ... finished (5/6)
event-recommendation-engine-challenge/user_friends.csv ... finished (6/6)
File loading completed!


In [4]:
# Print the (rows, columns) shape of every loaded frame, one per line.
print(
    *(frame.shape for frame in (event_attendees, events, test, train, users, user_friends)),
    sep="\n",
)

(1000, 5)
(1000, 110)
(1000, 4)
(1000, 6)
(1000, 7)
(1000, 2)


In [5]:
# Preview the first ten rows of event_attendees.
event_attendees.head(10)

Unnamed: 0,event,yes,maybe,invited,no
0,1159822043,1975964455 252302513 4226086795 3805886383 142...,2733420590 517546982 1350834692 532087573 5831...,1723091036 3795873583 4109144917 3560622906 31...,3575574655 1077296663
1,686467261,2394228942 2686116898 1056558062 3792942231 41...,1498184352 645689144 3770076778 331335845 4239...,1788073374 733302094 1830571649 676508092 7081...,
2,1186208412,,3320380166 3810793697,1379121209 440668682,1728988561 2950720854
3,2621578336,,,,
4,855842686,2406118796 3550897984 294255260 1125817077 109...,2671721559 1761448345 2356975806 2666669465 10...,1518670705 880919237 2326414227 2673818347 332...,3500235232
5,2018671985,,,,
6,488116622,4145960786 2550625355 2577667841 1575121941 28...,1227223575 2789471603 1323321680 3086272918 38...,1413359297 2300232602 1412759254 617751520 286...,1498160155 3708150269 823488244 3595018395 173...
7,1273761447,2680366192 2151335654 3447231284 3021641283 17...,94519567 2208454642 1749189642 2558652483 2983...,3828226993 3279641599 493535713 2091128996 298...,3215817616 101410505 3172228763 4161238910 181...
8,2688888297,298428624 2292079981 1819927116 1843127538 410...,3433056329 3166442492 1754661408 2966742619 32...,143382439 552645572 2872499486 1476024415 3890...,3433837562 492244978 784111553 3319042922
9,3870329460,,4229976635,101440046 3547967849 2482041922 662878699 2600...,1200610016 379229947 1357977256 725446989


In [6]:
# create event_attendees_dict & user_interests_dict
# "1" = yes, "2" = maybe, "3" = "invited", "4" = no
# event_attendees_dict = {event_id : {"1" : [], "2" : [], "3" : [], "4" : []}}
# user_interests_dict = {user_id : {status : [event_id, ...]}}
#   (only the statuses in which the user actually appears are present)
def create_event_attendees_dict_and_user_interests_dict(event_attendees):
    """Index event responses both by event and by user.

    Columns are read by position: column 0 is the event id and every
    following column holds a space-separated list of user ids. The column
    position, as a string, is used as the status key ("1", "2", ...).

    Parameters
    ----------
    event_attendees : pandas.DataFrame
        One row per event, laid out as described above. Missing cells mean
        that nobody gave that response.

    Returns
    -------
    list
        ``[event_attendees_dict, user_interests_dict]`` where

        * ``event_attendees_dict[event_id][status]`` is the list of user ids
          for that event and status (an empty list when the cell is missing);
        * ``user_interests_dict[user_id][status]`` is the list of event ids
          the user is listed under for that status. A status key exists only
          if the user appears under it at least once.

        All ids are strings.
    """
    event_attendees_dict = {}
    user_interests_dict = {}
    # Plain tuples give true positional access. Integer keys on the
    # label-indexed Series produced by iterrows() are deprecated as
    # positional lookups in recent pandas.
    for row in event_attendees.itertuples(index=False, name=None):
        event_id = str(row[0])
        event_dict = {}
        for position in range(1, len(row)):
            status = str(position)
            cell = row[position]
            if pd.isna(cell):
                event_dict[status] = []
                continue
            # split() without an argument ignores repeated/trailing spaces,
            # so no empty-string "user ids" are produced.
            user_id_list = cell.split()
            event_dict[status] = user_id_list
            for user_id in user_id_list:
                user_statuses = user_interests_dict.setdefault(user_id, {})
                user_statuses.setdefault(status, []).append(event_id)
        event_attendees_dict[event_id] = event_dict
    return [event_attendees_dict, user_interests_dict]

In [7]:
# Preview the first ten rows of train.
train.head(10)

Unnamed: 0,user,event,invited,timestamp,interested,not_interested
0,3044012,1918771225,0,2012-10-02 15:53:05.754000+00:00,0,0
1,3044012,1502284248,0,2012-10-02 15:53:05.754000+00:00,0,0
2,3044012,2529072432,0,2012-10-02 15:53:05.754000+00:00,1,0
3,3044012,3072478280,0,2012-10-02 15:53:05.754000+00:00,0,0
4,3044012,1390707377,0,2012-10-02 15:53:05.754000+00:00,0,0
5,3044012,1532377761,0,2012-10-02 15:53:05.754000+00:00,0,0
6,4236494,2352676247,0,2012-10-30 01:48:25.617000+00:00,0,0
7,4236494,152418051,0,2012-10-30 01:48:28.645000+00:00,1,0
8,4236494,4203627753,0,2012-10-30 01:49:14.152000+00:00,1,0
9,4236494,110357109,0,2012-10-30 01:48:25.617000+00:00,0,0


In [8]:
# train_dict = {user_id : {event_id : [invited, interested, not_interested]}}
def create_train_dict(train):
    """Index the training rows by user id and then by event id.

    Columns are read by position: 0 = user, 1 = event, 2 = invited,
    4 = interested, 5 = not_interested (position 3 is not used).

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows with at least six columns in the order above.

    Returns
    -------
    dict
        ``{user_id: {event_id: [invited, interested, not_interested]}}`` with
        string ids. When a (user, event) pair occurs more than once, the first
        occurrence is kept and a message is printed: "Duplicated data!" if the
        repeated values are identical, "Multiple data!" if they differ.
    """
    train_dict = {}
    # itertuples() yields each column's own values by position. iterrows()
    # would build one Series per row, which can upcast integer ids to float
    # in all-numeric frames and relies on deprecated integer-key lookups.
    for row in train.itertuples(index=False, name=None):
        user_id = str(row[0])
        event_id = str(row[1])
        response = [row[2], row[4], row[5]]
        user_events = train_dict.setdefault(user_id, {})
        if event_id not in user_events:
            user_events[event_id] = response
        elif user_events[event_id] == response:
            print("Duplicated data! for event_id, user_id: " + event_id + ", " + user_id)
        else:
            print("Multiple data! for event_id, user_id: " + event_id + ", " + user_id)
    return train_dict

In [9]:
# Preview the first ten rows of test.
test.head(10)

Unnamed: 0,user,event,invited,timestamp
0,1776192,2877501688,0,2012-11-30 11:39:01.230000+00:00
1,1776192,3025444328,0,2012-11-30 11:39:01.230000+00:00
2,1776192,4078218285,0,2012-11-30 11:39:01.230000+00:00
3,1776192,1024025121,0,2012-11-30 11:39:01.230000+00:00
4,1776192,2972428928,0,2012-11-30 11:39:21.985000+00:00
5,1776192,2514143386,0,2012-11-30 11:39:01.230000+00:00
6,1776192,1823369186,0,2012-11-30 11:39:01.230000+00:00
7,5161061,2027962693,0,2012-11-07 16:20:11.484000+00:00
8,5161061,1652007005,0,2012-11-07 16:20:35.683000+00:00
9,5161061,2169802745,0,2012-11-07 16:20:32.153000+00:00


In [10]:
# test_dict = {user_id : {event_id : invited}}
def create_test_dict(test):
    """Index the test rows by user id and then by event id.

    Columns are read by position: 0 = user, 1 = event, 2 = invited.

    Parameters
    ----------
    test : pandas.DataFrame
        Test rows with at least three columns in the order above.

    Returns
    -------
    dict
        ``{user_id: {event_id: invited}}`` with string ids. When a
        (user, event) pair occurs more than once, the first occurrence is kept
        and a message is printed: "Duplicated data!" if the repeated value is
        identical, "Multiple data!" if it differs.
    """
    test_dict = {}
    # itertuples() yields each column's own values by position. iterrows()
    # would build one Series per row, which can upcast integer ids to float
    # in all-numeric frames and relies on deprecated integer-key lookups.
    for row in test.itertuples(index=False, name=None):
        user_id = str(row[0])
        event_id = str(row[1])
        invited = row[2]
        user_events = test_dict.setdefault(user_id, {})
        if event_id not in user_events:
            user_events[event_id] = invited
        elif user_events[event_id] == invited:
            print("Duplicated data! for event_id, user_id: " + event_id + ", " + user_id)
        else:
            print("Multiple data! for event_id, user_id: " + event_id + ", " + user_id)
    return test_dict

In [11]:
# Preview the first ten rows of user_friends.
user_friends.head(10)

Unnamed: 0,user,friends
0,3197468391,1346449342 3873244116 4226080662 1222907620 54...
1,3537982273,1491560444 395798035 2036380346 899375619 3534...
2,823183725,1484954627 1950387873 1652977611 4185960823 42...
3,1872223848,83361640 723814682 557944478 1724049724 253059...
4,3429017717,4253303705 2130310957 1838389374 3928735761 71...
5,627175141,3462311094 868148671 3475458679 1822640148 183...
6,2752000443,665103859 798664587 3773945815 595192956 31571...
7,3473687777,1481860022 3611032568 3086070277 275090520 366...
8,2966052962,4266041793 2113918851 4152864981 1055028635 12...
9,264876277,1473289379 3127593523 487736094 2990183113 249...


In [12]:
# {user_id : [friend_id, ...]}
def create_friends_dict(user_friends):
    """Map each user id to the list of that user's friend ids.

    Columns are read by position: 0 = user, 1 = space-separated friend ids.

    Parameters
    ----------
    user_friends : pandas.DataFrame
        One row per user, laid out as described above.

    Returns
    -------
    dict
        ``{user_id: [friend_id, ...]}`` with string ids. Users whose friends
        cell is missing get no entry at all.
    """
    user_friends_dict = {}
    # Plain tuples give true positional access. Integer keys on the
    # label-indexed Series produced by iterrows() are deprecated as
    # positional lookups in recent pandas.
    for row in user_friends.itertuples(index=False, name=None):
        friends = row[1]
        if pd.isna(friends):
            continue
        # split() without an argument ignores repeated/trailing spaces,
        # so no empty-string "friend ids" are produced.
        user_friends_dict[str(row[0])] = friends.split()
    return user_friends_dict

In [13]:
# Preview the first ten rows of users.
users.head(10)

Unnamed: 0,user_id,locale,birthyear,gender,joinedAt,location,timezone
0,3197468391,id_ID,1993,male,2012-10-02T06:40:55.524Z,Medan Indonesia,480.0
1,3537982273,id_ID,1992,male,2012-09-29T18:03:12.111Z,Medan Indonesia,420.0
2,823183725,en_US,1975,male,2012-10-06T03:14:07.149Z,Stratford Ontario,-240.0
3,1872223848,en_US,1991,female,2012-11-04T08:59:43.783Z,Tehran Iran,210.0
4,3429017717,id_ID,1995,female,2012-09-10T16:06:53.132Z,,420.0
5,627175141,ka_GE,1973,female,2012-11-01T09:59:17.590Z,Tbilisi Georgia,240.0
6,2752000443,id_ID,1994,male,2012-10-03T05:22:17.637Z,Medan Indonesia,420.0
7,3473687777,id_ID,1965,female,2012-10-03T12:19:29.975Z,Medan Indonesia,420.0
8,2966052962,id_ID,1979,male,2012-10-31T10:11:57.668Z,Medan Indonesia,420.0
9,264876277,id_ID,1988,female,2012-10-02T07:28:09.555Z,Medan Indonesia,420.0


In [14]:
# {user_id : [birthyear, gender]}
def create_users_dict(users):
    """Map each user id to that user's birth year and gender.

    Columns are read by position: 0 = user id, 2 = birth year, 3 = gender
    (these are the ``user_id``, ``birthyear`` and ``gender`` columns in the
    frame previewed in this notebook).

    Parameters
    ----------
    users : pandas.DataFrame
        One row per user with at least four columns in the order above.

    Returns
    -------
    dict
        ``{user_id: [birthyear, gender]}`` with string user ids; the two
        values are stored exactly as they appear in the frame. If a user id
        occurs more than once, the last row wins.
    """
    user_dict = {}
    # Plain tuples give true positional access. Integer keys on the
    # label-indexed Series produced by iterrows() are deprecated as
    # positional lookups in recent pandas.
    for row in users.itertuples(index=False, name=None):
        user_dict[str(row[0])] = [row[2], row[3]]
    return user_dict

In [15]:
# Preview the first ten rows of events.
events.head(10)

Unnamed: 0,event_id,user_id,start_time,city,state,zip,country,lat,lng,c_1,...,c_92,c_93,c_94,c_95,c_96,c_97,c_98,c_99,c_100,c_other
0,684921758,3647864012,2012-10-31T00:00:00.001Z,,,,,,,2,...,0,1,0,0,0,0,0,0,0,9
1,244999119,3476440521,2012-11-03T00:00:00.001Z,,,,,,,2,...,0,0,0,0,0,0,0,0,0,7
2,3928440935,517514445,2012-11-05T00:00:00.001Z,,,,,,,0,...,0,0,0,0,0,0,0,0,0,12
3,2582345152,781585781,2012-10-30T00:00:00.001Z,,,,,,,1,...,0,0,0,0,0,0,0,0,0,8
4,1051165850,1016098580,2012-09-27T00:00:00.001Z,,,,,,,1,...,0,0,0,0,0,0,0,0,0,9
5,1212611096,1426522332,2012-11-16T00:00:00.001Z,,,,,,,0,...,0,0,0,0,0,0,0,0,0,22
6,3689283674,725266702,2012-11-02T20:00:00.003Z,,,,,,,0,...,0,0,0,0,0,0,0,0,0,28
7,2584113432,613687941,2012-10-31T00:00:00.001Z,,,,,,,0,...,2,0,0,0,0,0,0,0,0,354
8,3365728297,1098509207,2012-10-31T00:00:00.001Z,,,,,47.058,21.926,0,...,0,0,0,0,0,0,0,1,0,25
9,2912638473,3598071768,2012-10-18T00:00:00.001Z,,,,,,,1,...,0,0,0,0,0,0,0,0,0,3


In [24]:
# {event_id : [host_user_id, [c_list]]}
def create_events_dict(events):
    """Map each event id to its host user id and its list of count columns.

    Columns are read by position: 0 = event id, 1 = host user id, and the
    count list is every column from position 9 up to, but not including, the
    last one. In the frame previewed in this notebook that range starts at
    ``c_1`` and stops before ``c_other``.

    Parameters
    ----------
    events : pandas.DataFrame
        One row per event, laid out as described above.

    Returns
    -------
    dict
        ``{event_id: [host_user_id, c_list]}`` with string ids and ``c_list``
        a plain Python list. If an event id occurs more than once, the last
        row wins.
    """
    events_dict = {}
    # Plain tuples give true positional access. Integer keys on the
    # label-indexed Series produced by iterrows() are deprecated as
    # positional lookups in recent pandas.
    for row in events.itertuples(index=False, name=None):
        event_id = str(row[0])
        host_id = str(row[1])
        c_list = list(row[9 : -1])
        events_dict[event_id] = [host_id, c_list]
    return events_dict

In [19]:
# Build the per-event and per-user response lookups from the attendee table.
event_attendees_dict, user_interests_dict = (
    create_event_attendees_dict_and_user_interests_dict(event_attendees=event_attendees)
)

In [20]:
# Build the user -> event -> response lookup; repeated pairs are reported below.
train_dict = create_train_dict(train=train)

Duplicated data! for event_id, user_id: 1462902079, 203456139
Duplicated data! for event_id, user_id: 4242816413, 203456139
Duplicated data! for event_id, user_id: 498238691, 203456139
Duplicated data! for event_id, user_id: 745914541, 203456139


In [21]:
# Build the user -> event -> invited lookup; repeated pairs are reported below.
test_dict = create_test_dict(test=test)

Duplicated data! for event_id, user_id: 3258757385, 77627621
Duplicated data! for event_id, user_id: 1498021187, 282335733


In [22]:
# Build the user -> friends lookup.
friends_dict = create_friends_dict(user_friends=user_friends)

In [23]:
# Build the user -> profile lookup.
users_dict = create_users_dict(users=users)

In [29]:
# Build the event -> host and counts lookup.
events_dict = create_events_dict(events=events)

In [97]:
# Sanity check: two lists compare equal when they hold the same items in the
# same order, regardless of how each list was built.
a = ["1", "3", "3"]
b = ["1", "3"]
b += ["3"]
a == b

True