In [2]:
import numpy as np
import pandas as pd
import json

In [3]:
from sklearn.utils import shuffle

In [4]:
import warnings
warnings.filterwarnings("ignore")

### 1. Load data files as dataframes

I load .idomaar files as dataframes because it is more familiar format for me and it is easier to work with.

In [6]:
tracks_df = pd.read_csv("ThirtyMusic/entities/tracks.idomaar",sep='\t', 
                        names = ['object', 'id', 'ts', 'properties', 'linked_entities'])
tracks_df.head()

Unnamed: 0,object,id,ts,properties,linked_entities
0,track,0,-1,"{""duration"":-1,""playcount"":4,""MBID"":null,""name...","{""artists"":[{""type"":""person"",""id"":0}],""albums""..."
1,track,1,-1,"{""duration"":-1,""playcount"":495,""MBID"":null,""na...","{""artists"":[{""type"":""person"",""id"":1}],""albums""..."
2,track,2,-1,"{""duration"":-1,""playcount"":2,""MBID"":null,""name...","{""artists"":[{""type"":""person"",""id"":2}],""albums""..."
3,track,3,-1,"{""duration"":-1,""playcount"":2,""MBID"":null,""name...","{""artists"":[{""type"":""person"",""id"":3}],""albums""..."
4,track,4,-1,"{""duration"":-1,""playcount"":1,""MBID"":null,""name...","{""artists"":[{""type"":""person"",""id"":4}],""albums""..."


In [7]:
tracks_df.shape

(5675143, 5)

In [8]:
sessions_df = pd.read_csv("ThirtyMusic/relations/sessions.idomaar",sep='\t| ', 
                        names = ['object', 'id', 'ts', 'properties', 'linked_entities'])
sessions_df.head()

Unnamed: 0,object,id,ts,properties,linked_entities
0,event.session,287144,1390231051,"{""numtracks"":23,""playtime"":4547}","{""subjects"":[{""type"":""user"",""id"":44361}],""obje..."
1,event.session,287145,1390241844,"{""numtracks"":11,""playtime"":2907}","{""subjects"":[{""type"":""user"",""id"":44361}],""obje..."
2,event.session,287146,1390303249,"{""numtracks"":16,""playtime"":3191}","{""subjects"":[{""type"":""user"",""id"":44361}],""obje..."
3,event.session,287147,1390481828,"{""numtracks"":5,""playtime"":1162}","{""subjects"":[{""type"":""user"",""id"":44361}],""obje..."
4,event.session,287140,1421443687,"{""numtracks"":2,""playtime"":250}","{""subjects"":[{""type"":""user"",""id"":42773}],""obje..."


In [9]:
sessions_df.shape

(2764474, 5)

### 2. Form user_id and track_id columns, create new sessions dataframe with these columns

In [10]:
def extract_track_and_user(row):
    '''Extracts user_id and track_id from sessions data'''
    json_lst = json.loads(row['linked_entities'])
    user_id = json_lst['subjects'][0]['id']
    track_ids = [obj['id'] for obj in json_lst['objects']]
    new_rows_dict = {
                    'session_id' : row['id'],
                    'ts': row['ts'],
                    'user_id' : user_id,
                    'track_id' : track_ids 
                   } 
    return pd.DataFrame(new_rows_dict)

In [11]:
%%time
#apply custom function to sessions_df, form new df
new_sessions_df = pd.concat(list(sessions_df.apply(lambda row: extract_track_and_user(row), axis=1)))

CPU times: user 37min 37s, sys: 59.1 s, total: 38min 36s
Wall time: 38min 47s


In [12]:
new_sessions_df = new_sessions_df.reset_index(drop=True)

In [13]:
new_sessions_df

Unnamed: 0,session_id,ts,user_id,track_id
0,287144,1390231051,44361,4698874
1,287144,1390231051,44361,838286
2,287144,1390231051,44361,2588097
3,287144,1390231051,44361,2746740
4,287144,1390231051,44361,3873988
...,...,...,...,...
31351940,2480032,1407938059,33058,906373
31351941,2480033,1407939579,33058,512708
31351942,2480033,1407939579,33058,2672866
31351943,2480033,1407939579,33058,1558581


In [14]:
#save new sessions df to pickle
new_sessions_df.to_pickle('data/new_sessions_df.pickle')

### 3. Form person_id column, create new tracks dataframe with this column

In [15]:
def extract_person(row):
    json_lst = json.loads(row['linked_entities'])
    person_id = json_lst['artists'][0]['id']

    new_rows_dict = {
                    'track_id' : row['id'],
                    'person_id' : person_id,
                   } 
    return new_rows_dict

In [16]:
%%time
new_tracks_df = pd.DataFrame(list(tracks_df.apply(lambda row: extract_person(row), axis=1)))

CPU times: user 2min 32s, sys: 1.55 s, total: 2min 33s
Wall time: 2min 35s


In [17]:
new_tracks_df = new_tracks_df.drop_duplicates()

In [18]:
new_tracks_df

Unnamed: 0,track_id,person_id
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4
...,...,...
5675137,5023104,595139
5675138,5023105,187223
5675140,5023106,595140
5675141,5023107,549669


In [19]:
new_tracks_df.to_pickle('data/new_tracks_df.pickle')

### 4. Merge new sessions and tracks dataframes

In [20]:
new_sessions_df = pd.read_pickle('data/new_sessions_df.pickle')
new_tracks_df = pd.read_pickle('data/new_tracks_df.pickle')

In [21]:
new_sessions_df.shape

(31351945, 4)

In [22]:
new_tracks_df.shape

(4519319, 2)

In [23]:
new_tracks_df['track_id'].value_counts()

1744350    2
976246     2
94083      2
151793     2
915661     2
          ..
3325182    1
3321084    1
3316986    1
3312888    1
0          1
Name: track_id, Length: 4519105, dtype: int64

In [24]:
%%time
merged_df = new_sessions_df.merge(new_tracks_df, on=['track_id'], how='inner', copy = False)

CPU times: user 8.02 s, sys: 641 ms, total: 8.66 s
Wall time: 8.02 s


In [25]:
merged_df.head()

Unnamed: 0,session_id,ts,user_id,track_id,person_id
0,287144,1390231051,44361,4698874,142266
1,287144,1390231051,44361,838286,107103
2,982046,1405452797,32894,838286,107103
3,982049,1405702985,32894,838286,107103
4,1873088,1406217037,23183,838286,107103


In [26]:
merged_df.shape

(31376602, 5)

In [27]:
merged_df.to_pickle('data/merged_df.pickle')

### 5. Timestamp to date and time

In [4]:
merged_df = pd.read_pickle('data/merged_df.pickle')

In [5]:
date_time = pd.to_datetime(merged_df['ts'], unit = 's').astype(str).str.split(" ", n = 1, expand = True)

In [6]:
merged_df['date'] = date_time[0]
merged_df['time'] = date_time[1]

In [7]:
merged_df

Unnamed: 0,session_id,ts,user_id,track_id,person_id,date,time
0,287144,1390231051,44361,4698874,142266,2014-01-20,15:17:31
1,287144,1390231051,44361,838286,107103,2014-01-20,15:17:31
2,982046,1405452797,32894,838286,107103,2014-07-15,19:33:17
3,982049,1405702985,32894,838286,107103,2014-07-18,17:03:05
4,1873088,1406217037,23183,838286,107103,2014-07-24,15:50:37
...,...,...,...,...,...,...,...
31376597,540638,1415960966,24700,2564433,321698,2014-11-14,10:29:26
31376598,540638,1415960966,24700,2602587,325968,2014-11-14,10:29:26
31376599,540638,1415960966,24700,720394,87396,2014-11-14,10:29:26
31376600,540638,1415960966,24700,720401,87396,2014-11-14,10:29:26


In [8]:
merged_df.to_pickle('data/merged_df_datetime.pickle')

### 6. Get artists dictionary (person_id: name)

In [10]:
people_df = pd.read_csv("ThirtyMusic/entities/persons.idomaar",sep='\t', 
                        names = ['object', 'id', 'ts', 'properties', 'linked_entities'])
people_df.head()

Unnamed: 0,object,id,ts,properties,linked_entities
0,person,145148,-1,"{""MBID"":null, ""name"":""Everything+Is+Illuminated""}",{}
1,person,297899,-1,"{""MBID"":null, ""name"":""Robin+O%27Brien""}",{}
2,person,250429,-1,"{""MBID"":null, ""name"":""Nicholas+Gunn++(2012)""}",{}
3,person,32765,-1,"{""MBID"":null, ""name"":""Aspasia+Stratigou""}",{}
4,person,18689,-1,"{""MBID"":null, ""name"":""Allison+Veltz""}",{}


In [11]:
people_df.shape

(595049, 5)

In [12]:
people_df = people_df.drop_duplicates(subset=['id'], keep = 'first')

In [13]:
people_df

Unnamed: 0,object,id,ts,properties,linked_entities
0,person,145148,-1,"{""MBID"":null, ""name"":""Everything+Is+Illuminated""}",{}
1,person,297899,-1,"{""MBID"":null, ""name"":""Robin+O%27Brien""}",{}
2,person,250429,-1,"{""MBID"":null, ""name"":""Nicholas+Gunn++(2012)""}",{}
3,person,32765,-1,"{""MBID"":null, ""name"":""Aspasia+Stratigou""}",{}
4,person,18689,-1,"{""MBID"":null, ""name"":""Allison+Veltz""}",{}
...,...,...,...,...,...
595044,person,544215,-1,"{""MBID"":null, ""name"":""Sanaa+Kariakoo""}",{}
595045,person,298403,-1,"{""MBID"":null, ""name"":""Rock-a-teens""}",{}
595046,person,450896,-1,"{""MBID"":null, ""name"":""Jennifer+Lopez+Ft.+DJ+Mu...",{}
595047,person,53831,-1,"{""MBID"":null, ""name"":""Bobby+Sanabria+Conductin...",{}


In [14]:
def extract_person_name(row):
    json_lst = json.loads(row['properties'])
    person_id = row['id']

    new_rows_dict = {
                    'person_id' : person_id,
                    'person_name' : json_lst['name']
                   } 
    return new_rows_dict

In [15]:
%%time
new_persons_df = pd.DataFrame(list(people_df.apply(lambda row: extract_person_name(row), axis=1)))

CPU times: user 15.1 s, sys: 31 ms, total: 15.1 s
Wall time: 14.7 s


In [16]:
new_persons_df

Unnamed: 0,person_id,person_name
0,145148,Everything+Is+Illuminated
1,297899,Robin+O%27Brien
2,250429,Nicholas+Gunn++(2012)
3,32765,Aspasia+Stratigou
4,18689,Allison+Veltz
...,...,...
560922,544215,Sanaa+Kariakoo
560923,298403,Rock-a-teens
560924,450896,Jennifer+Lopez+Ft.+DJ+Mustard
560925,53831,Bobby+Sanabria+Conducting+The+Manhattan+School...


In [17]:
new_persons_df.to_pickle('data/new_persons_df.pickle')

### 7. Prepare train, validation and test datasets

In [5]:
df = pd.read_pickle('data/merged_df_datetime.pickle')

In [6]:
df.head()

Unnamed: 0,session_id,ts,user_id,track_id,person_id,date,time
0,287144,1390231051,44361,4698874,142266,2014-01-20,15:17:31
1,287144,1390231051,44361,838286,107103,2014-01-20,15:17:31
2,982046,1405452797,32894,838286,107103,2014-07-15,19:33:17
3,982049,1405702985,32894,838286,107103,2014-07-18,17:03:05
4,1873088,1406217037,23183,838286,107103,2014-07-24,15:50:37


In [7]:
df.shape

(31376602, 7)

In [8]:
set(df['date'])

{'2014-01-20',
 '2014-01-21',
 '2014-01-22',
 '2014-01-23',
 '2014-01-24',
 '2014-01-25',
 '2014-01-26',
 '2014-01-27',
 '2014-01-28',
 '2014-01-29',
 '2014-01-30',
 '2014-01-31',
 '2014-02-01',
 '2014-02-02',
 '2014-02-03',
 '2014-02-04',
 '2014-02-05',
 '2014-02-06',
 '2014-02-07',
 '2014-02-08',
 '2014-02-09',
 '2014-02-10',
 '2014-02-11',
 '2014-02-12',
 '2014-02-13',
 '2014-02-14',
 '2014-02-15',
 '2014-02-16',
 '2014-02-17',
 '2014-02-18',
 '2014-02-19',
 '2014-02-20',
 '2014-02-21',
 '2014-02-22',
 '2014-02-23',
 '2014-02-24',
 '2014-02-25',
 '2014-02-26',
 '2014-02-27',
 '2014-02-28',
 '2014-03-01',
 '2014-03-02',
 '2014-03-03',
 '2014-03-04',
 '2014-03-05',
 '2014-03-06',
 '2014-03-07',
 '2014-03-08',
 '2014-03-09',
 '2014-03-10',
 '2014-03-11',
 '2014-03-12',
 '2014-03-13',
 '2014-03-14',
 '2014-03-15',
 '2014-03-16',
 '2014-03-17',
 '2014-03-18',
 '2014-03-19',
 '2014-03-20',
 '2014-03-21',
 '2014-03-22',
 '2014-03-23',
 '2014-03-24',
 '2014-03-25',
 '2014-03-26',
 '2014-03-

In [9]:
users = df["user_id"].unique().tolist()
len(users)

45175

In [10]:
# 80% of user ids
users_train = [users[i] for i in range(round(0.8*len(users)))]

In [11]:
#20% of user ids from train
users_val = [users[i] for i in range(round(0.2*len(users_train)))]

In [12]:
len(users_train)

36140

In [13]:
len(users_val)

7228

In [14]:
%%time
# split data into train and test set (80%-20% of users)
train_df = df[df['user_id'].isin(users_train)]
test_df = df[~df['user_id'].isin(users_train)]

CPU times: user 22.5 s, sys: 4.7 s, total: 27.2 s
Wall time: 25.3 s


In [15]:
train_df.shape

(29470337, 7)

In [16]:
%%time
#randomly choose user-artist pairs from users_val to exclude it from train
train_modified = shuffle(train_df)
left_out_df = train_modified[train_modified['user_id'].isin(users_val)].drop_duplicates(subset=['user_id'])

CPU times: user 36.6 s, sys: 1.72 s, total: 38.3 s
Wall time: 38.7 s


In [17]:
left_out_df = left_out_df[['user_id', 'person_id']]

In [18]:
left_out_df

Unnamed: 0,user_id,person_id
31125591,40215,164287
2185310,28749,42398
16687006,8046,367332
19484714,8749,21753
13252169,7863,450200
...,...,...
5723175,25238,46425
445058,1858,46425
9482149,23901,78622
7051855,5381,274779


In [19]:
index1 = pd.MultiIndex.from_arrays([train_modified[col] for col in ['user_id', 'person_id']])
index2 = pd.MultiIndex.from_arrays([left_out_df[col] for col in ['user_id', 'person_id']])

In [20]:
%%time
train_modified = train_modified.loc[~index1.isin(index2)]

CPU times: user 20.6 s, sys: 1min 13s, total: 1min 33s
Wall time: 1min 39s


In [21]:
train_modified.shape

(29242161, 7)

In [22]:
train_modified = train_modified[['user_id', 'person_id', 'ts']].sort_values(by = ['user_id', 'ts'])
train_modified['person_id'] = train_modified['person_id'].astype(str)
train_mod_grouped = train_modified.groupby(by=['user_id'])['person_id'].apply(list).reset_index(name='persons_lst')

In [23]:
test_df['person_id'] = test_df['person_id'].astype(str)
test_grouped = test_df.groupby(by=['user_id'])['person_id'].apply(list).reset_index(name='persons_lst')

In [24]:
test_grouped

Unnamed: 0,user_id,persons_lst
0,2,"[317952, 317952, 317952, 307932, 147326, 33277..."
1,6,"[16163, 427911, 427911, 347719, 81351, 81351, ..."
2,9,"[56683, 208667, 144662, 299571, 28752, 356730,..."
3,10,"[263881, 302401]"
4,19,"[15830, 254536, 305083, 2589, 75802, 75802, 75..."
...,...,...
9030,45138,"[323344, 323344, 323344, 323344, 411805, 41180..."
9031,45147,"[228324, 320708, 348909, 348909, 437722, 43772..."
9032,45148,"[54198, 218992, 190028, 192393, 335800, 288031..."
9033,45172,"[196920, 11653, 123799, 138443, 345801, 408513..."


In [25]:
left_out_df['person_id'] = left_out_df['person_id'].astype(str)

In [26]:
left_out_df.to_pickle('data/left_out_df.pickle')
train_modified.to_pickle('data/train_mod.pickle')
train_mod_grouped.to_pickle('data/train_mod_grouped.pickle')
test_grouped.to_pickle('data/test_grouped.pickle')

### 8. Prepare additional information - user likes

In [39]:
love_df = pd.read_csv("ThirtyMusic/relations/love.idomaar",sep='\t| ', 
                      names = ['object', 'id', 'ts','values','properties', 'linked_entities'])
love_df.head()

Unnamed: 0,object,id,ts,values,properties,linked_entities
0,preference,1,-1,"{""value"":""love""}","{""subjects"":[{""type"":""user"",""id"":44542}],","""objects"":[{""type"":""track"",""id"":2785601}]}"
1,preference,2,-1,"{""value"":""love""}","{""subjects"":[{""type"":""user"",""id"":44542}],","""objects"":[{""type"":""track"",""id"":2785590}]}"
2,preference,3,-1,"{""value"":""love""}","{""subjects"":[{""type"":""user"",""id"":44542}],","""objects"":[{""type"":""track"",""id"":143076}]}"
3,preference,4,-1,"{""value"":""love""}","{""subjects"":[{""type"":""user"",""id"":44542}],","""objects"":[{""type"":""track"",""id"":143037}]}"
4,preference,5,-1,"{""value"":""love""}","{""subjects"":[{""type"":""user"",""id"":44542}],","""objects"":[{""type"":""track"",""id"":143052}]}"


In [40]:
love_df.shape

(4106341, 6)

In [41]:
love_df['properties'] = love_df['properties'] + " " +love_df['linked_entities']
love_df.drop(columns = ['linked_entities'], inplace=True)

In [42]:
love_df

Unnamed: 0,object,id,ts,values,properties
0,preference,1,-1,"{""value"":""love""}","{""subjects"":[{""type"":""user"",""id"":44542}], ""obj..."
1,preference,2,-1,"{""value"":""love""}","{""subjects"":[{""type"":""user"",""id"":44542}], ""obj..."
2,preference,3,-1,"{""value"":""love""}","{""subjects"":[{""type"":""user"",""id"":44542}], ""obj..."
3,preference,4,-1,"{""value"":""love""}","{""subjects"":[{""type"":""user"",""id"":44542}], ""obj..."
4,preference,5,-1,"{""value"":""love""}","{""subjects"":[{""type"":""user"",""id"":44542}], ""obj..."
...,...,...,...,...,...
4106336,preference,4106337,-1,"{""value"":""love""}","{""subjects"":[{""type"":""user"",""id"":39896}], ""obj..."
4106337,preference,4106338,-1,"{""value"":""love""}","{""subjects"":[{""type"":""user"",""id"":39896}], ""obj..."
4106338,preference,4106339,-1,"{""value"":""love""}","{""subjects"":[{""type"":""user"",""id"":39896}], ""obj..."
4106339,preference,4106340,-1,"{""value"":""love""}","{""subjects"":[{""type"":""user"",""id"":39896}], ""obj..."


In [43]:
set(love_df['ts'])

{-1}

In [44]:
set(love_df['values'])

{'{"value":"love"}'}

In [45]:
love_df = love_df.drop_duplicates(subset = ['properties'])

In [46]:
def extract_likes(row):
    json_lst = json.loads(row['properties'])
    
    user_id = json_lst['subjects'][0]['id']
    
    track_id = json_lst['objects'][0]['id']

    new_rows_dict = {
                    'user_id' : user_id,
                    'track_id' : track_id
                   } 
    return new_rows_dict

In [47]:
%%time
likes_df = pd.DataFrame(list(love_df.apply(lambda row: extract_likes(row), axis=1)))

CPU times: user 32.6 s, sys: 200 ms, total: 32.8 s
Wall time: 32.3 s


In [48]:
likes_df

Unnamed: 0,user_id,track_id
0,44542,2785601
1,44542,2785590
2,44542,143076
3,44542,143037
4,44542,143052
...,...,...
1692556,39896,3208469
1692557,39896,2756716
1692558,39896,3644495
1692559,39896,2247239


In [49]:
#df = pd.read_pickle('data/merged_df_datetime.pickle')

In [50]:
df2 = df[['track_id', 'person_id']].drop_duplicates()

In [51]:
df2

Unnamed: 0,track_id,person_id
0,4698874,142266
1,838286,107103
157,2588097,324333
174,2746740,344448
185,3873988,309348
...,...,...
31376597,2564433,321698
31376598,2602587,325968
31376599,720394,87396
31376600,720401,87396


In [52]:
likes_df = likes_df.merge(df2, on = ['track_id'], how = 'left')
likes_df.drop(columns=['track_id'], inplace = True)
likes_df

Unnamed: 0,user_id,person_id
0,44542,349295
1,44542,349295
2,44542,18754
3,44542,18754
4,44542,18754
...,...,...
1694167,39896,397664
1694168,39896,345801
1694169,39896,455051
1694170,39896,31464


In [53]:
likes_df = likes_df[['user_id', 'person_id']].groupby(['user_id', 
                                                       'person_id']).size().reset_index(name='likes_count')

In [54]:
likes_df

Unnamed: 0,user_id,person_id,likes_count
0,1,87999,2
1,2,184769,1
2,2,408273,2
3,2,459046,2
4,2,464952,1
...,...,...,...
874471,45174,338132,1
874472,45174,349288,1
874473,45174,357381,2
874474,45174,378192,1


In [55]:
likes_df['likes_count'].value_counts()

1      624758
2      117751
3       47476
4       24873
5       15230
        ...  
98          1
97          1
94          1
90          1
436         1
Name: likes_count, Length: 139, dtype: int64

In [56]:
likes_df.to_pickle('data/likes_df.pickle')