# Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Playlists dataset

In [2]:
playlists = pd.read_csv('Data/playlists_final.csv', sep='\t')

### Title column conversion to list of integers

In [3]:
import string

def stringParsing(listString):
    """Convert a stringified list of integers into a real list of ints.

    Every character found in ``string.punctuation`` (brackets, commas, ...)
    is dropped, the remainder is split on whitespace and each token is
    converted with ``int``.

    Parameters
    ----------
    listString : str
        Text such as ``"[12389, 18698]"`` or ``"[]"``.

    Returns
    -------
    list of int
        The parsed integers, in their original order (empty list when no
        token is left).

    Raises
    ------
    ValueError
        If a remaining token is not an integer literal (e.g. ``"None"``).
    """
    kept_chars = [ch for ch in listString if ch not in string.punctuation]
    tokens = "".join(kept_chars).split()

    return [int(token) for token in tokens]

In [4]:
# The column is overwritten in place, so values that are already lists are
# passed through untouched: this keeps the cell safe to re-run, whereas
# feeding a parsed list back into stringParsing raises a TypeError.
playlists['title'] = playlists['title'].apply(
    lambda title: title if isinstance(title, list) else stringParsing(title)
)

### Dataset inspection

In [5]:
playlists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57561 entries, 0 to 57560
Data columns (total 6 columns):
created_at     57561 non-null int64
playlist_id    57561 non-null int64
title          57561 non-null object
numtracks      57561 non-null int64
duration       57561 non-null int64
owner          57561 non-null int64
dtypes: int64(5), object(1)
memory usage: 2.6+ MB


In [6]:
playlists.head()

Unnamed: 0,created_at,playlist_id,title,numtracks,duration,owner
0,1216545588,644838,[12727],27,6522,41504
1,1249326867,7577564,[],9,2650,41504
2,1257766688,3120683,[183],16,3645,44542
3,1248079275,4278112,"[12389, 18698, 18925, 11695, 7117]",15,4151,44542
4,1175201268,8656823,"[12809, 2095, 13257, 12671, 20426, 14448, 18698]",84,18414,44542


In [7]:
tracks = pd.read_csv('Data/tracks_final.csv', sep='\t')

In [8]:
# The column is overwritten in place, so values that are already lists are
# passed through untouched: this keeps the cell safe to re-run, whereas
# feeding a parsed list back into stringParsing raises a TypeError.
tracks['tags'] = tracks['tags'].apply(
    lambda tags: tags if isinstance(tags, list) else stringParsing(tags)
)
tracks.head()

Unnamed: 0,track_id,artist_id,duration,playcount,album,tags
0,2972914,144,224000,49.0,[7],"[54087, 1757, 1718, 116712, 189631]"
1,2750239,246,157000,1.0,[8],"[189631, 3424, 177424, 46208, 205245]"
2,1550729,144,217000,554.0,[9],"[54087, 109806, 46869, 183258, 54337]"
3,2169950,144,207000,200.0,[9],"[54087, 70618, 207003, 109806, 116712]"
4,1903709,144,198000,5.0,[None],"[54087, 81223, 116712, 215342, 71028]"


In [9]:
target_tracks = pd.read_csv('Data/target_tracks.csv', sep='\t')

In [10]:
target_tracks.head()

Unnamed: 0,track_id
0,1316175
1,3885714
2,3091270
3,226759
4,230596


In [51]:
target_playlists = pd.read_csv('Data/target_playlists.csv', sep='\t', index_col='playlist_id')

In [52]:
target_playlists.head()

10024884
10624787
4891851
4267369
65078


In [13]:
# Playlist - Tracks interactions
interactions = pd.read_csv('Data/train_final.csv', sep='\t')

In [14]:
interactions.head()

Unnamed: 0,playlist_id,track_id
0,3271849,2801526
1,5616275,727878
2,11267488,2805283
3,10103900,1515105
4,3836898,2945623


In [15]:
print(interactions.shape)
print(target_playlists.shape)
print(target_tracks.shape)

(1040522, 2)
(10000, 1)
(32195, 1)


### Top Popular

In [16]:
# value_counts() sorts by descending frequency, so the first ten entries
# are the ten tracks that appear in the most interactions.
tracks_popularity = interactions['track_id'].value_counts()
most_popular = tracks_popularity.head(10).index.values

In [17]:
# target_playlists is read with index_col='playlist_id', so the ids are held
# in the index rather than in a data column; positional column access
# (iloc[:, 0]) depends on how the frame was loaded, reading the index does not.
list_target_playlists = target_playlists.index.values
# Select the playlist ids by column name instead of by position.
interactions_playlist = interactions['playlist_id'].values

In [18]:
# Target playlists that also appear in the training interactions.
res = set(list_target_playlists) & set(interactions_playlist)

In [19]:
len(res)

10000

In [20]:
# Number of distinct playlists present in the interactions.
len(set(interactions_playlist))

45649

In [21]:
most_popular

array([1563309, 1363985, 3705881, 1595978, 3166665, 3779477,  204966,
       2863395, 1580480, 1156143], dtype=int64)

In [22]:
interactions.shape


(1040522, 2)

In [23]:
unique_playlists = interactions['playlist_id'].unique()
unique_playlists.shape

(45649,)

In [28]:
df_playlists = pd.DataFrame(index=unique_playlists)

In [29]:
def fill_tracks(playlist_id, df_interactions):
    """Return the track ids recorded for one playlist.

    Parameters
    ----------
    playlist_id
        Value compared against the ``playlist_id`` column.
    df_interactions : pandas.DataFrame
        Frame with ``playlist_id`` and ``track_id`` columns.

    Returns
    -------
    numpy.ndarray
        ``track_id`` values of the matching rows, in row order (empty when
        no row matches).
    """
    belongs_to_playlist = df_interactions['playlist_id'] == playlist_id
    return df_interactions.loc[belongs_to_playlist, 'track_id'].values

In [30]:
# Gather every playlist's tracks in one grouped pass over the interactions
# instead of re-filtering the whole interactions frame once per playlist.
# Row order inside each group is preserved, and the assignment aligns the
# grouped Series with df_playlists on the playlist_id index.
tracks_by_playlist = interactions.groupby('playlist_id', sort=False)['track_id'].apply(np.array)
df_playlists['tracks_added'] = tracks_by_playlist

In [31]:
df_playlists

Unnamed: 0,tracks_added
3271849,"[2801526, 2372619, 1187176, 1011111, 437130, 2..."
5616275,"[727878, 2077024, 170846, 2182913, 1973880, 19..."
11267488,"[2805283, 3329183, 647761, 846652, 225495, 247..."
10103900,"[1515105, 496282, 2831388, 1739102, 301843, 24..."
3836898,"[2945623, 859729, 2012408, 350109, 83121, 7316..."
5270369,"[2821391, 466626, 2368040, 2130426, 2846035, 1..."
3794808,"[1166185, 3738260, 3670927, 269274, 3478759, 7..."
7908370,"[2498280, 3213312, 3862221, 1764710, 3785951, ..."
11460733,"[282687, 2296940, 2397711, 2947879, 1536432, 3..."
886396,"[863177, 260151, 2016699, 1525556, 1721962, 26..."


In [32]:
def recommend_TopPop(row, top_tracks, n=5):
    """Recommend the most popular tracks a playlist does not already contain.

    ``top_tracks`` is walked in order and the first ``n`` entries that are
    not in ``row`` are kept. Every candidate is checked against the
    playlist, including the ones that replace a skipped track.

    Parameters
    ----------
    row : iterable
        Track ids already present in the playlist.
    top_tracks : sequence
        Candidate track ids, ordered from most to least popular.
    n : int, optional
        Maximum number of tracks to recommend (default 5).

    Returns
    -------
    str
        The recommended track ids separated by single spaces. Fewer than
        ``n`` ids are returned when ``top_tracks`` does not hold enough
        unseen tracks.
    """
    # A set gives constant-time membership tests for each candidate.
    already_added = set(row)

    picks = []
    for track in top_tracks:
        if len(picks) >= n:
            break
        if track not in already_added:
            picks.append(track)

    return " ".join(str(track) for track in picks)

In [33]:
# Build the space-separated recommendation string for every playlist;
# most_popular is forwarded to recommend_TopPop as its second argument.
df_playlists['recommended_tracks'] = df_playlists['tracks_added'].apply(
    recommend_TopPop, args=(most_popular,)
)

In [34]:
most_popular

array([1563309, 1363985, 3705881, 1595978, 3166665, 3779477,  204966,
       2863395, 1580480, 1156143], dtype=int64)

In [35]:
df_playlists

Unnamed: 0,tracks_added,recommended_tracks
3271849,"[2801526, 2372619, 1187176, 1011111, 437130, 2...",1563309 1363985 3705881 1595978 3166665
5616275,"[727878, 2077024, 170846, 2182913, 1973880, 19...",1563309 1363985 3705881 1595978 3166665
11267488,"[2805283, 3329183, 647761, 846652, 225495, 247...",1563309 1363985 3705881 1595978 3166665
10103900,"[1515105, 496282, 2831388, 1739102, 301843, 24...",1563309 1363985 1595978 3166665 3779477
3836898,"[2945623, 859729, 2012408, 350109, 83121, 7316...",1563309 1363985 3705881 1595978 3166665
5270369,"[2821391, 466626, 2368040, 2130426, 2846035, 1...",1563309 1363985 3705881 1595978 3166665
3794808,"[1166185, 3738260, 3670927, 269274, 3478759, 7...",1563309 1363985 3705881 1595978 3166665
7908370,"[2498280, 3213312, 3862221, 1764710, 3785951, ...",1563309 1363985 1595978 3166665 3779477
11460733,"[282687, 2296940, 2397711, 2947879, 1536432, 3...",1563309 1363985 3705881 1595978 3166665
886396,"[863177, 260151, 2016699, 1525556, 1721962, 26...",1563309 1363985 3705881 1595978 3166665


In [36]:
recommend_top_pop = df_playlists.loc[:,'recommended_tracks']

In [57]:
# Keep only the recommendations of the target playlists (join on the index of both frames).
recommend_top_pop_target = target_playlists.merge(
    recommend_top_pop.to_frame(),
    left_index=True,
    right_index=True,
)

In [60]:
recommend_top_pop_target.head()

Unnamed: 0,recommended_tracks
10024884,1563309 1363985 3705881 1595978 3166665
10624787,1563309 1363985 3705881 1595978 3166665
4891851,1563309 1363985 3705881 1595978 3166665
4267369,1563309 1363985 3705881 1595978 3166665
65078,1563309 1363985 3705881 1595978 3166665


In [61]:
# Write the header explicitly as 'playlist_id,track_ids', the layout used by
# Data/sample_submission.csv; without this the data column would be exported
# under its internal name 'recommended_tracks'.
recommend_top_pop_target.to_csv(
    'top_pop_final.csv',
    sep=',',
    index_label='playlist_id',
    header=['track_ids'],
)

In [65]:
rec = pd.read_csv('top_pop_final.csv', sep=',')

In [66]:
rec.head()

Unnamed: 0,playlist_id,track_ids
0,10024884,1563309 1363985 3705881 1595978 3166665
1,10624787,1563309 1363985 3705881 1595978 3166665
2,4891851,1563309 1363985 3705881 1595978 3166665
3,4267369,1563309 1363985 3705881 1595978 3166665
4,65078,1563309 1363985 3705881 1595978 3166665


In [67]:
sample_submission = pd.read_csv('Data/sample_submission.csv', sep=',')
sample_submission.head()

Unnamed: 0,playlist_id,track_ids
0,6979584,1 2 3 4 5
1,10485762,1 2 3 4 5
2,4128771,1 2 3 4 5
3,11337735,1 2 3 4 5
4,5275664,1 2 3 4 5


In [68]:
print(rec.shape)
print(sample_submission.shape)

(10000, 2)
(10000, 2)
