In [2]:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from time import time

## Load Data to Pandas

In [3]:
df1 = pd.read_csv('data/play_ds.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df1.drop(['device', 'song_length', 'date'], inplace=True, axis=1, errors='ignore')

In [5]:
df1.head(20)

Unnamed: 0,uid,song_id,play_time
0,168539760,4732050.0,197
1,168543026,6623030.0,0
2,168550571,0.0,24
3,168547857,4356300.0,3
4,168551487,811133.0,200
5,168519439,0.0,227
6,168550571,0.0,4
7,168551579,8762280.0,213
8,168551008,58391.0,89
9,168550564,708741.0,258


In [6]:
df1.song_id.count()

6797518

In [7]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6799624 entries, 0 to 6799623
Data columns (total 3 columns):
uid          int64
song_id      object
play_time    object
dtypes: int64(1), object(2)
memory usage: 155.6+ MB


In [8]:
df1['play_time'] = pd.to_numeric(df1['play_time'],errors='coerce')
df1['play_time'].fillna(0, inplace=True)
df1 = df1[df1['play_time'] != 0]

In [9]:
df1['song_id'] = pd.to_numeric(df1['song_id'],errors='coerce')
# df1['song_id'].dropna( inplace=True)

In [10]:
df1 = df1[df1['song_id'] != 0]

In [11]:
df1 = df1.dropna()

In [12]:
df1.tail()

Unnamed: 0,uid,song_id,play_time
6799619,167871618,9891560.0,59.0
6799620,167924823,4859450.0,79.0
6799621,167584757,23651600.0,241.0
6799622,167979343,6359030.0,266.0
6799623,167867101,16827800.0,327.0


In [13]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5131554 entries, 0 to 6799623
Data columns (total 3 columns):
uid          int64
song_id      object
play_time    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 156.6+ MB


## Convert rating records to user-movie utility matrix

In [31]:
# df_utility = pd.pivot_table(data=df1, 
#                             values='play_time', 
#                             index='uid', 
#                             columns='song_id', 
#                             fill_value=0
#                            )

# get Memory error after running codes in this cell,
# down sample and run again

#### Down sampling


In [15]:
# downsample ids
from sklearn.utils import resample
df2 = resample(df1, 
               replace=False,    # sample without replacement
               n_samples=51315,     # to match minority class
               random_state=123) #

In [62]:
df2.tail()

Unnamed: 0,uid,song_id,play_time
4392429,168871920,5910727.0,44.0
1297683,168298282,3620542.0,277.0
5112448,168560832,5237384.0,125.0
2475306,168442387,2203376.0,3.0
1206019,167973481,703078.0,1.0


In [17]:
df2['song_id'] = pd.to_numeric(df2['song_id'],errors='coerce')
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51315 entries, 4764446 to 1206019
Data columns (total 3 columns):
uid          51315 non-null int64
song_id      51310 non-null float64
play_time    51315 non-null float64
dtypes: float64(2), int64(1)
memory usage: 1.6 MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [18]:
df_utility = pd.pivot_table(data=df2, 
                            values='play_time', 
                            index='uid', 
                            columns='song_id', 
                            fill_value=0
                           )


In [46]:
df_utility

song_id,-1.0,0.0,1614.0,1620.0,1652.0,1672.0,1699.0,1734.0,1872.0,2076.0,...,23674687.0,23676164.0,23676232.0,23678860.0,23679529.0,23679811.0,23680765.0,1.46220965703e+18,1.63917943567e+18,1.81380421448e+18
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1164092,0,0.0,0,0,0,0,0.0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
3152758,0,0.0,0,0,0,0,0.0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
3748366,0,0.0,0,0,0,0,0.0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
4987532,0,0.0,0,0,0,0,0.0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
5029565,0,0.0,0,0,0,0,0.0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
5356567,0,0.0,0,0,0,0,0.0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
6216081,0,0.0,0,0,0,0,0.0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
6454650,0,0.0,0,0,0,0,0.0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
6744211,0,0.0,0,0,0,0,0.0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
7950557,0,0.0,0,0,0,0,0.0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
df_utility.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19513 entries, 1164092 to 169258350
Columns: 24183 entries, -1.0 to 1.81380421448e+18
dtypes: float64(902), int64(23281)
memory usage: 3.5 GB


In [20]:
df2.song_id.count

<bound method Series.count of 4764446     6948019.0
4875842       72610.0
1110343    15807836.0
4907381    16796877.0
2330859      157612.0
5549141     6487170.0
4733372      511297.0
5126177      169744.0
3682871     3219312.0
331840      4101910.0
2323803     4074082.0
4901999      111642.0
357613      6770626.0
2184795     3362790.0
3358199           0.0
4808212      138894.0
986421     11324610.0
1547441     5216147.0
4166251      454444.0
250565      6221861.0
271352     22964472.0
5609164      313496.0
3522249      529964.0
5093879      875443.0
3182555    23489524.0
6743       22850224.0
2645672     4683301.0
6673187     4405617.0
1582222     6769135.0
168970     20129716.0
              ...    
1217615    11914642.0
1754178      274686.0
767560       683122.0
121503      6401626.0
351807      6581379.0
3998317     4881081.0
4236383      461747.0
3717694     3415560.0
3699399      473794.0
185345      7000662.0
4728320     2833409.0
6456113     6657692.0
6685591     5298834.0
36

In [32]:
# another way to generate  utility matrix


# highest_user_id = df2.uid.max()
# highest_song_id = 4732047
# playtime_mat = sparse.lil_matrix((highest_user_id, highest_song_id))
# playtime_mat

# for _, row in df1.iterrows():
#     # subtract 1 from id's due to match 0 indexing
#     playtime_mat[row.uid-1, row.song_id-1] = row.play_time

## Calculate item-item similarity matrix

In [21]:
utility_mat = df_utility
item_sim_mat = cosine_similarity(utility_mat.T)

In [39]:
item_sim_mat

array([[  1.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   1.00000000e+00,   2.94365461e-04, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   2.94365461e-04,   1.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       ..., 
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          1.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   1.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   1.00000000e+00]])

In [42]:
item_sim_mat.shape

(24183, 24183)

## Calculate neighborhood

In [22]:
least_to_most_sim_indexes = np.argsort(item_sim_mat, axis=1)

# Neighborhoods
neighborhood_size = 75
neighborhoods = least_to_most_sim_indexes[:, -neighborhood_size:]

In [43]:
neighborhoods

array([[ 8098,  8099,  8100, ...,  8074,  9798,     0],
       [ 1271, 11169, 21828, ..., 20932,  5210,     1],
       [ 8118,  8117,  8116, ..., 12777, 11253,     2],
       ..., 
       [ 8097,  8098,  8099, ...,  6844, 24180, 19885],
       [ 8079,  8098,  8099, ...,  8055,  8064, 24181],
       [ 8097,  8098,  8099, ...,   475, 17183, 24182]])

In [38]:
neighborhoods.shape

(24183, 75)

## Make rating prediction on a user

In [63]:
# Pick a lucky user
uid = 168298282

In [70]:
n_users = utility_mat.shape[0]
n_items = utility_mat.shape[1]

start_time = time()
songslistened_by_this_user = df_utility[uid].nonzero()[1]
# Just initializing so we have somewhere to put rating preds
out = np.zeros(n_items)
for item_to_rate in range(n_items):
    relevant_items = np.intersect1d(neighborhoods[item_to_rate],
                                    songslistened_by_this_user,
                                    assume_unique=True)  # assume_unique speeds up intersection op
    out[item_to_rate] = df_utility[uid, relevant_items] * \
        item_sim_mat[item_to_rate, relevant_items] / \
        item_sim_mat[item_to_rate, relevant_items].sum()


pred_ratings = np.nan_to_num(out)
print(pred_ratings)
print("Execution time: %f seconds" % (time()-start_time))

## Get final recommendations for a user

In [71]:
# Recommend n songs
n = 10

# Get item indexes sorted by predicted rating
item_index_sorted_by_pred_rating = list(np.argsort(pred_ratings))[::-1]

# Find items that have been linstened by user
items_rated_by_this_user = ratings_mat[user_id].nonzero()[1]

# Exclude the items that have been rated by user
unrated_items_by_pred_rating = [item for item in item_index_sorted_by_pred_rating
                                if item not in items_rated_by_this_user]

unrated_items_by_pred_rating[:n]