In [44]:
import numpy as np
import csv
import pandas as pd
from scipy.sparse import csr_matrix

# Input/output file locations for the K-means based play-count prediction
# (the solution file is named 'k_means.csv').
# NOTE(review): this comment used to describe a per-user-median baseline with a
# global-median fallback; no visible cell computes a median, so it looks like a
# leftover from an earlier baseline -- confirm before relying on it.

# CSV read as the training table (also re-read row by row when building train_data).
train_file = 'train.csv'
# CSV of the rows to predict for.
test_file  = 'test.csv'
# CSV the predictions are written to.
soln_file  = 'k_means.csv'
# CSV whose 'user' column defines the set of user ids.
profiles_file='profiles.csv'
# CSV whose 'artist' column defines the set of artist ids.
artists_file='artists.csv'

In [45]:
# Read the four CSV inputs into DataFrames, in the order train, test, profiles,
# artists; the generator is consumed left to right by the tuple unpacking.
df_train, df_test, df_profiles, df_artists = (
    pd.read_csv(csv_path)
    for csv_path in (train_file, test_file, profiles_file, artists_file)
)

In [46]:
# Display the first five rows of the training table.
df_train.head(5)

Unnamed: 0,user,artist,plays
0,eb1c57ddc9e0e2d005169d3a1a96e8dd95e3af03,5a8e07d5-d932-4484-a7f7-e700793a9c94,554
1,44ce793a6cd9d20f13f4a576a818ef983314bb5d,a3a92047-be1c-4f3e-8960-c4f8570984df,81
2,da9cf3f557161d54b76f24db64be9cc76db008e3,eeb1195b-f213-4ce1-b28c-8565211f8e43,708
3,8fa49ab25d425edcf05d44bfc1d5aea895287d81,a1419808-65d3-4d40-998c-1a0bac65eabc,265
4,b85fcaef67d2669cd99b334b5e8c8705263db2cf,a3cb23fc-acd3-4ce0-8f36-1e5aa6a18432,220


In [47]:
# One row per artist; after count() the 'user' column holds the number of
# non-null user entries in df_train for that artist.
df_train_frequency=df_train[['user', 'artist']].groupby('artist', as_index=False).count()

In [48]:
# Display the first rows of the per-artist counts.
df_train_frequency.head()

Unnamed: 0,artist,user
0,000d90ec-d64c-48a1-b775-e726fd240e9f,510
1,000fc734-b7e1-4a01-92d1-f544261b43f5,1873
2,0019749d-ee29-4a5f-ab17-6bfa11deb969,690
3,0039c7ae-e1a7-4a7d-9b49-0cbc716821a6,14757
4,004e5eed-e267-46ea-b504-54526f1f377d,1580


In [49]:
# Display the first five rows of the test table.
df_test.head(5)

Unnamed: 0,Id,user,artist
0,1,306e19cce2522fa2d39ff5dfc870992100ec22d2,4ac4e32b-bd18-402e-adad-ae00e72f8d85
1,2,9450d351278df4938bdea4ed86aec940a4e927ac,1f574ab1-a46d-4586-9331-f0ded23e0411
2,3,801909d6955f59033c88595d3d7f8a6a5dcd53cc,3eb72791-6322-466b-87d3-24d74901eb2d
3,4,e3ed47445c127fbeff47fb58f6bbf2f3b4535d82,61604b45-8a91-4e33-a1b6-45d7b1fec4e5
4,5,a73f46652103f3a5f7429159310f6928f79644aa,5dfdca28-9ddc-4853-933c-8bc97d87beec


In [50]:
# Distinct user ids appearing in each of the three tables.
users_profiles = set(df_profiles['user'].values)
users_test = set(df_test['user'].values)
users_train = set(df_train['user'].values)
# The profiles table defines the user universe; this count is later used as
# the number of rows of the sparse play-count matrix.
users_number = len(users_profiles)
# A single pre-formatted argument prints the same text under both the
# Python 2 print statement and the Python 3 print function.
print('users number %d' % users_number)

users number 233286


In [51]:
# Distinct artist ids appearing in each of the three tables.
artists_artists = set(df_artists['artist'].values)
artists_test = set(df_test['artist'].values)
artists_train = set(df_train['artist'].values)
# The artists table defines the artist universe; this count is later used as
# the number of columns of the sparse play-count matrix.
artists_number = len(artists_artists)
# A single pre-formatted argument prints the same text under both the
# Python 2 print statement and the Python 3 print function.
print('artists number %d' % artists_number)

artists number 2000


In [52]:
# Map every user key from the profiles set to a dense integer id 0..n-1,
# numbered in the set's iteration order.
dict_users = {user_key: idx for idx, user_key in enumerate(users_profiles)}
# Same numbering scheme for the artist keys.
dict_artists = {artist_key: idx for idx, artist_key in enumerate(artists_artists)}

In [53]:
# Build train_data from the raw training CSV:
#   train_data[user_id][artist_id] -> play count (int)
# where user_id / artist_id are the integer ids from dict_users / dict_artists.
train_data = {}
with open(train_file, 'r') as train_fh:
    train_csv = csv.reader(train_fh, delimiter=',', quotechar='"')
    # Discard the first row (presumably the header line).
    next(train_csv, None)
    for row in train_csv:
        user, artist, plays = row[0], row[1], row[2]
        user_id = dict_users[user]
        artist_id = dict_artists[artist]

        # Create the per-user dict on first sight, then record the count;
        # a repeated (user, artist) pair overwrites the earlier value.
        per_user_plays = train_data.setdefault(user_id, {})
        per_user_plays[artist_id] = int(plays)

In [54]:
# Plain dict: artist key -> value of the 'user' count column in df_train_frequency.
hh=df_train_frequency.set_index('artist')['user'].to_dict()

In [55]:
# Integer artist id -> count taken from hh. Every id in range(artists_number)
# starts at 0 so artists absent from hh keep a zero entry.
artist_user_frequency = dict.fromkeys(range(artists_number), 0)
artist_user_frequency.update(
    (dict_artists[artist_key], user_count)
    for artist_key, user_count in hh.items()
)

In [56]:
# Build the (users_number x artists_number) play-count matrix in CSR form for
# K-means. Entries come as (value, (row id, column id)) triplets; the scipy
# constructor sums duplicate (row, col) pairs, if any.
data = df_train['plays'].values
row = list(map(dict_users.__getitem__, df_train['user'].values))
col = list(map(dict_artists.__getitem__, df_train['artist'].values))
sparse_data = csr_matrix((data, (row, col)), shape=(users_number, artists_number))

In [160]:
def new_norm(x, y): #x, y rows from sparse matrix
    """Euclidean distance between two sparse rows over their shared columns.

    Only columns that are stored (present in ``.indices``) in BOTH rows take
    part in the distance; columns stored in just one row are ignored.

    Parameters
    ----------
    x, y : 1 x n CSR rows exposing ``.indices`` and supporting ``row[0, cols]``.

    Returns
    -------
    float
        ``np.inf`` when the rows share no stored column, otherwise the
        2-norm of the difference restricted to the shared columns.
    """
    # Sorted, de-duplicated column ids stored in both rows.
    overlap = np.intersect1d(x.indices, y.indices)

    if overlap.size == 0:
        # No overlap, then infinite distance.
        return np.inf

    # Densify only the shared columns so both operands are aligned
    # column-by-column regardless of how the sparse slices order their data.
    diff = x[0, overlap].toarray() - y[0, overlap].toarray()
    return float(np.linalg.norm(diff))

In [201]:
from scipy.sparse import vstack
def cluster_points(X, mu):
    """Assign every row of X to its nearest center.

    Distances are computed with ``new_norm``; ties go to the lowest center
    index (the first minimum found).

    Parameters
    ----------
    X : iterable of 1 x n sparse rows (e.g. a CSR matrix).
    mu : iterable of 1 x n sparse rows, one per center.

    Returns
    -------
    dict
        center index -> CSR matrix stacking the rows assigned to that center,
        in the order they were encountered. Centers that attract no row have
        no key.
    """
    # Extract the center rows once instead of re-slicing mu for every point.
    centers = list(mu)

    members = {}
    for x in X:
        distances = [new_norm(x, center) for center in centers]
        bestmukey = min(range(len(distances)), key=distances.__getitem__)
        members.setdefault(bestmukey, []).append(x)

    # Stack each cluster once; stacking inside the loop re-copied the whole
    # cluster for every added row (quadratic in cluster size).
    return dict((key, vstack(rows, format='csr')) for key, rows in members.items())

In [187]:
def reevaluate_centers(mu, clusters):
    """Recompute every center as the mean of the rows assigned to it.

    Parameters
    ----------
    mu : sparse matrix with one row per current center.
    clusters : dict mapping center index -> sparse matrix of assigned rows.

    Returns
    -------
    csr_matrix
        One row per center of ``mu``, in the same order. A center with no
        entry in ``clusters`` keeps its previous position instead of being
        dropped, so the number of centers and their indices stay stable
        across iterations.
    """
    newmu = []

    for k in range(mu.shape[0]):
        if k in clusters:
            # Column-wise mean over all rows of the cluster (entries that are
            # not stored count as zero).
            center = np.asarray(clusters[k].mean(axis=0)).ravel()
        else:
            # Empty cluster: carry the old center forward unchanged.
            center = np.asarray(mu[k].toarray()).ravel()
        newmu.append(center)

    return csr_matrix(np.array(newmu, dtype=float))

In [188]:
def has_converged(mu, oldmu):
    """Return True when both sets of centers are identical (order ignored).

    Parameters
    ----------
    mu, oldmu : sparse matrices or array-likes with one center per row.

    Returns
    -------
    bool
        True if the set of center rows in ``mu`` equals the set in ``oldmu``.
    """
    def _row_set(centers):
        # Sparse rows are not hashable by value, so compare dense tuples.
        if hasattr(centers, 'toarray'):
            centers = centers.toarray()
        return set(tuple(row) for row in np.asarray(centers))

    return _row_set(mu) == _row_set(oldmu)

In [189]:
def find_centers(X, K, iters):
    """Run K-means on the rows of X for at most ``iters`` iterations.

    Parameters
    ----------
    X : sparse matrix, one point per row.
    K : int, number of centers.
    iters : int, maximum number of assign/update iterations.

    Returns
    -------
    (mu, clusters)
        ``mu`` is the matrix of centers; ``clusters`` is the assignment
        produced by the last call to ``cluster_points`` (an empty dict when
        no iteration ran, i.e. ``iters <= 0``).
    """
    n_points = X.shape[0]

    # Initialize to K random rows of X. Sample without replacement so two
    # centers never start on the same row; replacement is only allowed when
    # K exceeds the number of rows.
    seed_rows = np.random.choice(n_points, K, replace=(K > n_points))
    mu = X[seed_rows]

    # Bound up front so the return below is valid even if the loop never runs.
    clusters = {}

    iteration = 0
    while iteration < iters:
        oldmu = mu

        # Assign all points in X to clusters
        clusters = cluster_points(X, mu)

        # Reevaluate centers
        mu = reevaluate_centers(oldmu, clusters)

        iteration += 1

        # Unchanged centers would reproduce the same assignment and the same
        # means, so further iterations cannot change the result.
        if mu.shape == oldmu.shape and (mu != oldmu).nnz == 0:
            break

    return (mu, clusters)

In [None]:
# Run the K-means routine on the full sparse matrix with K=10 centers and a
# single iteration; returns the centers and the per-center row assignment.
mu, clusters=find_centers(sparse_data, 10, 1)

In [29]:
#To be continued, predicting !!!
# Collect, per label and per artist id, the list of play counts of all members:
#   dict_label_artists[label][artist_id] -> [plays, plays, ...]
# NOTE(review): dict_label is not defined in any cell visible here; presumably
# it maps a cluster label to an iterable of integer user ids -- confirm.
dict_label_artists = {}
for k in dict_label.keys():
    plays_by_artist = dict_label_artists[k] = {}
    for v in dict_label[k]:
        for artist_k, play_count in train_data[v].items():
            # First sighting of an artist starts a new list; later ones append.
            plays_by_artist.setdefault(artist_k, []).append(play_count)

In [33]:
# Collapse each list of play counts to its arithmetic mean:
#   dict_label_artists_mean[label][artist_id] -> mean plays
dict_label_artists_mean = {
    label: {
        artist_k: np.mean(play_counts)
        for artist_k, play_counts in per_artist.items()
    }
    for label, per_artist in dict_label_artists.items()
}

In [None]:
# Write out test solutions.
# For every test row, look up the user's label and write the mean play count
# stored for (label, artist); 0 is written when that label has no entry for
# the artist.
# NOTE(review): label_array is not defined in any cell visible here;
# presumably it maps an integer user id to its cluster label -- confirm.
with open(test_file, 'r') as test_fh, open(soln_file, 'w') as soln_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
    # Discard the first row (presumably the header line).
    next(test_csv, None)

    soln_csv = csv.writer(soln_fh,
                          delimiter=',',
                          quotechar='"',
                          quoting=csv.QUOTE_MINIMAL)
    soln_csv.writerow(['Id', 'plays'])

    for row in test_csv:
        # row_id rather than id: a top-level loop variable named id would
        # shadow the builtin id() for every later cell in the notebook.
        row_id, user, artist = row[0], row[1], row[2]

        user_id = dict_users[user]
        artist_id = dict_artists[artist]

        user_label = label_array[user_id]
        label_means = dict_label_artists_mean[user_label]

        soln_csv.writerow([row_id, label_means.get(artist_id, 0)])