In [3]:
import numpy as np
import csv
import pandas as pd
import scipy.sparse as spr
from sklearn.cluster import KMeans
from math import isnan
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error
sns.set_style("whitegrid")
sns.set_context("poster")
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
%matplotlib inline

In [4]:
train_file = 'data/train.csv'
test_file  = 'data/test.csv'
profiles_file = 'data/profiles.csv'
artists_file = 'data/artists.csv'

In [None]:
import musicbrainzngs
musicbrainzngs.set_useragent("Example music app", "0.1", "http://example.com/music")
artist_id = "c5c2ea1c-4bde-4f4d-bd0b-47b200bf99d6"
def get_artist_info(ids):
    results = dict()
    for id in ids:
        try:
            result = musicbrainzngs.get_artist_by_id(id, includes=['tags','ratings'])
            results[id] = result
        except Exception as e:
            print id
            print("Something went wrong with the request: %s" + str(e))
    return results

In [None]:
artist_infos = get_artist_info(uiids)
import json
with open('data/artist_infos','w') as f:
    json.dump(artist_infos, f)

### User Profile Matrix
#### Load the User data into a DataFrame

In [5]:
profiles = pd.read_csv(profiles_file, header=0)
user_ids = profiles.user

In [6]:
profile_data = pd.DataFrame({'sex': profiles.sex.apply(lambda s: -1 if type(s) == float and np.isnan(s) else s).values, \
                                 'age': profiles.age.apply(lambda s: int(s) if not isnan(s) else -1).values, \
                                 'country': profiles.country.values}, \
                                index=user_ids.values)

In [7]:
profile_data.head()

Unnamed: 0,age,country,sex
fa40b43298ba3f8aa52e8e8863faf2e2171e0b5d,25,Sweden,f
5909125332c108365a26ccf0ee62636eee08215c,29,Iceland,m
d1867cbda35e0d48e9a8390d9f5e079c9d99ea96,30,United States,m
63268cce0d68127729890c1691f62d5be5abd87c,21,Germany,m
02871cd952d607ba69b64e2e107773012c708113,24,Netherlands,m


In [14]:
columns = list(set(profile_data.sex)) + list(set(profile_data.country)) + [15, 20, 25, 30, 35, 40, 45, 50]

In [15]:
#This was helpful: http://stackoverflow.com/questions/2272149/round-to-5-or-other-number-in-python
def custom_round(x, base=5):
    rounded = int(base * round(float(x)/base))
    if rounded <15:
        return 15
    elif rounded > 50:
        return 50
    else:
        return rounded

#### Create a Matrix with 1 and 0 with a auser has a certaint characteristic

In [24]:
# User Profile Matrix
profile_matrix = np.zeros((len(profile_data.index.values), len(columns)))
loop_size = len(profile_data.index.values)

for i, user in enumerate(profile_data.index.values):
    profile = profile_data.ix[user]

    #Sex indices
    sex = profile['sex']
    sex_col = columns.index(sex)
    profile_matrix[i, sex_col] = 1
    
        
    # Country indices
    country = profile['country']
    country_col = columns.index(country)
    profile_matrix[i, country_col] = 1
    
    # Age indices
    age = custom_round(profile['age'])
    profile_matrix[i, columns.index(age)] = 1


In [25]:
user_df = pd.DataFrame(profile_matrix, columns = columns)

In [26]:
user_df.head()

Unnamed: 0,m,-1,f,Canada,Libyan Arab Jamahiriya,Sao Tome and Principe,Turkmenistan,Saint Helena,Montenegro,Lithuania,Saint Pierre and Miquelon,Saint Kitts and Nevis,Ethiopia,Aruba,Sri Lanka,Swaziland,Argentina,Bolivia,Cameroon,Burkina Faso,Bahrain,Saudi Arabia,American Samoa,Montserrat,United States Minor Outlying Islands,Cocos (Keeling) Islands,Slovenia,Guatemala,Bosnia and Herzegovina,Russian Federation,Jordan,"Virgin Islands, British",Spain,Liberia,Netherlands,Armenia,Pakistan,Oman,Marshall Islands,Christmas Island,Gabon,Niue,Finland,Monaco,Bahamas,Wallis and Futuna,New Zealand,Yemen,Jamaica,Albania,...,Suriname,Anguilla,Venezuela,Netherlands Antilles,Israel,Bouvet Island,Iceland,Zambia,Senegal,Papua New Guinea,Zimbabwe,Germany,Martinique,Kazakhstan,Poland,Mauritania,Kyrgyzstan,Mayotte,British Indian Ocean Territory,"Korea, Democratic People's Republic of",New Caledonia,Macedonia,Trinidad and Tobago,Latvia,Hungary,Guadeloupe,Greenland,Honduras,Myanmar,Mexico,Egypt,Cuba,Serbia,Comoros,United Kingdom,Antarctica,Congo,Heard Island and Mcdonald Islands,Greece,Paraguay,Fiji,"Palestinian Territory, Occupied",15,20,25,30,35,40,45,50
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [28]:
traindf = pd.DataFrame.from_csv('data/traindf2')
validatedf = pd.DataFrame.from_csv('data/validatedf2')

In [29]:
validatedf.head()

Unnamed: 0,user,artist,plays
2980218,00000c289a1829a808ac09c00daf10bc3c4e223b,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,717
3010772,00000c289a1829a808ac09c00daf10bc3c4e223b,8000598a-5edb-401c-8e6d-36b167feaf38,222
3748271,00001411dc427966b17297bf4d69e7e193135d89,ffb18e19-64a4-4a65-b4ce-979e00c3c69d,622
4103333,00001411dc427966b17297bf4d69e7e193135d89,451f9db1-f75f-44f9-b218-f8bdf22035a1,2427
3592149,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,2449300a-6ca7-45da-8102-22789d256475,14


In [30]:
uuids = traindf.user.unique()#unique-users
uiids = traindf.artist.unique()#unique-items
uuidmap={v:k for k,v in enumerate(uuids)}#of length U
uiidmap={v:k for k,v in enumerate(uiids)}#of length M

In [33]:
import json
with open('data/artist_infos','r') as f:
    artist_infos = json.load(f)

In [74]:
for v in artist_infos.values():
    print v 

{u'artist': {u'name': u'Liars', u'area': {u'sort-name': u'United States', u'id': u'489ce91b-6658-3307-9877-795b68554c98', u'name': u'United States', u'iso-3166-1-code-list': [u'US']}, u'country': u'US', u'life-span': {u'begin': u'2000'}, u'tag-list': [{u'count': u'1', u'name': u'alternative rock'}, {u'count': u'1', u'name': u'art punk'}, {u'count': u'1', u'name': u'dance-punk'}, {u'count': u'1', u'name': u'experimental rock'}, {u'count': u'1', u'name': u'indie rock'}, {u'count': u'1', u'name': u'noise rock'}, {u'count': u'1', u'name': u'rock'}, {u'count': u'1', u'name': u'rock and indie'}], u'id': u'03098741-08b3-4dd7-b3f6-1b0bfa2c879c', u'begin-area': {u'sort-name': u'Brooklyn', u'id': u'a71b0d32-7752-49e9-8594-2247ad6ac12c', u'name': u'Brooklyn'}, u'sort-name': u'Liars', u'type': u'Group', u'isni-list': [u'0000000107254546']}}
{u'artist': {u'rating': {u'rating': u'5', u'votes-count': u'2'}, u'name': u'CunninLynguists', u'area': {u'sort-name': u'United States', u'id': u'489ce91b-6658-

In [70]:
unique_countries = set([artist_infos[artist]['artist']['country'] for artist in artist_infos if 'country' in artist_infos[artist]['artist'].keys()])

In [50]:
from collections import defaultdict
tags = defaultdict(list)
for k,v in artist_infos.iteritems():
    artist = k
    try:
        taglist = v['artist']['tag-list']
        for i,e in enumerate(taglist):
            tag = e['name']
            tags[artist].append(tag)
            if i==2: break
    except:
        pass

no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags
no tags


In [53]:
for tag in tags.values()[:10]:
    print tag

[u'alternative rock', u'art punk', u'dance-punk']
[u'american', u'conscious hip hop', u'hip hop']
[u'rock and indie']
[u'bach goldberg variations', u'bach the goldberg variations', u'brahms']
[u'rock and indie']
[u'1980', u'alternative dance', u'alternative rock']
[u'colombia', u'colombian', u'cumbia']
[u'austria', u'austrian', u'black metal']
[u'american', u'dance-pop', u'new wave']
[u'alternative metal', u'american', u'death metal']


In [60]:
unique_tags = np.unique([tag for tag in tags.values() for tag in tag])

In [72]:
artist_cols = ['male','female'] + list(unique_countries) + list(unique_tags)

In [95]:
artist_cols.index('female')

1

In [109]:
# User Profile Matrix
artist_matrix = np.zeros((len(artist_infos), len(artist_cols)))

for i, artist in enumerate(artist_infos):
    profile = artist_infos[artist]['artist']
    try:
        sex = profile['gender']
        if sex is 'Male':
            male_col = artist_cols.index('male')
            artist_matrix[i, male_col] = 1
        if sex is 'Female':
            female_col = artist_cols.index('female')
            artist_matrix[i, female_col] = 1
    except:
        pass
    # Country indices
    try:
        country = profile['country']
        country_col = artist_cols.index(country)
        artist_matrix[i, country_col] = 1
    except:
        pass
    
    # Age indices
    try:
        taglist = tags[artist]
        for tag in taglist:
            tag_col = artist_cols.index(tag)
            artist_matrix[i, tag_col] = 1
    except:
        pass


In [85]:
for i, artist in enumerate(artist_infos):
    profile = artist_infos[artist]['artist']
    print profile
    #country = profile['country']
    #print country
    tag = tags[artist]
    print tag
    if i== 3: break

{u'name': u'Liars', u'area': {u'sort-name': u'United States', u'id': u'489ce91b-6658-3307-9877-795b68554c98', u'name': u'United States', u'iso-3166-1-code-list': [u'US']}, u'country': u'US', u'life-span': {u'begin': u'2000'}, u'tag-list': [{u'count': u'1', u'name': u'alternative rock'}, {u'count': u'1', u'name': u'art punk'}, {u'count': u'1', u'name': u'dance-punk'}, {u'count': u'1', u'name': u'experimental rock'}, {u'count': u'1', u'name': u'indie rock'}, {u'count': u'1', u'name': u'noise rock'}, {u'count': u'1', u'name': u'rock'}, {u'count': u'1', u'name': u'rock and indie'}], u'id': u'03098741-08b3-4dd7-b3f6-1b0bfa2c879c', u'begin-area': {u'sort-name': u'Brooklyn', u'id': u'a71b0d32-7752-49e9-8594-2247ad6ac12c', u'name': u'Brooklyn'}, u'sort-name': u'Liars', u'type': u'Group', u'isni-list': [u'0000000107254546']}
[u'alternative rock', u'art punk', u'dance-punk']
{u'rating': {u'rating': u'5', u'votes-count': u'2'}, u'name': u'CunninLynguists', u'area': {u'sort-name': u'United States'

In [110]:
artist_df = pd.DataFrame(artist_matrix, columns=artist_cols)
artist_df.head(10)

Unnamed: 0,male,female,BE,FR,DK,BM,DE,JP,JM,BR,JE,FI,PR,RU,NL,PT,NO,CV,NZ,CU,IS,PL,CH,CO,GR,CA,IT,ZA,AR,AU,IL,IN,IE,XW,ES,UY,ML,US,GB,SI,SN,UA,MX,SE,AT,"""ur so fail""",*england→north east→wallsend,+usa california,00s,1,...,soul and reggae,soundtrack,soundtrack composer,south american,southern rock,space rock,spanish,special purpose artist,speed metal,steal your goals,stevie ray vaughan,stockholm,stoner metal,stoner rock,stoner sludge metal,straight edge,summer song,supergroup,supremo,surf,sverige,sweden,swedish,swing,symphonic metal,synth pop,synth rock,synthpop,synthpunk,technical death metal,texas,the calling,thrash metal,todo-works,tracker,trance,travis barker,trip hop,uk,uk garage,underground hip-hop,united kingdom,united states,urban folk,us,usa,vocalist,wellerworld,winnipeg,world
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [160]:
import scipy.sparse as sp
def getmats(indf):
    matrix = np.zeros([indf.shape[0]], dtype=object)
    plays = np.zeros([indf.shape[0]])
    for i,row in enumerate(indf.values):
        newrow = np.zeros(len(columns) + len(artist_cols))
        user = row[0]
        artist = row[1]
        plays[i] = row[2]
        user_ix = uuidmap[user]
        artist_ix = uiidmap[artist]  
        try:
            newrow[:len(columns)] = user_df.loc[user_ix]
            newrow[len(columns):] = artist_df.loc[artist_ix]
        except:
            pass
        newrow = sp.coo_matrix(newrow)
        matrix[i] = newrow
    matrix = sp.vstack(matrix)
    return matrix, plays

In [159]:
%%time
traindm, trainplays = getmats(traindf)
validatedm, validateplays = getmats(validatedf)

boo
CPU times: user 4min 36s, sys: 7.02 s, total: 4min 43s
Wall time: 4min 51s


In [148]:
validatedf.values[5]

array(['000063d3fe1cf2ba248b9e3c3f0334845a27a6bf',
       '61ed9c9c-79eb-4e8f-8015-bd599ac0ab49', 11], dtype=object)

In [136]:
from sklearn.linear_model import Ridge

In [162]:
ridge = Ridge()
ridge.fit(validatedm, validateplays)


Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [163]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(validateplays, ridge.predict(validatedm))

234.12769759929049