# Content-based recommendation

In [1]:
import numpy as np
import pandas as pd

### Read and clean raw data

In [3]:
f = pd.read_csv('song_features.csv')
f.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9840 entries, 0 to 9839
Data columns (total 91 columns):
song_id               9839 non-null object
TimbreAvg1            9839 non-null float64
TimbreAvg2            9839 non-null float64
TimbreAvg3            9839 non-null float64
TimbreAvg4            9839 non-null float64
TimbreAvg5            9839 non-null float64
TimbreAvg6            9839 non-null float64
TimbreAvg7            9839 non-null float64
TimbreAvg8            9839 non-null float64
TimbreAvg9            9839 non-null float64
TimbreAvg10           9839 non-null float64
TimbreAvg11           9839 non-null float64
TimbreAvg12           9839 non-null float64
TimbreCovariance1     9839 non-null float64
TimbreCovariance2     9839 non-null float64
TimbreCovariance3     9839 non-null float64
TimbreCovariance4     9839 non-null float64
TimbreCovariance5     9839 non-null float64
TimbreCovariance6     9839 non-null float64
TimbreCovariance7     9839 non-null float64
TimbreCovarian

In [47]:
np_f = f.drop('song_id',axis=1).values

In [50]:
np_f[np.isnan(np_f)]= 0

### PCA

In [53]:
# reduce the features to 3 dimension and convert it to RGB
from sklearn.decomposition import PCA
pca = PCA(n_components=3,random_state=123)

In [54]:
f_3d = pca.fit_transform(np_f)

In [55]:
df_f3d = pd.DataFrame(f_3d)

### Convert 3 dimension data to RGB

In [56]:
from sklearn.preprocessing import MinMaxScaler

In [57]:
df_rgb = MinMaxScaler(feature_range=(0, 255)).fit_transform(df_f3d)

In [58]:
df_rgb=df_rgb.round()

In [59]:
df_rgb = pd.DataFrame(df_rgb,columns=['r','g','b'])

In [60]:
df_rgb.head()

Unnamed: 0,r,g,b
0,12.0,105.0,107.0
1,21.0,96.0,112.0
2,13.0,105.0,108.0
3,22.0,95.0,113.0
4,12.0,101.0,107.0


### Calculate similarities

In [61]:
# Use consine similarity to find the index of most similar songs
from sklearn.metrics.pairwise import cosine_similarity

In [62]:
s = cosine_similarity(df_rgb)

In [63]:
s = np.argsort(s,axis=1)

In [64]:
s = s[:,-9:]
s = s[:,::-1]
s

array([[   0, 6905, 4425, ..., 4168, 2717, 2956],
       [3810,    1,  411, ..., 3524, 3104, 1254],
       [9231,    2, 2152, ..., 8069, 3413, 8186],
       ...,
       [9837, 9735, 2735, ...,  560, 8459, 4029],
       [9838, 9030, 4853, ..., 5857, 9142, 2304],
       [9839, 9519, 4682, ..., 6596, 2837, 3164]], dtype=int64)

### Find song_id for recommended songs

In [70]:
df = f.song_id
result = df

In [71]:
for i in range(9):
    dfi = df.iloc[s[:,i]].reset_index()
    dfi = dfi['song_id']
    dfi = pd.DataFrame(dfi)
    dfi.columns=['song_id'+str(i+1)]
    result = pd.concat([result,dfi],sort=False,axis=1)

In [73]:
df = result[['song_id']+['song_id'+str(i) for i in range(1,10)]]

In [75]:
df.head()

Unnamed: 0,song_id,song_id1,song_id2,song_id3,song_id4,song_id5,song_id6,song_id7,song_id8,song_id9
0,SOHNWIM12A67ADF7D9,SOHNWIM12A67ADF7D9,SOIKEMW12A8C13D826,SONVIOU12AB0181500,SOJRBZX12AB017D20E,SOPMAEU12A67AE0C81,SOZPAMO12A8C140C32,SOUWSEZ12A81C20F90,SOFCQGW12A58A7BC25,SOJDNPX12A6310E10F
1,SOECFIW12A8C144546,SOCVMXL12A6D4F6EE7,SOECFIW12A8C144546,SOOSVBQ12AAF3B360E,SONLCTW12A58A7BB70,SOMSTPF12AF729F337,SOTSTAA12AB017F16C,SOPFFKV12AF729FD3A,SOXKQUX12AF72AA82A,SOOPYMV12A6D4FC0D7
2,SOGWEOB12AB018A4D0,SOUPNIS12AF72A6FB0,SOGWEOB12AB018A4D0,SOILCEB12AF72AD86D,SOBPTVX12AB017B9AD,SOKWEAS12A8AE45448,SOWIGCV12A6D4F6A35,SOLBQLF12AB018612F,SODRMUG12A58A79A4E,SOXVXDS12AF72A4704
3,SOJGCRL12A8C144187,SOJGCRL12A8C144187,SOHLXSM12A6D4F9413,SOQKMWH12AF72A6B41,SOYKTWE12A8C13A164,SOVIIOV12A67020315,SOQDAYK12A8C13E11E,SODFMEY12A8C1353BD,SOPHQPE12AB0183798,SOFLMTG12A58A7D556
4,SOHNFBA12AB018CD1D,SOHNFBA12AB018CD1D,SOKTHPP12A8AE47F2A,SOWZOTZ12A6D4F8A0A,SOXKBHN12AB0185115,SOVTDJR12A8AE47EDA,SOKZRSH12A8C13CA3D,SOQZGYS12AF72A23DA,SOCNYYO12A6D4F910B,SOPSDYY12A8C13E1D2


### Save to csv

In [76]:
similarity = pd.concat([df,df_rgb],sort=False,axis=1)

In [77]:
similarity.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9840 entries, 0 to 9839
Data columns (total 13 columns):
song_id     9839 non-null object
song_id1    9839 non-null object
song_id2    9839 non-null object
song_id3    9837 non-null object
song_id4    9839 non-null object
song_id5    9839 non-null object
song_id6    9838 non-null object
song_id7    9840 non-null object
song_id8    9839 non-null object
song_id9    9840 non-null object
r           9840 non-null float64
g           9840 non-null float64
b           9840 non-null float64
dtypes: float64(3), object(10)
memory usage: 1.4+ MB


In [200]:
similarity.to_csv('similarity.csv')