In [16]:
import pandas as pd
import numpy as np
from collections import defaultdict
import scipy.io as sio
import scipy.sparse as ss
import pickle
from sklearn.model_selection import train_test_split

In [17]:
#读取数据
data = pd.read_csv("triplet_dataset_sub.csv")
data.head()

Unnamed: 0,user,song,play_count
0,4e11f45d732f4861772b2906f81a7d384552ad12,SOCKSGZ12A58A7CA4B,1
1,4e11f45d732f4861772b2906f81a7d384552ad12,SOCVTLJ12A6310F0FD,1
2,4e11f45d732f4861772b2906f81a7d384552ad12,SODLLYS12A8C13A96B,3
3,4e11f45d732f4861772b2906f81a7d384552ad12,SOEGIYH12A6D4FC0E3,1
4,4e11f45d732f4861772b2906f81a7d384552ad12,SOFRQTD12A81C233C0,2


In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37519 entries, 0 to 37518
Data columns (total 3 columns):
user          37519 non-null object
song          37519 non-null object
play_count    37519 non-null int64
dtypes: int64(1), object(2)
memory usage: 879.4+ KB


In [19]:
X_data = data.drop(['play_count'], axis=1)
y_data = data['play_count']

#划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

In [20]:
train_data.head()

Unnamed: 0,user,song,play_count
23616,2c2790c0ce23f00f1b642fd356ec6854a133d083,SOPGCOH12A8C139789,1
28787,4ea4ef23853df11d4321bc458471a7b42aeb54cf,SOMWCVL12AF729E81A,2
36953,53cc3e95468819addbfcaa1256b460984c581be3,SOWSPUS12AC468BEE3,8
1079,6ccd111af9b4baa497aacd6d1863cbf5a141acc6,SOVXCNA12A58A7881A,1
4859,79a1ba30cec2ee1509f3c2df697916fe901b37e9,SOGSAYQ12AB018BA14,1


In [21]:
test_data.head()

Unnamed: 0,user,song,play_count
28439,f8181f9b3d85fa4ac04c66bc9f84f0ad2a18a777,SOPUELG12A6701D215,4
20964,6bfaf522b9bfb59370d8adfa7f6093579502d775,SOVVENS12A8C144DB5,17
27315,002b63a7e2247de6d62bc62f253474edc7dd044c,SOMCMKG12A8C1347BF,6
12408,fe9a05c03c29da973743a83b80d1660748077432,SOMDVSL12A6D4F7230,8
2749,1820cfffd52cad7b3af398f379524d51579655d2,SOWIMTL12A8C1386DC,1


In [22]:
#获取所有用户和所有歌曲的ID标识
total_users_id = train_data['user'].unique()
total_songs_id = train_data['song'].unique()
print('total user {}, total song {}'.format(len(total_users_id), len(total_songs_id)))

total user 787, total song 800


In [23]:
#对用户和歌曲ID进行编号
users_index = dict()
songs_index = dict()

for index, user in enumerate(total_users_id):
    users_index[user] = index
    print('user:{}, index:{}'.format(user, index))

for index, song  in enumerate(total_songs_id):
    songs_index[song] = index
    print('song:{}, index:{}'.format(song, index))

user:2c2790c0ce23f00f1b642fd356ec6854a133d083, index:0
user:4ea4ef23853df11d4321bc458471a7b42aeb54cf, index:1
user:53cc3e95468819addbfcaa1256b460984c581be3, index:2
user:6ccd111af9b4baa497aacd6d1863cbf5a141acc6, index:3
user:79a1ba30cec2ee1509f3c2df697916fe901b37e9, index:4
user:4aa35a6181dc8ba9635790467dcf4f0d57fee0c0, index:5
user:99259b34141119e5577009ae478cb9bf3f47f34d, index:6
user:67874d1a189c83326c529e554be6f7acf55effae, index:7
user:4552b85a1b315556ad50d4a10942a3e86fc7d72c, index:8
user:2102cfa15b3702123d179c1559717e1886170139, index:9
user:625d0167edbc5df88e9fbebe3fcdd6b121a316bb, index:10
user:c7417a59a6d67ef869bf970671b5246c4e3e16d6, index:11
user:8ec7fd0c1acf1dbe44720e5eab44dbe524eb6caf, index:12
user:f2a03543373cfed80f076f3337360630f084ad30, index:13
user:860fb8adde6bc5718f49cc995a8655be11c6536c, index:14
user:113255a012b2affeab62607563d03fbdf31b08e7, index:15
user:6a8fb4968a5f3b0a5b3708258e2582f66001b15a, index:16
user:63aa3e991a33c309016051541b8084ff0aec7284, index:17
us

In [24]:
#数据保存
pickle.dump(total_users_id, open('total_users_id.pkl', 'wb'))
pickle.dump(total_songs_id, open('total_songs_id.pkl', 'wb'))
pickle.dump(users_index, open('users_index.pkl', 'wb'))
pickle.dump(songs_index, open('songs_index.pkl', 'wb'))

In [25]:
#记录用户对歌曲是否感兴趣，个人认为至少播放2次才是感兴趣
interest = np.array(train_data['play_count'])
interest[interest < 2] = 0
interest[interest >= 2] = 1
train_data['interest'] = interest
train_data.head()

Unnamed: 0,user,song,play_count,interest
23616,2c2790c0ce23f00f1b642fd356ec6854a133d083,SOPGCOH12A8C139789,1,0
28787,4ea4ef23853df11d4321bc458471a7b42aeb54cf,SOMWCVL12AF729E81A,2,1
36953,53cc3e95468819addbfcaa1256b460984c581be3,SOWSPUS12AC468BEE3,8,1
1079,6ccd111af9b4baa497aacd6d1863cbf5a141acc6,SOVXCNA12A58A7881A,1,0
4859,79a1ba30cec2ee1509f3c2df697916fe901b37e9,SOGSAYQ12AB018BA14,1,0


In [35]:
#根据次数转换为分数
min_count = train_data['play_count'].min()
max_count = train_data['play_count'].max()
diff = max_count - min_count
cut_list = [0, min_count+diff//5, min_count+diff*2//5, min_count+diff*3//5, min_count+diff*4//5, min_count+diff]
labels = [1, 2, 3, 4, 5]
train_data['score'] = pd.cut(train_data['play_count'], cut_list, labels=labels)
train_data.head()

Unnamed: 0,user,song,play_count,interest,score
23616,2c2790c0ce23f00f1b642fd356ec6854a133d083,SOPGCOH12A8C139789,1,0,1
28787,4ea4ef23853df11d4321bc458471a7b42aeb54cf,SOMWCVL12AF729E81A,2,1,1
36953,53cc3e95468819addbfcaa1256b460984c581be3,SOWSPUS12AC468BEE3,8,1,1
1079,6ccd111af9b4baa497aacd6d1863cbf5a141acc6,SOVXCNA12A58A7881A,1,0,1
4859,79a1ba30cec2ee1509f3c2df697916fe901b37e9,SOGSAYQ12AB018BA14,1,0,1


In [44]:
#统计用户听过的歌曲，歌曲有哪些用户听过
user_song_dict = defaultdict(set)
song_user_dict = defaultdict(set)

# #记录用户对歌曲是否感兴趣和次数
user_song_interest = ss.dok_matrix((len(total_users_id), len(total_songs_id)))
user_song_score = ss.dok_matrix((len(total_users_id), len(total_songs_id)))

for index, row in train_data.iterrows():
    cur_user_id = users_index[row['user']]
    cur_song_id = songs_index[row['song']]
    
    if(row.interest != 0):
        #记录用户听过的歌曲
        user_song_dict[cur_user_id].add(cur_song_id)

        #记录听过歌曲的用户
        song_user_dict[cur_song_id].add(cur_user_id)

        #记录用户对歌曲的兴趣
        user_song_interest[cur_user_id, cur_song_id] = row['interest']
        user_song_score[cur_user_id, cur_song_id] = row['score']

IndentationError: expected an indented block (<ipython-input-44-165217b962d1>, line 15)

In [37]:
user_song_score

<787x800 sparse matrix of type '<class 'numpy.float64'>'
	with 30015 stored elements in Dictionary Of Keys format>

In [38]:
sio.mmwrite("user_song_interest", user_song_interest)
sio.mmwrite("user_song_score", user_song_interest)

In [29]:
#存储数据
pickle.dump(user_song_dict, open('user_song_dict.pkl', 'wb'))
pickle.dump(song_user_dict, open('song_user_dict.pkl', 'wb'))

In [42]:
#保存测试数据
interest = np.array(test_data['play_count'])
interest[interest < 2] = 0
interest[interest >= 2] = 1
test_data['interest'] = interest

#根据次数转换为分数
min_count = test_data['play_count'].min()
max_count = test_data['play_count'].max()
diff = max_count - min_count
cut_list = [0, min_count+diff//5, min_count+diff*2//5, min_count+diff*3//5, min_count+diff*4//5, min_count+diff]
labels = [1, 2, 3, 4, 5]
test_data['score'] = pd.cut(test_data['play_count'], cut_list, labels=labels)
test_data.head()

Unnamed: 0,user,song,play_count,interest,score
28439,f8181f9b3d85fa4ac04c66bc9f84f0ad2a18a777,SOPUELG12A6701D215,4,1,1
20964,6bfaf522b9bfb59370d8adfa7f6093579502d775,SOVVENS12A8C144DB5,17,1,1
27315,002b63a7e2247de6d62bc62f253474edc7dd044c,SOMCMKG12A8C1347BF,6,1,1
12408,fe9a05c03c29da973743a83b80d1660748077432,SOMDVSL12A6D4F7230,8,1,1
2749,1820cfffd52cad7b3af398f379524d51579655d2,SOWIMTL12A8C1386DC,1,0,1


In [43]:
test_data.to_csv('test_data.csv', index=True)