In [158]:
import os
import pandas as pd
import numpy as np
import numpy 
import sys
from collections import *
pd.options.mode.chained_assignment = None
from tqdm import tqdm
import random

In [159]:
os.chdir('/Users/DennisLin/Documents/Python/CSE258/final/kkbox_recommender/')
data_path = './kkbox-music-recommendation-challenge/'
song_path = os.path.join(data_path, 'songs.csv')
song_info = os.path.join(data_path, 'song_extra_info.csv')
train_path = os.path.join(data_path, 'train.csv')

In [160]:
ls kkbox-music-recommendation-challenge/


members.csv               song_extra_info.csv       test.csv
members.csv.7z            song_extra_info.csv.7z    test.csv.7z
sample_submission.csv     songs.csv                 train.csv
sample_submission.csv.7z  songs.csv.7z              train.csv.7z


### Members

In [161]:
class Members():
    def __init__(self, path):
        self.members = pd.read_csv(path)
        self.preprocess()
    
    def preprocess(self):
        '''
        1. remove wierd age
        2. replace registers -> years
        3. add using date
        4. gender to three class
        '''
        #print info
        self.info()
        
        # 1
        wierd_age = list()
        for age, count in Counter(self.members['bd']).items():
            if count < 10:
                wierd_age.append(age)
        for drop_age in wierd_age:
            self.members = self.members[self.members['bd'] != drop_age]        
        
        #2 & 3
        self.members['registration_init_time'] = pd.to_datetime(self.members['registration_init_time'], format='%Y%m%d')
        self.members['expiration_date'] = pd.to_datetime(self.members['expiration_date'], format='%Y%m%d')
        self.members = self.members[self.members['expiration_date'] != '1970-01-01 00:00:00']
        self.members['activation_days'] = self.members['expiration_date'] - self.members['registration_init_time']
#         self.members['activation_days'].astype(int)
        days = self.members['activation_days'].values
        days = days.astype('timedelta64[D]')
        days = days / np.timedelta64(1, 'D')
        self.members['activation_days_int'] = days
        self.members['expiration_date'] = pd.DatetimeIndex(self.members['expiration_date']).year
        self.members['expiration_date'] -= min(self.members['expiration_date'])
        self.members['registration_init_time'] = pd.DatetimeIndex(self.members['registration_init_time']).year
        self.members['registration_init_time'] -= min(self.members['registration_init_time'])


        
        #4    
        self.members['gender'][self.members['gender'].isnull()] = 0
        self.members['gender'][self.members['gender'] == 'male'] = 1
        self.members['gender'][self.members['gender'] == 'female'] = -1
        
        #print info
        self.info()


    
    def info(self):
        print("The number of total users is: {}".format(len(self.members)))

In [162]:
members = Members(os.path.join(data_path, 'members.csv'))

The number of total users is: 34403
The number of total users is: 34288


### Songs

In [163]:
class clean_song_data():
    def __init__(self, song, song_info):
        self.song = pd.read_csv(song)
        self.song_info = pd.read_csv(song_info)
        self.result = pd.merge(self.song, self.song_info, on='song_id')
        self.filter_1()
        self.filter_2()
        self.add_count()
        self.convert_type()
        
    def filter_1(self):
        '''
        1. Filter out NaN
        2. Use only English songs
        
        '''
        header = []
        self.result = self.result.dropna()
#         for i in self.result.columns:
#             header.append(i)
#         for i in header:
#             self.result = self.result[pd.notnull(self.result[i])]
        
        # 52.0 = English
        self.result = self.result[self.result.language == 52.0]
    
    def filter_2(self):
        '''
        genre and artist count > 15
        
        '''
        header = ['genre_ids','artist_name']
        for i in header:
            k = pd.DataFrame(self.result.groupby(i).size().items())
            k = k.rename(columns={0: 'item', 1: 'count'})
            k = k[k['count']>15]
            self.result = self.result[self.result[i].isin(k['item'])]
            self.result.head()
            print(i,len(self.result))
            
    def add_count(self):
        '''
        add count for genre, composer 
        
        '''
        header = ['genre_ids', 'composer']
        name = ['gener_count', 'composer_count']
        for num,i in enumerate(header):
            l = []
            for j in self.result[i]:
                l.append(len(j.split('|')))
            self.result[name[num]] = l
            
    def convert_type(self):
        '''
        Convert str to list
        '''
        tmp = []
        for i in self.result['genre_ids'].tolist():
            tmp.append(list(map(int,i.split('|'))))
        self.result['genre_ids_list'] = tmp

In [164]:
song = clean_song_data(song_path, song_info)

genre_ids 168873
artist_name 121316


### Train

In [165]:
class Data():
    def __init__(self, path1, songs, users):
        self.train = pd.read_csv(path1)
        self.songs = songs
        self.users = users
        self.train = self.filterd(self.train)
        Ntrain = int(len(self.train)*0.8)
        Nval = int(len(self.train)*0.1)
        
        train = self.train[:Ntrain]
        val = self.train[Ntrain:Ntrain+Nval]
        test = self.train[Ntrain+Nval:]
        self.train = train
        self.val = val
        self.test = test
        
        self.train_p = self.train[self.train['target'] == 1]
        self.train_n = self.train[self.train['target'] == 0]
        self.ppl2songs_p = dict(self.train_p.groupby('msno')["song_id"].apply(list))
        self.ppl2songs_n = dict(self.train_n.groupby('msno')["song_id"].apply(list))
        self.songs2count = dict(self.train.groupby('song_id').size())
        self.song2user = dict(self.train.groupby("song_id")["msno"].apply(list))
#         self.songs_set = set(self.songs['song_id'])
#         self.all = pd.concat([self.train, self.test], axis=0, sort=False)
#         self.people2songs_all = self.all.groupby('msno')["song_id"].apply(set)
        
        self.user2idx, self.idx2user = self.getIDs(users.members)
        self.song2idx, self.idx2song = self.getIDs(songs.result)
    
    def filterd(self, pd):
        valid_user = set(self.users.members['msno'])
        valid_song = set(self.songs.result['song_id'])
        pd = pd[pd['msno'].isin(valid_user)]
        pd = pd[pd['song_id'].isin(valid_song)]
        return pd
    
    
    def getIDs(self, df):
        count = 0
        name2idx = dict()
        idx2name = dict()
        for row in df.values:
            name2idx[row[0]] = count
            idx2name[count] = row[0]
            count += 1
        return name2idx, idx2name
    
    def sample(self):
        out = []
        for ppl in tqdm(self.ppl2songs_p.keys()):
            for i, song in enumerate(self.ppl2songs_p[ppl]):
                if ppl in self.ppl2songs_n and i < len(self.ppl2songs_n[ppl]):
                    n_song = self.ppl2songs_n[ppl][i]
                else:
                    n_song = random.choice(list(set(self.songs.result['song_id']) - set(self.ppl2songs_p[ppl])))
                out.append((ppl, song, n_song))
        return out 
    
def sample(positive_data, all_songs, people2songs):
    negative_dict = dict()
    for people, count in tqdm(positive_data.groupby('msno').size().items()):
        negative_dict[people] = random.sample(all_songs - people2songs[people], count)
    append_list = []
    for people, songs in negative_dict.items():
        for song in songs:
            append_list.append([people, song, 0])
    positive_matrix = np.hstack((positive_data.values[:, :2], np.ones((positive_data.shape[0], 1))))
    return np.vstack((positive_matrix, np.array(append_list)))

In [183]:
data = Data(train_path, song, members)

In [184]:
data.train['target'].value_counts()

1    143272
0    114634
Name: target, dtype: int64

In [185]:
data.val.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target
5722126,fPh0abPc3u8Bvle2s+dB4ah0fs6B5Z8IyrHqTC8A3fU=,g2IlsKlp+Sjmv1iP4x6tyCAY0/EquENkR8cYhOPSwzA=,my library,Local playlist more,local-playlist,1
5722129,CCmJDACtCS7oUjcM7pV2c+RJxdCynv9bHouPYnU0yzg=,UjdCZ+TB/6cNWIB1vF0RnvWuwH7J8slmBcEFlGJK884=,my library,Local playlist more,local-library,0
5722156,At/AMrMaeZ6Asnal0XlRj529HbBnr+DVD66AQrNsFl8=,IKMFuL0f5Y8c63Hg9BXkeNJjE0z8yf3gMt/tOxF4QNE=,discover,Online playlist more,online-playlist,0
5722167,K8N3FU9k9M1WQxtFI8BY2tlcMw/p6rygMdma8SAldpU=,78S6h6Qdw4DTKs2s95r+PjRfy9lgdlilt68WwUVpSQ0=,discover,Artist more,top-hits-for-artist,0
5722169,K8N3FU9k9M1WQxtFI8BY2tlcMw/p6rygMdma8SAldpU=,4+oY89VfVfYHPWSFj18mWvUBBUeCKsuRP/Ya1Pc53lE=,discover,Artist more,top-hits-for-artist,0


In [170]:
class Pop():
    def __init__(self, train, percentile=0.5):
        self.train = train
        self.pop = self.gen_pop(percentile)
    
    def gen_pop(self, percentile):
        counts = 0
        out_set = set()
        train_pos = []
        train_label = self.train['target'].values
        train_songid = self.train['song_id'].values
        for i in range(len(train_label)):
            if train_label[i] == 1:
                train_pos.append(train_songid[i])
        count = Counter(train_pos)
        s = [[j,i] for i,j in count.items()]
        s.sort(key=lambda tup: tup[0], reverse = True)
#         vc= self.train['song_id'].value_counts()
        for count, song in s:
            out_set.add(song)
            counts += count
            if counts > len(data.train)*percentile:
                break
        return out_set
    
    def baseline(self, u, song):
        if song in self.pop:
            return 1
        else:
            return 0
        
    
class Evaluation():
    def __init__(self, data):
        self.test = data.test
        
    def accuracy(self, model):
        test_matrix = self.test.values
        label = []
        pred = []
        for i in range(test_matrix.shape[0]):
            u, s, l = test_matrix[i][0], test_matrix[i][1], test_matrix[i][5]
            pred.append(model(u, s))
            label.append(l)
        
        return sum(np.array(pred) == np.array(label))/len(label)

In [186]:
model = Pop(data.train)
eval = Evaluation(data)
print("Accuracy of prediction is {}".format(eval.accuracy(model.baseline)))

Accuracy of prediction is 0.4796674834827383


### Jaccard

In [191]:
class Jac():
    def __init__(self, data, train):
        self.data = data
        self.train = train
        self.user2song = defaultdict(set)
        self.song2user = defaultdict(set)
        self.create()
        
    def create(self):
        self.train_pair = self.data.train[['msno','song_id']].values
        self.val_pair = self.data.val[['msno','song_id']].values
        self.test_pair = self.data.test[['msno','song_id']].values
        for user, song in self.train_pair:
            self.user2song[user].add(song)
            self.song2user[song].add(user)
        
    def Jaccard(self, s1, s2):
        numer = len(s1.intersection(s2))
        denom = len(s1.union(s2))
        return numer / denom

    def cal_acc(self, tmp):
        test_ground_truth = self.data.test['target'].values
        count = 0
        for num,i in enumerate(test_ground_truth):
            if test_ground_truth[num] == tmp[num]:
                count += 1
        acc = count/len(tmp)
        print('Jaccard accuracy on test set:{}'.format(acc))
        return acc

    def use_jac(self, thres):
        tmp = []
        count = 0
        for user, song in self.test_pair:
            count += 1
            for s in self.user2song[user]:
                score = self.Jaccard(self.song2user[song], self.song2user[s])
                if score > thres:
                    tmp.append(1)
                    break
            if len(tmp) != count:
                tmp.append(0)

        acc = self.cal_acc(tmp)
        return acc, tmp
    
    def use_jac_avg(self, thres):
        out = []
        count = 0
        for user, song in self.test_pair:
            tmp = []
            count += 1
            for s in self.user2song[user]:
                score = self.Jaccard(self.song2user[song], self.song2user[s])
                tmp.append(score)
            tmp.sort(reverse = True)
            if len(tmp[:3]) == 0:
                out.append(0)
                continue
            avg = sum(tmp[:3])/len(tmp[:3])
            if avg > thres:
                out.append(1)
                continue
            if len(out) != count:
                out.append(0)
        acc = self.cal_acc(out)
        return acc
        

    def use_jac_feature(self):
        out1, out2 = [], []
        for user, song in self.test_pair:
            tmp = []
            for u in self.song2user[song]:
                score = self.Jaccard(self.user2song[user], self.user2song[u])
                tmp.append(score)
            if len(tmp[:3]) == 0:
                out1.append(0)
                continue
            avg = sum(tmp[:3])/len(tmp[:3])
            out1.append(avg)
            

        for user, song in self.test_pair:
            tmp = []
            for s in self.user2song[user]:
                score = self.Jaccard(self.song2user[song], self.song2user[s])
                tmp.append(score)
            if len(tmp[:3]) == 0:
                out2.append(0)
                continue
            avg = sum(tmp[:3])/len(tmp[:3])
            out2.append(avg)
            
        return out1, out2
    
    def feature_song(self, user, song):
        tmp = []
        for u in self.song2user[song]:
            score = self.Jaccard(self.user2song[user], self.user2song[u])
            tmp.append(score)
        if len(tmp[:3]) == 0:
            top = 0
        else:
            top = sum(tmp[:3])/len(tmp[:3])
            
        if len(tmp[-3:]) == 0:
            tail = 0
        else:
            tail = sum(tmp[-3:])/len(tmp[-3:])
            
        if len(tmp[:]) == 0:
            avg = 0
        else:
            avg = sum(tmp[:])/len(tmp[:])
        return top,avg,tail
    
    def feature_user(self, user, song):
        tmp = []
        for s in self.user2song[user]:
            score = self.Jaccard(self.song2user[song], self.song2user[s])
            tmp.append(score)
        if len(tmp[:3]) == 0:
            top = 0
        else:
            top = sum(tmp[:3])/len(tmp[:3])
            
        if len(tmp[-3:]) == 0:
            tail = 0
        else:
            tail = sum(tmp[-3:])/len(tmp[-3:])
            
        if len(tmp[:]) == 0:
            avg = 0
        else:
            avg = sum(tmp[:])/len(tmp[:])
        return top,avg,tail

In [192]:
jac = Jac(data, data.train)


### One Class Model

In [16]:
data.train.groupby('msno')['target']

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1647e7dd8>

In [180]:
class OneClassModel():
# Randomly initializes two Matrices, Stochastic Gradient Descent to be able to optimize the best factorization for ratings.
    def __init__(self, learning_rate, num_epochs, num_factors, ld1, ld2):
       # super(surprise.AlgoBase)
        self.lr = learning_rate #learning rate for Stochastic Gradient Descent
        self.num_epochs = num_epochs
        self.num_factors = num_factors
        self.ld1 = ld1
        self.ld2 = ld2
              
    def fit(self, data):
        import math

        def sigmoid(gamma):
            if gamma <= 0:
                return 1 - 1/(1 + math.exp(gamma))
            else:
                return 1/(1 + math.exp(-gamma))

        #randomly initialize user/item factors from a Gaussian
        n_users = len(data.users.members)
        n_items = len(data.songs.result)
#         self.U = np.random.normal(scale=1./self.num_factors, size=((n_users,self.num_factors)))
#         self.I = np.random.normal(scale=1./self.num_factors, size=((n_items,self.num_factors)))
        self.U = np.random.normal(0,1e-3,(n_users,self.num_factors))
        self.I = np.random.normal(0,1e-3,(n_items,self.num_factors))
        self.data = data
        pre_mse = sys.maxsize
        for epoch in range(self.num_epochs):
#             random.shuffle(train_ls)
            for ppl in self.data.ppl2songs_p.keys():
#                 if epoch > 20:
#                     random.shuffle(self.data.ppl2songs_p[ppl])
#                     if ppl in self.data.ppl2songs_n:
#                         random.shuffle(self.data.ppl2songs_n[ppl])
                for i, song in enumerate(self.data.ppl2songs_p[ppl]):
                    u = data.user2idx[ppl]
                    ir = data.song2idx[song]
                    if ppl in self.data.ppl2songs_n and i < len(self.data.ppl2songs_n[ppl]):
                        inr = data.song2idx[self.data.ppl2songs_n[ppl][i]]
                    else:
                        continue
#                         inr = data.song2idx[random.choice(list(set(data.songs.result['song_id']) - set(data.ppl2songs_p[ppl])))]
#                         negative = -0.1
                    negative = self.estimate(u, inr)
                    positive = self.estimate(u, ir)
                    factor = sigmoid(negative - positive)
                    temp = self.U[u, :]
                    lr2 = self.lr
                    self.U[u,:] -=  lr2 * ((self.I[inr, :] - self.I[ir, :]) * factor + self.ld1 * self.U[u, :])
                    self.I[ir,:] -=  lr2 * ((-1*temp) * factor + self.ld2 * self.I[ir, :])
                    self.I[inr, :] -= lr2 * ((temp) * factor + self.ld2 * self.I[inr, :])
            accu = self.validate()
            print("Epochs: {} ; Accu = {}".format(epoch, accu))
#             if mse < pre_mse:
#                 pre_mse = mse
#             else:
#                 print("Epochs: {} ; Error = {}".format(epoch, mse))
#                 print('The error rate start to increase')
#                 break
            
                    
    def validate(self):
        label = []
        pred = []
        val = self.data.val[['msno', 'song_id', 'target']].values
        for idx in range(val.shape[0]):
            u = self.data.user2idx[val[idx, 0]]
            i = self.data.song2idx[val[idx, 1]]
            out = self.estimate(u, i)
            if out >= 0:
                pred.append(1)
            else:
                pred.append(0)
            label.append(val[idx, 2])
        return self.accuracy(pred, label)
    
    def estimate(self, uidx, iidx):
        try:
            out = np.dot(self.U[uidx, :], self.I[iidx, :].T)
#             if out >= 0:
#                 return 1
#             else:
#                 return 0
            return out
        except:
            print("Error Occured")
            return 0
#             return random.choice([0, 1])

    def accuracy(self, pred, label):
        return sum(np.array(pred) == np.array(label))/len(label)

In [187]:
OCM = OneClassModel(learning_rate=0.1, num_epochs=50, num_factors=1, ld1=0.01, ld2=0.2)
OCM.fit(data)

Epochs: 0 ; Accu = 0.4977976301259383
Epochs: 1 ; Accu = 0.49751845647993054
Epochs: 2 ; Accu = 0.49689807059991314
Epochs: 3 ; Accu = 0.49531608660586884
Epochs: 4 ; Accu = 0.49630870401389665
Epochs: 5 ; Accu = 0.5018611576400521
Epochs: 6 ; Accu = 0.5021713505800608
Epochs: 7 ; Accu = 0.5012407717600348
Epochs: 8 ; Accu = 0.5000620385880017
Epochs: 9 ; Accu = 0.5024815435200695
Epochs: 10 ; Accu = 0.5033500837520938
Epochs: 11 ; Accu = 0.5047149326881321
Epochs: 12 ; Accu = 0.5051181835101433
Epochs: 13 ; Accu = 0.5063899745641789
Epochs: 14 ; Accu = 0.508033997146225
Epochs: 15 ; Accu = 0.509460884670265
Epochs: 16 ; Accu = 0.5111359265463118
Epochs: 17 ; Accu = 0.5107016564302996
Epochs: 18 ; Accu = 0.5104224827842918
Epochs: 19 ; Accu = 0.5138346051243874
Epochs: 20 ; Accu = 0.5149512997084187
Epochs: 21 ; Accu = 0.5147651839444134
Epochs: 22 ; Accu = 0.5169055152304733
Epochs: 23 ; Accu = 0.5161300328804517
Epochs: 24 ; Accu = 0.5183634220485142
Epochs: 25 ; Accu = 0.51861157640

KeyboardInterrupt: 

In [188]:
OCM.estimate(1, 2)

-0.00013734866052374

### Ensemble Learner

### Train

In [None]:
train_ls = data.val[['msno', 'song_id', 'target']].values
# random.shuffle(train_ls)
features = []
for idx in range(train_ls.shape[0]):
    user, book_r, r = train_ls[idx, :]
    tmp = []
    u = data.user2idx[user]
    ir = data.song2idx[book_r]
    f1 = jac.feature_user(user, book_r)
    f2 = jac.feature_song(user, book_r)
#     tmp.append(data.songs2count[book_r])
#     tmp.extend(list(f1))
#     tmp.extend(list(f2))
    tmp.append(OCM.estimate(u, ir))
    tmp.append(r)
    features.append(tmp)
#     features.append([data.song2count[book_r], 
#                      len(data.train_userRatings[user]), f1, f3, OCM.estimate(u, ir), r])
features = np.array(features)
X = features[:, :-1]
Y = features[:, -1]
mean_X = np.mean(X, axis=0)
var_X = np.var(X, axis=0)
X_norm = X.copy()
X_norm = (X - mean_X)/var_X
X_norm = np.hstack((np.ones((X_norm.shape[0], 1)), X_norm))

### Validate

In [None]:
mod = linear_model.LogisticRegression(C=1)
mod.fit(X_norm,Y)

val_ls = data.val[['msno', 'song_id', 'target']].values
features = []
for user, book_r, r in val_ls:
    tmp = []
    u = data.user2idx[user]
    ir = data.song2idx[book_r]
    f1 = jac.feature_user(user, book_r)
    f2 = jac.feature_song(user, book_r)
    tmp.append(data.songs2count[book_r])
    tmp.extend(f1)
    tmp.extend(f2)
    tmp.append(OCM.estimate(u, ir))
    tmp.append(r)
    features.append(tmp)
features = np.array(features)
val_X, val_Y = features[:, :-1], features[:, -1]
val_X = (val_X - mean_X)/var_X
val_X = np.hstack((np.ones((val_X.shape[0], 1)), val_X))
pred = mod.predict(val_X)
print("C = {}, Accracy = {}".format(c, get_accuracy(pred, val_Y)))