In [3]:
%matplotlib inline
import pandas
from sklearn.model_selection import train_test_split
import numpy as np
import pylab as pl

In [4]:
# Load the listening records; 'song_data.csv' is resolved relative to the current working directory.
song_df = pandas.read_csv('song_data.csv')

In [5]:
# Number of rows in the loaded dataset.
print(len(song_df))

1116609


### 1. Кількість унікальних слухачів Taylor Swift

In [6]:
# Rows whose artist is exactly 'Taylor Swift' (exact string match on the 'artist' column).
song_df_1 = song_df.loc[song_df['artist'].eq('Taylor Swift')]
# Number of distinct users among those rows.
taylor_users = song_df_1['user_id'].nunique()

print(taylor_users)

3246


### 2. Третій за популярністю та третій з кінця виконавець (за сумою listen_count)

In [7]:
# Alias only: song_df_2 refers to the same DataFrame object as song_df (no copy is made).
song_df_2 = song_df

# Total listen_count per artist, ordered from the largest total to the smallest.
artist_group = (
    song_df_2
    .groupby('artist')['listen_count']
    .sum()
    .sort_values(ascending=False)
)

# Third entry from the top and third entry from the bottom of that ordering.
print(f"{artist_group.index[2]} - {artist_group.iloc[2]}")
print(f"{artist_group.index[-3]} - {artist_group.iloc[-3]}")

Björk - 38889
Beyoncé feat. Bun B and Slim Thug - 26


### 3. Топ-1 рекомендація на основі схожості пісень (перші 10 000 записів)

In [8]:
# Work on the first 10,000 rows only; the recommenders below are trained on this subset.
# NOTE(review): 10000 is a magic number — consider a named constant in a config cell.
song_df_3 = song_df.head(10000)

In [9]:
# Hold out 20% of the subset for testing; random_state=0 fixes the shuffle seed.
new_train_data, new_test_data = train_test_split(song_df_3, test_size = 0.20, random_state=0)

In [10]:
class new_item_similarity_recommender_py():
    """Item-similarity recommender scored with the Jaccard index of listener sets.

    Call create() first to attach the training data. recommend() then compares
    every item in the training data with the items the user already has and
    returns the best-scoring items the user does not have yet.
    """

    def __init__(self):
        # Training frame and the names of its user / item columns (set by create()).
        self.train_data = None
        self.user_id = None
        self.item_id = None

    def get_user_items(self, user):
        """Return the unique items (songs) recorded for ``user`` in the training data."""
        user_data = self.train_data[self.train_data[self.user_id] == user]
        return list(user_data[self.item_id].unique())

    def get_item_users(self, item):
        """Return the set of unique users recorded for ``item`` in the training data."""
        item_data = self.train_data[self.train_data[self.item_id] == item]
        return set(item_data[self.user_id].unique())

    def get_all_items_train_data(self):
        """Return every unique item of the training data, in order of first appearance."""
        return list(self.train_data[self.item_id].unique())

    def construct_cooccurence_matrix(self, user_songs, all_songs):
        """Build the len(user_songs) x len(all_songs) Jaccard similarity matrix.

        Entry [j, i] is |listeners(user_songs[j]) & listeners(all_songs[i])|
        divided by the size of the union of the two listener sets, or 0.0 when
        the two songs share no listener.
        """
        # Listener sets of the user's songs are reused for every column, so compute them once.
        user_songs_users = [self.get_item_users(song) for song in user_songs]

        # Cells start at 0.0, so only pairs with at least one shared listener are written.
        cooccurence_matrix = np.matrix(np.zeros((len(user_songs), len(all_songs))), float)

        for i, song in enumerate(all_songs):
            users_i = self.get_item_users(song)
            for j, users_j in enumerate(user_songs_users):
                shared = users_i & users_j
                if shared:
                    cooccurence_matrix[j, i] = len(shared) / len(users_i | users_j)
        return cooccurence_matrix

    def generate_top_recommendations(self, user, cooccurence_matrix, all_songs, user_songs):
        """Turn a similarity matrix into at most 10 ranked recommendations.

        Parameters:
            user: value written to the 'user_id' column of the result.
            cooccurence_matrix: 2-D similarity matrix (np.matrix or ndarray) with
                one row per entry of user_songs and one column per entry of all_songs.
            all_songs: items that the matrix columns refer to.
            user_songs: items to exclude from the result.

        Returns:
            A DataFrame with columns 'user_id', 'song', 'score', 'rank' (rank
            starts at 1), or -1 when there is nothing to recommend.
        """
        scores_by_row = np.asarray(cooccurence_matrix, dtype=float)
        row_count = scores_by_row.shape[0]
        if row_count == 0:
            # No user songs: averaging over zero rows would only produce NaN scores.
            return -1

        # Average similarity of every candidate song over all of the user's songs.
        user_sim_scores = (scores_by_row.sum(axis=0) / float(row_count)).tolist()

        # Highest score first; equal scores are ordered by descending column index.
        ranked = sorted(((score, idx) for idx, score in enumerate(user_sim_scores)), reverse=True)

        columns = ['user_id', 'song', 'score', 'rank']
        records = []
        for score, idx in ranked:
            if len(records) == 10:
                break  # the top 10 is complete, no need to scan the remaining songs
            if np.isnan(score) or all_songs[idx] in user_songs:
                continue
            records.append([user, all_songs[idx], score, len(records) + 1])

        if not records:
            return -1  # every candidate was already in user_songs (or had no valid score)
        return pandas.DataFrame(records, columns=columns)

    def create(self, train_data, user_id, item_id):
        """Attach the training frame and the names of its user and item columns."""
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id

    def recommend(self, user):
        """Run the full pipeline for ``user``; see generate_top_recommendations for the result."""
        user_songs = self.get_user_items(user)
        all_songs = self.get_all_items_train_data()
        cooccurence_matrix = self.construct_cooccurence_matrix(user_songs, all_songs)
        return self.generate_top_recommendations(user, cooccurence_matrix, all_songs, user_songs)

In [11]:
class new_popularity_recommender_py(new_item_similarity_recommender_py):
    """Top-1 variant of the item-similarity recommender.

    NOTE(review): despite the "popularity" in its name, this class ranks songs
    with the inherited item-similarity logic; the name is kept for callers.
    """

    def recommend_one_song(self, user):
        """Return the best recommendation for ``user`` as a Series, or None if there is none."""
        # Reuse the inherited pipeline instead of repeating its steps here.
        df_recommendations = self.recommend(user)

        # The parent signals "nothing to recommend" with a non-DataFrame value.
        if isinstance(df_recommendations, pandas.DataFrame) and not df_recommendations.empty:
            return df_recommendations.iloc[0]
        return None

In [12]:
# Train the top-1 recommender on the training split; songs are identified by the 'song' column.
recommender = new_popularity_recommender_py()
recommender.create(new_train_data, 'user_id', 'song')

# First three distinct user ids, taken from the full dataset in order of appearance.
unique_users = song_df['user_id'].unique()[:3]

# Map each of those users to their single best recommendation (None when there is none).
top1_recommendations = dict(
    zip(unique_users, map(recommender.recommend_one_song, unique_users))
)
top1_recommendations

{'b80344d063b5ccb3212f76538f3d9e43d87dca9e': user_id    b80344d063b5ccb3212f76538f3d9e43d87dca9e
 song                   Your Protector - Fleet Foxes
 score                                      0.038552
 rank                                              1
 Name: 0, dtype: object,
 '85c1f87fea955d09b4bec2e36aee110927aedf9a': user_id    85c1f87fea955d09b4bec2e36aee110927aedf9a
 song                           Puppets - Atmosphere
 score                                           0.5
 rank                                              1
 Name: 0, dtype: object,
 'bd4c6e843f00bd476847fb75c47b4fb430a06856': user_id    bd4c6e843f00bd476847fb75c47b4fb430a06856
 song        Dead Souls [Re-mastered] - Joy Division
 score                                      0.138889
 rank                                              1
 Name: 0, dtype: object}

In [13]:
# Count the rows recorded for each song ('count' counts non-null listen_count rows,
# it does not sum the listens) and express each count as a percentage of all rows.
song_grouped = (
    song_df
    .groupby(['song'])
    .agg({'listen_count': 'count'})
    .reset_index()
)
grouped_sum = song_grouped['listen_count'].sum()

song_grouped['percentage'] = song_grouped['listen_count'] / grouped_sum * 100
# Show the ten songs with the largest share.
song_grouped.sort_values('percentage', ascending=False).head(10)

Unnamed: 0,song,listen_count,percentage
7128,Sehr kosmisch - Harmonia,5970,0.534654
9083,Undo - Björk,5281,0.47295
9879,You're The One - Dwight Yoakam,4806,0.43041
2068,Dog Days Are Over (Radio Edit) - Florence + Th...,4536,0.40623
6775,Revelry - Kings Of Leon,4339,0.388587
3614,Horn Concerto No. 4 in E flat K495: II. Romanc...,3949,0.35366
7116,Secrets - OneRepublic,3916,0.350705
8846,Tive Sim - Cartola,3185,0.285239
2718,Fireflies - Charttraxx Karaoke,3171,0.283985
3486,Hey_ Soul Sister - Train,3132,0.280492


In [14]:
# Summary of the 'song' column: row count, number of distinct songs, most frequent song and its frequency.
song_df['song'].describe()

count                      1116609
unique                        9952
top       Sehr kosmisch - Harmonia
freq                          5970
Name: song, dtype: object

In [15]:
# Distinct user ids across the whole dataset (the next cells show the row count and the user count).
users = song_df['user_id'].unique()

In [16]:
# Total number of rows in the dataset.
len(song_df)

1116609

In [17]:
# Number of distinct users.
len(users)

66346

### Створення рекомендаційної системи для пісень – simple popularity-based recommender

In [18]:
# 80/20 split of the 10,000-row subset with a fixed seed.
# NOTE(review): this repeats the earlier split that produced new_train_data / new_test_data
# with the same arguments — consider reusing those frames instead of splitting twice.
train_data, test_data = train_test_split(song_df_3, test_size = 0.20, random_state=0)
train_data.head(5)

Unnamed: 0,user_id,song_id,listen_count,title,artist,song
7389,354cfdb566f543bb5b810a4d8959d974a30797fd,SOPLUOT12A6D4F7AC3,1,Intergalactic,Beastie Boys,Intergalactic - Beastie Boys
9275,0ec9cc33028dff6209aa49bf645ef64bdcbe00fc,SOHAZRY12A8C13BC47,12,Southside,Common / Kanye West,Southside - Common / Kanye West
2995,8fce200f3912e9608e3b1463cdb9c3529aab5c08,SOIBPQQ12A6310F0F4,1,I Bloom Blaum,Coldplay,I Bloom Blaum - Coldplay
5316,7ef2a3b074b34984f3f677bddde0f1813486cc10,SOUJWJW12A63110848,1,Sound Check (Gravity),Gorillaz,Sound Check (Gravity) - Gorillaz
356,2b6c2f33bc0e887ea7c4411f58106805a1923280,SONYEOJ12A8C142E86,6,Breed,Nirvana,Breed - Nirvana


In [19]:
#Class for Popularity based Recommender System model
class popularity_recommender_py():
    """Recommend the same 10 most frequently recorded items to every user.

    create() ranks items by the number of training rows that mention them;
    recommend() returns that ranking labelled with the requested user id.
    """

    def __init__(self):
        # Training frame and the names of its user / item columns (set by create()).
        self.train_data = None
        self.user_id = None
        self.item_id = None
        # Cached top-10 frame with columns <item_id>, 'score', 'Rank' (set by create()).
        self.popularity_recommendations = None

    #Create the popularity based recommender system model
    def create(self, train_data, user_id, item_id):
        """Compute and cache the top-10 items of ``train_data``.

        Parameters:
            train_data: DataFrame holding at least the two columns named below.
            user_id: name of the user column; its non-null count per item is the score.
            item_id: name of the item column.
        """
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id

        #Get a count of user_ids for each unique song as recommendation score.
        #The rename uses self.user_id so that any user column name works.
        train_data_grouped = (
            train_data
            .groupby([self.item_id])
            .agg({self.user_id: 'count'})
            .reset_index()
            .rename(columns={self.user_id: 'score'})
        )

        #Sort by score (highest first); equal scores are ordered by item name
        train_data_sort = train_data_grouped.sort_values(
            ['score', self.item_id], ascending=[False, True])

        #Generate a recommendation rank based upon score; method='first' gives
        #tied scores distinct ranks in their sorted order
        train_data_sort['Rank'] = train_data_sort['score'].rank(ascending=False, method='first')

        #Get the top 10 recommendations
        self.popularity_recommendations = train_data_sort.head(10)

    #Use the popularity based recommender system model to
    #make recommendations
    def recommend(self, user_id):
        """Return the cached top-10 frame with a leading 'user_id' column set to ``user_id``.

        Raises:
            RuntimeError: if create() has not been called yet.
        """
        if self.popularity_recommendations is None:
            raise RuntimeError("create() must be called before recommend()")

        #Work on a copy so the cached ranking is never modified by a call
        user_recommendations = self.popularity_recommendations.copy()

        #Add the user_id column as the first column
        user_recommendations.insert(0, 'user_id', user_id)

        return user_recommendations

In [20]:
# Build the popularity model on the training split; songs are identified by the 'song' column.
popularity_m = popularity_recommender_py()
popularity_m.create(train_data, 'user_id', 'song')

In [21]:
# Recommendations labelled with the user id found in the first row of the dataset.
popularity_m.recommend(song_df['user_id'][0])

Unnamed: 0,user_id,song,score,Rank
3228,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Sehr kosmisch - Harmonia,44,1.0
929,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Dog Days Are Over (Radio Edit) - Florence + Th...,40,2.0
4097,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Undo - Björk,35,3.0
3222,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Secrets - OneRepublic,34,4.0
4464,b80344d063b5ccb3212f76538f3d9e43d87dca9e,You're The One - Dwight Yoakam,33,5.0
1231,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Fireflies - Charttraxx Karaoke,29,6.0
3072,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Revelry - Kings Of Leon,29,7.0
1654,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Horn Concerto No. 4 in E flat K495: II. Romanc...,28,8.0
3849,b80344d063b5ccb3212f76538f3d9e43d87dca9e,The Scientist - Coldplay,23,9.0
1002,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Drop The World - Lil Wayne / Eminem,22,10.0


In [22]:
# Recommendations labelled with the user id found in the row labelled 1 of the dataset.
# NOTE(review): this selects the second ROW, not the second distinct user; the rendered
# output shows the same user id as the previous cell. If a different user was intended,
# users[1] would pick one — confirm.
popularity_m.recommend(song_df['user_id'][1])

Unnamed: 0,user_id,song,score,Rank
3228,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Sehr kosmisch - Harmonia,44,1.0
929,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Dog Days Are Over (Radio Edit) - Florence + Th...,40,2.0
4097,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Undo - Björk,35,3.0
3222,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Secrets - OneRepublic,34,4.0
4464,b80344d063b5ccb3212f76538f3d9e43d87dca9e,You're The One - Dwight Yoakam,33,5.0
1231,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Fireflies - Charttraxx Karaoke,29,6.0
3072,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Revelry - Kings Of Leon,29,7.0
1654,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Horn Concerto No. 4 in E flat K495: II. Romanc...,28,8.0
3849,b80344d063b5ccb3212f76538f3d9e43d87dca9e,The Scientist - Coldplay,23,9.0
1002,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Drop The World - Lil Wayne / Eminem,22,10.0


### Рекомендаційна система з персоналізацією, що заснована на схожості

In [23]:
class item_similarity_recommender_py():
    """Item-similarity recommender scored with the Jaccard index of listener sets.

    Call create() first to attach the training data. recommend() ranks items
    for a known user; get_similar_items() ranks items against an explicit list.
    Both print progress information while they run.
    """

    def __init__(self):
        # Training frame and the names of its user / item columns (set by create()).
        self.train_data = None
        self.user_id = None
        self.item_id = None
        # The attributes below are initialised here but not written by any method of this class.
        self.cooccurence_matrix = None
        self.songs_dict = None
        self.rev_songs_dict = None
        self.item_similarity_recommendations = None

    #Get unique items (songs) corresponding to a given user
    def get_user_items(self, user):
        """Return the unique items recorded for ``user`` in the training data."""
        user_data = self.train_data[self.train_data[self.user_id] == user]
        user_items = list(user_data[self.item_id].unique())

        return user_items

    #Get unique users for a given item (song)
    def get_item_users(self, item):
        """Return the set of unique users recorded for ``item`` in the training data."""
        item_data = self.train_data[self.train_data[self.item_id] == item]
        item_users = set(item_data[self.user_id].unique())

        return item_users

    #Get unique items (songs) in the training data
    def get_all_items_train_data(self):
        """Return every unique item of the training data, in order of first appearance."""
        all_items = list(self.train_data[self.item_id].unique())

        return all_items

    #Construct cooccurence matrix
    def construct_cooccurence_matrix(self, user_songs, all_songs):
        """Build the len(user_songs) x len(all_songs) Jaccard similarity matrix.

        Entry [j, i] is |listeners(user_songs[j]) & listeners(all_songs[i])|
        divided by the size of the union of the two listener sets, or 0 when
        the two songs share no listener.
        """
        #Listener sets of the user's songs are reused for every column, so compute them once
        user_songs_users = [self.get_item_users(song) for song in user_songs]

        #Cells start at 0, so only pairs with at least one shared listener are written
        cooccurence_matrix = np.matrix(np.zeros(shape=(len(user_songs), len(all_songs))), float)

        for i, song in enumerate(all_songs):
            #Unique listeners (users) of song (item) i
            users_i = self.get_item_users(song)

            for j, users_j in enumerate(user_songs_users):
                users_intersection = users_i & users_j

                #cooccurence_matrix[j,i] is the Jaccard index of the two listener sets
                if users_intersection:
                    cooccurence_matrix[j, i] = len(users_intersection) / len(users_i | users_j)

        return cooccurence_matrix

    #Use the cooccurence matrix to make top recommendations
    def generate_top_recommendations(self, user, cooccurence_matrix, all_songs, user_songs):
        """Turn a similarity matrix into at most 10 ranked recommendations.

        Parameters:
            user: value written to the 'user_id' column of the result.
            cooccurence_matrix: 2-D similarity matrix (np.matrix or ndarray) with
                one row per entry of user_songs and one column per entry of all_songs.
            all_songs: items that the matrix columns refer to.
            user_songs: items to exclude from the result.

        Returns:
            A DataFrame with columns 'user_id', 'song', 'score', 'rank' (rank
            starts at 1), or -1 (after printing a message) when there is
            nothing to recommend.
        """
        print("Non zero values in cooccurence_matrix :%d" % np.count_nonzero(cooccurence_matrix))

        scores_by_row = np.asarray(cooccurence_matrix, dtype=float)
        row_count = scores_by_row.shape[0]

        records = []
        if row_count > 0:
            #Average similarity of every candidate song over all user songs.
            #Skipped for zero rows, where the division would only produce NaN scores.
            user_sim_scores = (scores_by_row.sum(axis=0) / float(row_count)).tolist()

            #Highest score first; equal scores are ordered by descending column index
            sort_index = sorted(((e, i) for i, e in enumerate(user_sim_scores)), reverse=True)

            #Collect the top 10 item based recommendations
            for score, idx in sort_index:
                if len(records) == 10:
                    break  #the top 10 is complete, no need to scan the remaining songs
                if np.isnan(score) or all_songs[idx] in user_songs:
                    continue
                records.append([user, all_songs[idx], score, len(records) + 1])

        #Handle the case where there are no recommendations
        if not records:
            print("The current user has no songs for training the item similarity based recommendation model.")
            return -1

        columns = ['user_id', 'song', 'score', 'rank']
        return pandas.DataFrame(records, columns=columns)

    #Create the item similarity based recommender system model
    def create(self, train_data, user_id, item_id):
        """Attach the training frame and the names of its user and item columns."""
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id

    #Use the item similarity based recommender system model to
    #make recommendations
    def recommend(self, user):
        """Recommend items for ``user`` based on the items they have in the training data."""
        #A. Get all unique songs for this user
        user_songs = self.get_user_items(user)

        print("No. of unique songs for the user: %d" % len(user_songs))

        #B. Get all unique items (songs) in the training data
        all_songs = self.get_all_items_train_data()

        print("No. of unique songs in the training set: %d" % len(all_songs))

        #C. Construct item cooccurence matrix of size len(user_songs) X len(songs)
        cooccurence_matrix = self.construct_cooccurence_matrix(user_songs, all_songs)

        #D. Use the cooccurence matrix to make recommendations
        df_recommendations = self.generate_top_recommendations(user, cooccurence_matrix, all_songs, user_songs)

        return df_recommendations

    #Get similar items to given items
    def get_similar_items(self, item_list):
        """Recommend items similar to ``item_list``; the result's 'user_id' column is ''.

        Entries of ``item_list`` are compared against the item column of the
        training data, so they must be values of that column to match anything.
        """
        user_songs = item_list

        #B. Get all unique items (songs) in the training data
        all_songs = self.get_all_items_train_data()

        print("No. of unique songs in the training set: %d" % len(all_songs))

        #C. Construct item cooccurence matrix of size len(user_songs) X len(songs)
        cooccurence_matrix = self.construct_cooccurence_matrix(user_songs, all_songs)

        #D. Use the cooccurence matrix to make recommendations
        user = ""
        df_recommendations = self.generate_top_recommendations(user, cooccurence_matrix, all_songs, user_songs)

        return df_recommendations

In [24]:
# Build the item-similarity model on the training split; songs are identified by the 'song' column.
personalized_model = item_similarity_recommender_py()
personalized_model.create( train_data, 'user_id', 'song' )

In [25]:
# Pick the distinct user id at index 5 of `users` (taken from the full dataset).
cur_user_id = users[5]
# Songs this user has in the training split.
user_items = personalized_model.get_user_items( cur_user_id )
# Item-similarity recommendations for that user (displayed as the cell output).
personalized_model.recommend( cur_user_id )

No. of unique songs for the user: 13
No. of unique songs in the training set: 4496
Non zero values in cooccurence_matrix :959


Unnamed: 0,user_id,song,score,rank
0,4bd88bfb25263a75bbdd467e74018f4ae570e5df,You Found Me (Album Version) - The Fray,0.058974,1
1,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Hellbound - J-Black & Masta Ace,0.058974,2
2,4bd88bfb25263a75bbdd467e74018f4ae570e5df,As The World Turns - Eminem,0.05378,3
3,4bd88bfb25263a75bbdd467e74018f4ae570e5df,I'm Back - Eminem,0.052015,4
4,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Armed And Ready (2009 Digital Remaster) - The ...,0.044322,5
5,4bd88bfb25263a75bbdd467e74018f4ae570e5df,I'm The One Who Understands (Edit Version) - War,0.044322,6
6,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Falling - Iration,0.044322,7
7,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Superman - Eminem / Dina Rae,0.039777,8
8,4bd88bfb25263a75bbdd467e74018f4ae570e5df,The Seed (2.0) - The Roots / Cody Chestnutt,0.038586,9
9,4bd88bfb25263a75bbdd467e74018f4ae570e5df,The Way You Move - OutKast,0.038586,10


In [26]:
# Songs most similar to one given song; the argument must be a value of the 'song' column.
personalized_model.get_similar_items(['Chan Chan (Live) - Buena Vista Social Club'])

No. of unique songs in the training set: 4496
Non zero values in cooccurence_matrix :22


Unnamed: 0,user_id,song,score,rank
0,,Always Like This - Bombay Bicycle Club,0.5,1
1,,Kong - Bonobo,0.5,2
2,,I'll Try Anything Once - The Strokes,0.5,3
3,,Cotton Fields (The Cotton Song) (Digitally Rem...,0.5,4
4,,You Only Live Once - The Strokes,0.5,5
5,,The Sound of Settling (Album Version) - Death ...,0.5,6
6,,Novocaine For The Soul - Eels,0.333333,7
7,,Every Lasting Light - The Black Keys,0.333333,8
8,,This Charming Man - The Smiths,0.333333,9
9,,Holland_ 1945 - Neutral Milk Hotel,0.333333,10


In [27]:
# NOTE(review): 'Taylor Swift' is an artist name, but the model was created with item_id='song',
# whose values have the form '<title> - <artist>'. Nothing matches, so the co-occurrence matrix
# has no non-zero entries and the ten rows returned all have score 0.0. Pass song values
# (e.g. the 'song' entries of rows whose artist is 'Taylor Swift') if similar songs are wanted — confirm intent.
personalized_model.get_similar_items(['Taylor Swift']).head(20)

No. of unique songs in the training set: 4496
Non zero values in cooccurence_matrix :0


Unnamed: 0,user_id,song,score,rank
0,,Joe's Head - Kings Of Leon,0.0,1
1,,Santa Monica - Theory Of A Deadman,0.0,2
2,,Rock Bottom - Eminem,0.0,3
3,,Too Much Love - LCD Soundsystem,0.0,4
4,,Side To Side (Featuring Lateef & Pigeon John) ...,0.0,5
5,,Superman (It's Not Easy) - Five for Fighting,0.0,6
6,,Hell Breaks Loose - Eminem / Dr. Dre,0.0,7
7,,Doin' The Cockroach - Modest Mouse,0.0,8
8,,Walkin' On The Sun - Smash Mouth,0.0,9
9,,Te Va A Doler - Maelo Ruiz,0.0,10
