### Evaluation
- You are given a recommendations.csv file containing user ids; for each user you are supposed to recommend at most 10 songs.
- The training data is provided in train.csv file.
- Your entries in the recommendations.csv file should be such that, for each row, the first value is the user_id followed by the recommended song_ids, all separated by commas.
- Make sure you have at least one recommendation for each user in recommendations.csv, or else your score will be zero.
- The recommended songs to a user must be different from what you already have in the training set for the same user.
- A user can have at most 10 recommendations.
- The final score F1 will be a harmonic mean of precision and recall values.
- Run the below script to score your recommendations.

In [46]:
import dill
import csv
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler

In [12]:
# Load the listening history used for training and the list of users that
# recommendations must be produced for.
song_df = pd.read_csv('train.csv')
df_test = pd.read_csv('recommendations.csv')
# Display the training data (the original cell referenced an undefined `df`).
song_df


Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year
0,806ccae96c8ecb1c198482aff785ccd6bbe17143,SOBOAFP12A8C131F36,1,Lucky (Album Version),We Sing. We Dance. We Steal Things.,Jason Mraz & Colbie Caillat,0
1,ed3664f9cd689031fe4d0ed6c66503bdc3ad7cb6,SOPTLQL12AB018D56F,1,Billionaire [feat. Bruno Mars] (Explicit Albu...,Billionaire [feat. Bruno Mars],Travie McCoy,0
2,0dd93f61fe69f292ac336715ef607214efb3dbaa,SORALYQ12A8151BA99,3,If I Ain't Got You,R&B Love Collection 08,Alicia Keys,2003
3,f10e613636ee8e1f4d3a7f2b21ca9cd36d2e9d8d,SOLMXQE12A8AE46E32,2,Anyone Else But You,Juno - Music From The Motion Picture,The Moldy Peaches,2001
4,adbd40c4f3fe78e7f49f491ea04859ea07de1ed1,SOSLLGM12A6D4F6F2A,2,Medusa,Quicksand Memory EP,Ulrich Schnauss,2007
...,...,...,...,...,...,...,...
15995,2c4c75973ea3c0f2f2443ad03f0b89b3af922274,SOVGPZH12A67ADA08A,4,Clumsy,The Dutchess,Fergie,2008
15996,681f1c40ec8538eb4d5d3c95b4aff1697bf6cda5,SOKWOBO12A8C1424F6,1,No Excuses,No Way Down,Air France,2008
15997,12a4a991c8b53cd6906995caed8b1f2bd3b6436a,SOCBJHG12AB018221C,4,I Was Meant For the Stage,Her Majesty The Decemberists,The Decemberists,2003
15998,baf47ed8da24d607e50d8684cde78b923538640f,SOWYRZV12AF72A23E6,1,You Don't Know Me (featuring Regina Spektor),Way To Normal,Ben Folds,2008


In [16]:
# Count how many training rows mention each song.
song_grouped = (
    song_df
    .groupby(['song_id'])
    .agg({'listen_count': 'count'})
    .reset_index()
)
# Express each song's row count as a percentage of all rows.
grouped_sum = song_grouped['listen_count'].sum()
song_grouped['percentage'] = song_grouped['listen_count'] / grouped_sum * 100
# Most frequent songs first; ties are ordered by song_id ascending.
song_grouped.sort_values(by=['listen_count', 'song_id'], ascending=[False, True])

Unnamed: 0,song_id,listen_count,percentage
1456,SOFRQTD12A81C233C0,68,0.42500
230,SOAXGDH12A8C13F8A1,58,0.36250
204,SOAUWYT12A81C206F1,56,0.35000
3631,SONYKOW12AB01849C9,52,0.32500
4897,SOSXLTC12AF72A7F54,51,0.31875
...,...,...,...
6590,SOZYUXF12A6701F7B9,1,0.00625
6593,SOZZLZN12A8AE48D6D,1,0.00625
6594,SOZZRHE12A6702165F,1,0.00625
6596,SOZZTNF12A8C139916,1,0.00625


In [18]:
users = song_df['user_id'].unique()
print(len(users)) ## number of unique users (754 in the recorded run)
songs = song_df['song_id'].unique()
len(songs) ## number of unique songs (6599 in the recorded run)

754


6599

In [20]:
# Hold out 20% of the interaction rows as test data; random_state is fixed so
# the same split is produced on every run.
train_data, test_data = train_test_split(song_df, test_size = 0.20, random_state=0)

In [48]:
import numpy as np
import pandas

#Class for Popularity based Recommender System model
class popularity_recommender_py():
    """Popularity based recommender.

    Every user receives the same list: the items that occur in the most
    training rows. ``create`` computes and caches the top 10 items;
    ``recommend`` returns that list tagged with the requested user id.
    """

    def __init__(self):
        self.train_data = None
        self.user_id = None
        self.item_id = None
        self.popularity_recommendations = None

    #Create the popularity based recommender system model
    def create(self, train_data, user_id, item_id):
        """Compute the top-10 most popular items.

        Parameters
        ----------
        train_data : pandas.DataFrame
            Interaction rows containing the user and item columns.
        user_id : str
            Name of the user column in ``train_data``.
        item_id : str
            Name of the item column in ``train_data``.
        """
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id

        #Get a count of user_ids for each unique song as recommendation score
        train_data_grouped = train_data.groupby([self.item_id]).agg({self.user_id: 'count'}).reset_index()
        #Rename the configured user column (not a hard-coded 'user_id'),
        #otherwise the 'score' column is missing for other column names.
        train_data_grouped = train_data_grouped.rename(columns={self.user_id: 'score'})

        #Sort the songs based upon recommendation score (ties: item id ascending)
        train_data_sort = train_data_grouped.sort_values(['score', self.item_id], ascending=[False, True])

        #Generate a recommendation rank based upon score
        train_data_sort['Rank'] = train_data_sort['score'].rank(ascending=False, method='first')

        #Get the top 10 recommendations; copy so the cache is an independent frame
        self.popularity_recommendations = train_data_sort.head(10).copy()

    #Use the popularity based recommender system model to
    #make recommendations
    def recommend(self, user_id):
        """Return the cached popular items with a leading ``user_id`` column.

        The cached frame is copied first, so repeated calls never modify
        ``self.popularity_recommendations``.
        """
        user_recommendations = self.popularity_recommendations.copy()

        #Add user_id column for which the recommendations are being generated
        user_recommendations['user_id'] = user_id

        #Bring user_id column to the front
        cols = user_recommendations.columns.tolist()
        cols = cols[-1:] + cols[:-1]

        return user_recommendations[cols]
    

#Class for Item similarity based Recommender System model
class item_similarity_recommender_py():
    """Item similarity based recommender.

    Similarity between two items is the Jaccard index of their listener
    sets. A candidate item's score for a user is the mean of its similarity
    to every item the user already has in the training data.
    """

    def __init__(self):
        self.train_data = None
        self.user_id = None
        self.item_id = None
        self.cooccurence_matrix = None
        self.songs_dict = None
        self.rev_songs_dict = None
        self.item_similarity_recommendations = None

    #Get unique items (songs) corresponding to a given user
    def get_user_items(self, user):
        """Return the list of unique items the given user has in the training data."""
        user_data = self.train_data[self.train_data[self.user_id] == user]
        return list(user_data[self.item_id].unique())

    #Get unique users for a given item (song)
    def get_item_users(self, item):
        """Return the set of unique users that have the given item in the training data."""
        item_data = self.train_data[self.train_data[self.item_id] == item]
        return set(item_data[self.user_id].unique())

    #Get unique items (songs) in the training data
    def get_all_items_train_data(self):
        """Return the list of unique items in the training data, in order of first appearance."""
        return list(self.train_data[self.item_id].unique())

    #Map every item to the set of its users with a single pass over the data
    def _build_item_users_index(self):
        """Return ``{item: set(users)}`` built with one groupby instead of one filter per item."""
        users_by_item = self.train_data.groupby(self.item_id)[self.user_id].unique()
        return {item: set(item_users) for item, item_users in users_by_item.items()}

    #Construct cooccurence matrix
    def construct_cooccurence_matrix(self, user_songs, all_songs):
        """Build the Jaccard similarity matrix between ``user_songs`` and ``all_songs``.

        Returns a ``numpy.matrix`` of shape ``(len(user_songs), len(all_songs))``
        where entry ``[j, i]`` is
        ``|users(user_songs[j]) & users(all_songs[i])| / |users(user_songs[j]) | users(all_songs[i])|``,
        or 0 when the two items share no listener. Items absent from the
        training data are treated as having no listeners.
        """
        item_users = self._build_item_users_index()
        no_users = frozenset()

        #Listener sets of the user's own songs
        user_songs_users = [item_users.get(song, no_users) for song in user_songs]

        #np.matrix is kept as the return type so existing callers keep working
        cooccurence_matrix = np.matrix(np.zeros(shape=(len(user_songs), len(all_songs))), float)

        for i, song in enumerate(all_songs):
            users_i = item_users.get(song, no_users)
            if not users_i:
                #No listeners: every similarity in this column stays 0
                continue

            for j, users_j in enumerate(user_songs_users):
                common = len(users_i & users_j)
                if common:
                    #|A union B| = |A| + |B| - |A intersect B|
                    union_size = len(users_i) + len(users_j) - common
                    cooccurence_matrix[j, i] = float(common) / float(union_size)

        return cooccurence_matrix

    #Use the cooccurence matrix to make top recommendations
    def generate_top_recommendations(self, user, cooccurence_matrix, all_songs, user_songs, top_n=10):
        """Rank candidate items by mean similarity to ``user_songs``.

        Parameters
        ----------
        user : identifier written to the ``user_id`` column of the result.
        cooccurence_matrix : matrix of shape ``(len(user_songs), len(all_songs))``.
        all_songs : list of candidate items, aligned with the matrix columns.
        user_songs : items to exclude from the recommendations.
        top_n : maximum number of recommendations to return (default 10).

        Returns
        -------
        pandas.DataFrame with columns ``user_id, song, score, rank``, or ``-1``
        when no recommendation can be made (e.g. the user has no songs).
        """
        print("Non zero values in cooccurence_matrix :%d" % np.count_nonzero(cooccurence_matrix))

        scores = np.asarray(cooccurence_matrix, dtype=float)
        recommendations = []

        #Explicit guard: with no user songs the mean would be 0/0 (NaN + warning)
        if scores.shape[0] > 0:
            #Average similarity of every candidate to the user's songs
            user_sim_scores = (scores.sum(axis=0) / float(scores.shape[0])).tolist()

            #Highest score first; ties are broken by the higher column index
            sort_index = sorted(((e, i) for i, e in enumerate(user_sim_scores)), reverse=True)

            already_owned = set(user_songs)
            for score, song_index in sort_index:
                if len(recommendations) >= top_n:
                    break
                song = all_songs[song_index]
                if np.isnan(score) or song in already_owned:
                    continue
                recommendations.append([user, song, score, len(recommendations) + 1])

        #Handle the case where there are no recommendations
        if not recommendations:
            print("The current user has no songs for training the item similarity based recommendation model.")
            return -1

        return pandas.DataFrame(recommendations, columns=['user_id', 'song', 'score', 'rank'])

    #Create the item similarity based recommender system model
    def create(self, train_data, user_id, item_id):
        """Store the training data and the names of its user and item columns."""
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id

    #Shared pipeline behind recommend() and get_similar_items()
    def _recommend_for_items(self, user, user_songs, top_n):
        """Score all training items against ``user_songs`` and return the top ones."""
        all_songs = self.get_all_items_train_data()

        print("no. of unique songs in the training set: %d" % len(all_songs))

        cooccurence_matrix = self.construct_cooccurence_matrix(user_songs, all_songs)

        return self.generate_top_recommendations(user, cooccurence_matrix, all_songs, user_songs, top_n)

    #Use the item similarity based recommender system model to
    #make recommendations
    def recommend(self, user, top_n=10):
        """Recommend up to ``top_n`` items the user does not already have.

        Returns a DataFrame (``user_id, song, score, rank``) or ``-1`` when the
        user has no songs in the training data.
        """
        user_songs = self.get_user_items(user)

        print("No. of unique songs for the user: %d" % len(user_songs))

        return self._recommend_for_items(user, user_songs, top_n)

    #Get similar items to given items
    def get_similar_items(self, item_list, top_n=10):
        """Return up to ``top_n`` items most similar to ``item_list``.

        The result has the same layout as ``recommend``; its ``user_id``
        column is the empty string.
        """
        return self._recommend_for_items("", item_list, top_n)

In [49]:
# Build the popularity model from the training split.
pm = popularity_recommender_py()
pm.create(train_data, 'user_id', 'song_id')
# Use the popularity model to make some predictions for one sample user
user_id = users[5]
pm.recommend(user_id)

Unnamed: 0,user_id,song_id,score,Rank
1305,b009d2161ab50a122f8c81adb7ce0d40e8e42f2d,SOFRQTD12A81C233C0,56,1.0
210,b009d2161ab50a122f8c81adb7ce0d40e8e42f2d,SOAXGDH12A8C13F8A1,51,2.0
4392,b009d2161ab50a122f8c81adb7ce0d40e8e42f2d,SOSXLTC12AF72A7F54,43,3.0
185,b009d2161ab50a122f8c81adb7ce0d40e8e42f2d,SOAUWYT12A81C206F1,41,4.0
973,b009d2161ab50a122f8c81adb7ce0d40e8e42f2d,SOEGIYH12A6D4FC0E3,39,5.0
378,b009d2161ab50a122f8c81adb7ce0d40e8e42f2d,SOBONKR12A58A7A7E0,37,6.0
3265,b009d2161ab50a122f8c81adb7ce0d40e8e42f2d,SONYKOW12AB01849C9,37,7.0
792,b009d2161ab50a122f8c81adb7ce0d40e8e42f2d,SODJWHY12A8C142CCE,32,8.0
2608,b009d2161ab50a122f8c81adb7ce0d40e8e42f2d,SOLFXKT12AB017E3E0,29,9.0
4624,b009d2161ab50a122f8c81adb7ce0d40e8e42f2d,SOTWNDJ12A8C143984,28,10.0


In [34]:
# Instantiate the item-similarity recommender and register the training split
# together with the names of its user and item columns.
is_model = item_similarity_recommender_py()
is_model.create(train_data, 'user_id', 'song_id')

In [35]:
#Print the songs for the user in training data
user_id = users[5]
user_items = is_model.get_user_items(user_id)
#Banner, then one training song id per line
print("------------------------------------------------------------------------------------")
print("Training data songs for the user userid: %s:" % user_id)
print("------------------------------------------------------------------------------------")

for user_item in user_items:
    print(user_item)

print("----------------------------------------------------------------------")
print("Recommendation process going on:")
print("----------------------------------------------------------------------")

#Recommend songs for the user using personalized model
#(the returned value is the cell's displayed output)
is_model.recommend(user_id)

------------------------------------------------------------------------------------
Training data songs for the user userid: b009d2161ab50a122f8c81adb7ce0d40e8e42f2d:
------------------------------------------------------------------------------------
SODOWUC12AC9097E76
SORNJQU12AB01821FA
SOMPKDT12AAA8C6759
SOTGWWO12A8C13E806
SOYGKHG12AB01804C8
SOFKFXC12AC90732A5
SOHTCVN12A8C134E46
SOUITQM12AB01894C0
SOYABSZ12A81C1FEAC
SOVHKJL12AB017E2B2
SODULEM12A6D4F8916
SOAZDYK12AB01816E5
SOUGCDK12AC95F075F
SOACPBY12A8C13FEF9
SOOMGGT12AB01810FB
----------------------------------------------------------------------
Recommendation process going on:
----------------------------------------------------------------------
No. of unique songs for the user: 15
no. of unique songs in the training set: 5921
Non zero values in cooccurence_matrix :1693


Unnamed: 0,user_id,song,score,rank
0,b009d2161ab50a122f8c81adb7ce0d40e8e42f2d,SOWCBKV12AC90732A6,0.051624,1
1,b009d2161ab50a122f8c81adb7ce0d40e8e42f2d,SOQAHCM12A67020123,0.046667,2
2,b009d2161ab50a122f8c81adb7ce0d40e8e42f2d,SOJFLUT12A6D4F9273,0.046667,3
3,b009d2161ab50a122f8c81adb7ce0d40e8e42f2d,SOYJFDO12A8C13BAA1,0.046667,4
4,b009d2161ab50a122f8c81adb7ce0d40e8e42f2d,SOOIFDD12A8C13C468,0.046667,5
5,b009d2161ab50a122f8c81adb7ce0d40e8e42f2d,SOMTZYU12AB017D632,0.046667,6
6,b009d2161ab50a122f8c81adb7ce0d40e8e42f2d,SOCOOQD12A6D4FAC53,0.046667,7
7,b009d2161ab50a122f8c81adb7ce0d40e8e42f2d,SOPQLJJ12A8AE47E2D,0.046667,8
8,b009d2161ab50a122f8c81adb7ce0d40e8e42f2d,SOYASKG12AF72A32E0,0.046667,9
9,b009d2161ab50a122f8c81adb7ce0d40e8e42f2d,SOYHHCO12AB01891DA,0.046667,10


In [50]:
# SECURITY: dill.load (like pickle.load) can execute arbitrary code while
# deserializing. Only load these .pik files if they come from a trusted source.
# NOTE(review): the recorded "SystemError: unknown opcode" is likely caused by
# loading objects that were serialized under a different Python version than
# the running kernel - confirm, and run with the version used to create them.

# `Evaluate` is not referenced below; presumably it must be loaded first so the
# object in eval.pik can be restored - verify before removing.
with open("Evaluate.pik", 'rb') as evaluate_class_file:
    Evaluate = dill.load(evaluate_class_file)

with open("eval.pik", 'rb') as evaluator_file:
    evaluate = dill.load(evaluator_file)

# Score the submission file.
print("F1 score: {}%".format(evaluate.score('recommendations.csv')))

SystemError: unknown opcode