In [29]:
import numpy as np
import pandas as pd

from sklearn import metrics

data_dir = '/Users/sam/All-Program/App-Dataset/Study/IIT-Code/Data-Mining/ml-100k/'

In [30]:
# Get the User Data
user_cols = ['user_id',
             'age',
             'gender',
             'occupation',
             'zip_code']
users = pd.read_csv(data_dir+'u.user',
                    sep='|',
                    names=user_cols)
print (users.shape)
users.head()

(943, 5)


Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [31]:
# Get the Item Data
item_cols = ['movie id',
             'movie title',
             'release date',
             'video release date',
             'IMDb URL',
             'Unknown',
             'Action',
             'Adventure',
             'Animation',
             'Childrens',
             'Comedy',
             'Crime',
             'Documentary',
             'Drama',
             'Fantasy',
             'FilmNoir',
             'Horror',
             'Musical',
             'Mystery',
             'Romance',
             'SciFi',
             'Thriller',
             'War',
             'Western']
items = pd.read_csv(data_dir+'u.item',
                    sep='|',
                    names=item_cols,
                    encoding='latin-1')
print (items.shape)
items.head()
item_profile = items.iloc[:,5:items.shape[1]]
item_profile.head()

(1682, 24)


Unnamed: 0,Unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,Drama,Fantasy,FilmNoir,Horror,Musical,Mystery,Romance,SciFi,Thriller,War,Western
0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [32]:
# Fetch the Rating Data and Create the utility matrix
rating_cols = ['user_id',
               'movie_id',
               'rating',
               'timestamp']
ratings = pd.read_csv(data_dir+'u.data',
                      sep='\t',
                      names=rating_cols)

utility = ratings.pivot(index='user_id',
                        columns='movie_id',
                        values='rating')
utility.head()
#ratings.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [33]:
user_200 = users.loc[users['user_id'] == 200]
user_15 = users.loc[users['user_id'] == 15]
movie_95 = items.loc[items['movie id'] == 95]
movie_95 = movie_95.iloc[:,5:items.shape[1]]

movie_95



Unnamed: 0,Unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,Drama,Fantasy,FilmNoir,Horror,Musical,Mystery,Romance,SciFi,Thriller,War,Western
94,0,0,0,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0


In [34]:
# Creating a User Profile for User 200 and 15
# Why is creating a user profile needed? In order to recommend movies to users A user profile 
# is needed. Our goal in creating a user profile is to make the user_profile matrix structure
# similar to the item profile matrix structure. The similarity in structure is needed so that a
# similarity measure (such as Jaccard or Cosine) could be implemented to find similarity between
# users and movies and hence proper recommenfation could be found.

# Check what ratings are provided by the user 200 and 15 to the movie 95

# create a Dummy User profile with same features as item_profile
user_cols =    ['Unknown',
                'Action',
                'Adventure',
                'Animation',
                'Childrens',
                'Comedy',
                'Crime',
                'Documentary',
                'Drama',
                'Fantasy',
                'FilmNoir',
                'Horror',
                'Musical',
                'Mystery',
                'Romance',
                'SciFi',
                'Thriller',
                'War',
                'Western']

user_profile = pd.DataFrame(data=np.nan, index=[15,200], columns=user_cols)
#print (user_profile)
'''
    Normally there are three cases
    We are considering a scenario where item_profile has actors as features and utility matrix has
    the user and his/her corresponding movie (rating or boolean)
    Case 1: When both, the item_profile and Utility matrix is binary. .In that case the component
            feature(actor) of a user_profile would be the percentage of feature(actor) active(ON) 
            for every movie that the user has seen
    Case 2: When the item_profile had boolean values but the Utility matrix has ratings. In that 
            case the noramalized(using rating-avg_rating) utility value is used to weight the 
            vectors representing the profiles of items.
    Case 3: When both the item_profile and utility matrix have ratings data.
'''


# Step 1: Get the value of User mean rating 
user_rat_means = utility.mean(axis=1)
user_rat_means.head()
#print ('The size of Utility mean matrix is: ', user_rat_means.shape)

# Step 2: Since the Utility matrix is a rating matrix and the item profile is binary.
# We use the Utility value to weight the vectors representing the profiles of items.
# But we also mormalize the Utility matrix by the average user rating
utility_centered = utility - user_rat_means
# Since a lot of values in the utility matrix is NaN, we replace them by 0 
# which means they dont add any weight to the item profile
utility_centered = utility_centered.where((pd.notnull(utility_centered)),0)
utility_centered.head()
# Now we gather the weighted rating of user 200 and 15
w_mov_rat_user200 = np.array(utility_centered[199:200], dtype=float).flatten()
w_mov_rat_user15 = np.array(utility_centered[14:15], dtype=float).flatten()


# Step 3: Creating a user profile (constitutes of user_profile values 
# corresponding to each feature in the item_profile)
# Now the vector w_mov_rat_user200 has weighted ratings for each movie and 
for each_feature in user_cols:
    #print (each_feature)
    if each_feature:# == 'Adventure':
        feature_array = np.array(item_profile[each_feature], dtype = float)
        a = np.array([(i*j) for i,j in zip(w_mov_rat_user15,feature_array)])
        len_a = len(np.where(a != 0)[0])
        usr15_feature_wght = sum(a)/len_a
        
        b = np.array([(i*j) for i,j in zip(w_mov_rat_user200,feature_array)])
        len_b = len(np.where(b != 0)[0])
        usr200_feature_wght = sum(b)/len_b
        
#         print (usr_feature_wght)
        user_profile.loc[15][each_feature] = usr15_feature_wght
        user_profile.loc[200][each_feature] = usr200_feature_wght
#         print (len(feature_array))
#         print (len(w_mov_rat_user200))


user_profile = user_profile.where((pd.notnull(user_profile)),0)
user_profile
#utility.iloc[200,95]

Unnamed: 0,Unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,Drama,Fantasy,FilmNoir,Horror,Musical,Mystery,Romance,SciFi,Thriller,War,Western
15,0.0,-0.830438,-0.469846,-2.76229,-2.014849,-0.922976,-1.052139,0.0,-0.50492,-0.587395,-0.216988,-3.241071,-1.420106,-0.885316,-0.070124,-0.871648,-1.246152,-0.033176,0.0
200,0.0,0.73545,0.738532,0.588382,0.40474,0.18743,0.02272,-1.787879,0.580677,0.524164,1.037406,0.848019,0.544894,0.617069,0.76599,0.895428,0.50046,0.796744,0.496197


In [35]:
# Now that we have successfully created the User_profile for User 200 and 15. We would like to 
# find to whome the item or movie_id 95 should be recommended

print (movie_95.shape)
print (user_profile.shape)
# movie_95
# user_profile

# print(metrics.jaccard_similarity_score(movie_95,user_profile))
# print(1 - metrics.jaccard_similarity_score(movie_95,user_profile))

print(metrics.pairwise.cosine_similarity(movie_95,user_profile))
print(metrics.pairwise.cosine_distances(movie_95,user_profile))

#user_profile.dot(movie_95.T)

(1, 19)
(2, 19)
[[-0.64117625  0.26708985]]
[[ 1.64117625  0.73291015]]


In [38]:
'''Assignment-4 Part 2'''


# 10 most similar users when the ratings in utility matrix are average weighted
# utility_centered.head()
arr_top_sim = np.array(metrics.pairwise.cosine_similarity(utility_centered.loc[1], utility_centered)[0], dtype=float)
arr_top_users = [val+1 for no,val in enumerate(arr_top_sim.argsort()[::-1]) if no!=0 and no<=10]
print (arr_top_users)
# Answer : 738, 592, 276, 267, 643, 757, 457, 606, 916, 44



# 10 most similar users to user 1 are:
#utility_proper = utility.where((pd.notnull(utility)),0)
#arr = np.array(metrics.pairwise.cosine_similarity(utility_proper.loc[1], utility_proper)[0], dtype=float)
#arr_top = [val+1 for no,val in enumerate(arr.argsort()[::-1]) if no!=0 and no<=10]
#print (arr_top)
# Answer : 916, 864, 268, 92, 435, 457, 738, 429, 303, 276


# print (arr)

[738, 592, 276, 267, 643, 757, 457, 606, 916, 44]




In [58]:
#utility.head()
utility_proper = utility.where((pd.notnull(utility)),0)

#print (utility_proper.columns.values)
#print (utility_proper.loc[1])

utlity_top_users = utility_proper.iloc[[i-1 for i in [1]+arr_top_users],:]  # Adding user 1
utlity_item_508 = utlity_top_users[508]
#print (utlity_top_users)
print (utlity_item_508)

rating_item_508 = np.mean(np.array(utlity_item_508)[(np.where(np.array(utlity_item_508 != 0))[0])])
print (rating_item_508)

print (arr_top_users)
print ([arr_top_sim[indices-1] for indices in arr_top_users])

sim_users = utlity_item_508.index.values
ratings_508 =  np.array(utlity_item_508)

print (sim_users)
print (ratings_508)

# Finding simplae rating (only average)
rat_avg_508 = np.mean(np.array(utlity_item_508)[np.where(ratings_508 != 0)[0]])

sum_all = sum([arr_top_sim[usr-1] for usr,rat in zip(sim_users,ratings_508) if rat!=0])

rat_weight_avg = sum([arr_top_sim[usr-1]*rat for usr,rat in zip(sim_users,ratings_508) if rat!=0])/sum_all

print (rat_weight_avg)

user_id
1      0.0
738    0.0
592    5.0
276    5.0
267    0.0
643    4.0
757    0.0
457    0.0
606    4.0
916    0.0
44     0.0
Name: 508, dtype: float64
4.5
[738, 592, 276, 267, 643, 757, 457, 606, 916, 44]
[0.29148679307800707, 0.27840172059610946, 0.26815054175880981, 0.26476146556668312, 0.26400260297782174, 0.26236784527028278, 0.26233704478060194, 0.26084701039863195, 0.25562438236025764, 0.25295440080142095]
[  1 738 592 276 267 643 757 457 606 916  44]
[ 0.  0.  5.  5.  0.  4.  0.  0.  4.  0.  0.]
1.07140187573
