In [1]:
import random
import math
from operator import itemgetter

In [16]:
class UserBasedCF():
    def __init__(self,num1=50,num2=10):
        # recommend num2 movies based on num1 similar users
        self.n_sim_user = num1
        self.n_rec_movie = num2
        # data selection
        self.trainSet = {}
        self.testSet={}
        # similarity matrix
        self.user_sim_matrix = {}
        
        
    def get_dataset(self, filename, pivot=0.8):
        trainSet_count = 0
        testSet_count = 0
        random.seed()
        
        with open(filename) as f:
            for index,line in enumerate(f):
                if index==0: 
                    continue
                    '''
                if index>=262:
                    break
                    '''
                user, movie, rating, timestamp = line.split(',')
                if random.random() < pivot: 
                    self.trainSet.setdefault(user, {}) 
                    self.trainSet[user][movie] = rating
                    trainSet_count+=1
                else:
                    self.testSet.setdefault(user, {}) 
                    self.testSet[user][movie] = rating
                    testSet_count+=1
                    
    def calc_user_sim(self):
        #movie_user = {}
        for user_u in self.trainSet:
            #print("=============")
            for user_v in self.trainSet:
                if user_u==user_v: continue
                u_len=len(self.trainSet[user_u])
                v_len=len(self.trainSet[user_v])
                u_count=0
                v_count=0;
                for movie in self.trainSet[user_u]:
                    u_count+=float(self.trainSet[user_u][movie])*float(self.trainSet[user_u][movie])
                for movie in self.trainSet[user_v]:
                    v_count+=float(self.trainSet[user_v][movie])*float(self.trainSet[user_v][movie])   
                deno=math.sqrt(u_count)*math.sqrt(v_count)
                numer=0
                self.user_sim_matrix.setdefault(user_u, {})
                self.user_sim_matrix[user_u].setdefault(user_v, 0)
                self.user_sim_matrix[user_u][user_v] += 1
                for movie in self.trainSet[user_u]:
                    if movie in self.trainSet[user_v]:
                        numer+=float(self.trainSet[user_u][movie])*float(self.trainSet[user_v][movie])
                        #print(user_u)
                        #print(movie)
                        #print(numer)
                        
                self.user_sim_matrix[user_u][user_v]=numer/deno
    def predict(self, user,movie):
        
        K = self.n_sim_user     
        N = self.n_rec_movie   
        watched_movies = self.trainSet[user]
        if movie in watched_movies: return -1 #avoid recommend watched movies
        deno=0
        prediction=0
        count=0
        index=0
        rank=sorted(self.user_sim_matrix[user].items(), key=itemgetter(1), reverse=True)
        
        while count<K and index<len(rank): 
            cur_user=rank[index][0]
            if movie in self.trainSet[cur_user]:
                similarity=self.user_sim_matrix[user][cur_user]
                deno+=float(similarity)
                prediction+=float(similarity) * float(self.trainSet[cur_user][movie])
                count+=1
            index+=1
            
        # if there's no similar users who already rated the movie, then we return the average rating of this user
        if deno==0: 
            ave=0
            for each_movie in watched_movies:
                ave+=float(watched_movies[each_movie])
            return ave/len(watched_movies)
        return prediction/deno  

    def predict_value_mae(self,user):
        mae=0
        if len(self.testSet.get(user,{}))==0: return 0
        for movie in self.testSet[user]:
            mae+=abs(self.predict(user,movie)-float(self.testSet[user][movie]))
        return mae/len(self.testSet[user])
      
        
        
    def predict_value_rmse(self,user):
        rmse=0
        if len(self.testSet.get(user,{}))==0: return 0
        for movie in self.testSet[user]:
            rmse+=pow((self.predict(user,movie)-float(self.testSet[user][movie])),2)
        return math.sqrt(rmse/len(self.testSet[user]))
    
    
    def mae_all_user(self):
        mae=0
        count=0
        for user_u in self.trainSet:
            each_mae=self.predict_value_mae(user_u)
            mae+=each_mae
            count+=1
        return mae/count
    def rmse_all_user(self):
        rmse=0
        count=0
        for user_u in self.trainSet:
            each_rmse=self.predict_value_rmse(user_u)
            rmse+=each_rmse
            count+=1
        return rmse/count
    
    def recommend(self,user):
        K = self.n_sim_user    
        N = self.n_rec_movie    
        rank = {}
        watched_movies = self.trainSet[user]
        possible_movies=set()
       
        for similar_user, similarity in sorted(self.user_sim_matrix[user].items(), key=itemgetter(1), reverse=True)[0:K]: 
            for movie, rating in self.trainSet[similar_user].items(): 
 
                if movie in watched_movies: 
                    continue
                else :
                    if float(rating)>3:
                        possible_movies.add(movie)
        for movie in possible_movies:
            score=self.predict(user,movie)
            rank.setdefault(movie, 0)
            rank[movie]=score
                      
        return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]

    
    def evaluate(self):
        hit = 0 
        rec_count = 0
        test_count = 0
        all_rec_movies = set()
        ndcg=0
        all_good_movies=0
        
        for user_u in self.trainSet:
            ave=0 #average rating of user_u
            rate_sum=0 #the sum of rating of user_u
            print(user_u)
            dcg=0
            idcg=0
            cur_hit=0
            test_movies=self.testSet.get(user_u, {})
            for movie in test_movies:
                rate_sum+=float(test_movies[movie])
            if rate_sum==0: ave=3
            else: ave=rate_sum/len(test_movies)
            for movie in test_movies:
                if float(test_movies[movie])>=ave:
                    all_good_movies+=1
            rec_movies=self.recommend(user_u)
            #for movie, score in rec_movies: 
            for i in range(len(rec_movies)):
                movie=rec_movies[i][0]
                
                if movie in test_movies and float(test_movies[movie])>=ave:
                    hit += 1
                    cur_hit+=1
                    idcg+=1/(math.log(2,1+cur_hit))
                    dcg+=1/(math.log(2,1+i+1))
                all_rec_movies.add(movie)   
            rec_count +=self.n_rec_movie
            test_count += len(test_movies)
            if(cur_hit==0): ndcg+=0 
            else: ndcg+=dcg/idcg
        
        '''
        user_u='1'
        
        test_movies=self.testSet.get(user_u, {})
        rec_movies=self.recommend(user_u)
        for movie, score in rec_movies: 
            if movie in test_movies:
                hit += 1
            all_rec_movies.add(movie)   
        rec_count +=self.n_rec_movie
        test_count += len(test_movies)
        '''
        precision = hit / (1.0 * rec_count)
        recall = hit / (1.0 * all_good_movies)
        f1 = 2*(precision*recall) / (precision+recall)   
        print('precisioin=%.4f\trecall=%.4f\tf1=%.4f\tndcg=%.4f' % (precision, recall, f1,ndcg/610))


In [17]:
M1=UserBasedCF()
M1.get_dataset("ratings.csv")

In [18]:
M1.calc_user_sim()

In [5]:
ave_mae=M1.mae_all_user()
ave_rmse=M1.rmse_all_user()
print(ave_mae)
print(ave_rmse)
rec=M1.recommend('1')

0.7818948051115773
0.9500538742728016


[('6192', 5.000000000000001),
 ('876', 5.0),
 ('4298', 5.0),
 ('4180', 5.0),
 ('633', 5.0),
 ('25947', 5.0),
 ('5537', 5.0),
 ('26326', 5.0),
 ('3678', 5.0),
 ('3951', 4.999999999999999)]

In [7]:
rec=M1.recommend('1')

In [19]:
M1.evaluate()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
