# Investigating item-based collaborative filtering
## 1. Reimplementing the algorithm
### 1.1 Loading and preprocessing the dataset from MovieLens

In [None]:
import pandas as pd
# Load the ratings file into a DataFrame. The file has no header row, so
# pandas labels the columns 0, 1, 2, 3.
path = "./Dataset/ml-100k/u.data"
# Raw string: '\s' is not a valid Python escape sequence, so the non-raw
# form triggers an invalid-escape warning on recent Python versions.
data = pd.read_csv(path, header=None, sep=r'\s+')
print(data)
print(type(data))

### 1.2 Building the Item-based CF class

In [144]:
import math
class ItemBasedCF:
    """Item-based collaborative filtering built from co-rating counts.

    ``train_data`` is a pandas DataFrame whose first three columns are, in
    order, user id, item id and rating (any further columns are ignored).
    On construction the rows are folded into ``self.train``, a nested dict
    ``{user: {item: int rating}}``.
    """

    def __init__(self, train_data):
        self.train_data = train_data
        self.processData()

    def processData(self):
        """Convert ``self.train_data`` into the ``{user: {item: rating}}`` dict."""
        self.train = dict()
        # itertuples walks the rows positionally, so frames whose index does
        # not start at 0 (e.g. a slice such as data[80000:]) are handled too.
        for row in self.train_data.itertuples(index=False):
            user, item, score = row[0], row[1], row[2]
            self.train.setdefault(user, {})[item] = int(float(score))

    def ItemSimilarity(self):
        """Compute, store in ``self.W`` and return the item-item similarity.

        ``W[i][j] = c_ij / sqrt(n_i * n_j)`` where ``c_ij`` is the number of
        users who rated both items and ``n_i`` the number who rated item i.
        """
        co_counts = dict()    # co_counts[i][j]: users who rated both i and j
        popularity = dict()   # popularity[i]: users who rated i
        for items in self.train.values():
            for i in items:
                popularity[i] = popularity.get(i, 0) + 1
                row = co_counts.setdefault(i, {})
                for j in items:
                    if i == j:
                        continue
                    row[j] = row.get(j, 0) + 1

        self.W = dict()
        for i, related_items in co_counts.items():
            self.W[i] = {
                j: cij / math.sqrt(popularity[i] * popularity[j])
                for j, cij in related_items.items()
            }
        return self.W

    def _neighbour_scores(self, user, K):
        """Return ``{item: score}`` for items ``user`` has not rated.

        For every item the user rated, its K most similar items each receive
        ``rating * similarity``; contributions to the same item are summed.
        """
        # Build the similarity matrix on first use if the caller has not.
        if getattr(self, "W", None) is None:
            self.ItemSimilarity()
        rated = self.train[user]
        scores = dict()
        for item, score in rated.items():
            neighbours = sorted(
                self.W[item].items(), key=lambda pair: pair[1], reverse=True
            )[:K]
            for j, wj in neighbours:
                if j in rated:
                    continue
                scores[j] = scores.get(j, 0) + score * wj
        return scores

    def Predict(self, user, movie, K=3):
        """Score ``movie`` for ``user`` using K neighbours per rated item.

        Returns:
            - the string ``"error"`` when the user already rated ``movie``
              (kept for backward compatibility with existing callers);
            - ``None`` when the user is unknown, or when ``movie`` is not
              among the K nearest neighbours of any item the user rated;
            - otherwise the accumulated ``rating * similarity`` sum.

        NOTE(review): the returned value is not divided by the sum of the
        similarities, so it is not on the rating scale; comparing it
        directly with true ratings inflates the error.
        """
        rated = self.train.get(user)
        if rated is None:
            return None
        # Membership test rather than truthiness, so a stored rating of 0
        # still counts as "already rated".
        if movie in rated:
            return "error"
        return self._neighbour_scores(user, K).get(movie)

    def Recommend(self, user, K=3, N=10):
        """Return the N highest-scoring unseen items for ``user`` as a dict.

        The dict maps item to score and is ordered from best to worst. An
        unknown user yields an empty dict.
        """
        if user not in self.train:
            return {}
        scores = self._neighbour_scores(user, K)
        best = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)[:N]
        return dict(best)

# Fit on the full ratings table and build the item-item similarity matrix.
model = ItemBasedCF(data)
sim = model.ItemSimilarity()
# Recommendations for user 100 with the defaults (K=3 neighbours, N=10 items);
# as the last expression of the cell, the returned dict is displayed below.
model.Recommend(100)

{748: 3.237503147891443,
 322: 2.322385613022473,
 246: 1.8895498093391012,
 896: 1.7679199997570947,
 902: 1.6386731552203917,
 329: 1.635259635065354,
 680: 1.6233588998268556,
 903: 1.6002645283972774,
 1433: 1.414213562373095,
 1429: 1.2247448713915892}

## 2. Performance Measurements

In [None]:
# Hold-out split by row position: the first 80000 rows train the model,
# the remaining rows are kept for evaluation.
train_data = data.iloc[:80000]
test_data = data.iloc[80000:]

test_model = ItemBasedCF(train_data)
test_sim = test_model.ItemSimilarity()

# Columns 0, 1 and 2 hold the user id, item id and rating respectively.
test_user, test_item, test_score = (test_data[column] for column in (0, 1, 2))
print(test_user)



In [141]:
# Mean absolute error of the model's predictions over the held-out ratings.
count = 0            # test rows for which no numeric prediction was produced
absolute_error = 0   # running sum of |prediction - true rating|
print(test_data)
# Walk the test columns in lockstep instead of hard-coding the row labels,
# so the loop stays correct if the split point or dataset size changes.
for user, item, actual in zip(test_user, test_item, test_score):
    prediction = test_model.Predict(user, item)
    # Predict returns None when it has no score and the string "error" when
    # the pair is already in the training data; neither can enter the error
    # sum, so both are counted as unscored.
    if prediction is None or isinstance(prediction, str):
        count += 1
    else:
        absolute_error += abs(prediction - actual)
scored = len(test_user) - count
# Avoid ZeroDivisionError when no test row could be scored.
mae = absolute_error / scored if scored else float("nan")

         0     1  2          3
80000  863  1431  4  889289618
80001  761  1287  1  876190072
80002  863   322  1  889289327
80003  828   694  2  891036717
80004  889   523  4  880178078
...    ...   ... ..        ...
99995  880   476  3  880175444
99996  716   204  5  879795543
99997  276  1090  1  874795795
99998   13   225  2  882399156
99999   12   203  3  879959583

[20000 rows x 4 columns]


In [143]:
# Report the number of unscored test rows, then the MAE, one per line.
print(count, mae, sep="\n")

11518
2.8636488020448856
