In [2]:
import os
import math
import random
import pandas as pd

In [9]:
class ItemBasedCF:
    """Item-based collaborative filtering on ``user::item::rating::timestamp`` records.

    All state is held in plain nested dicts keyed by the (string) ids read
    from the input file:

    * ``train`` / ``test`` -- ``{user: {item: rating}}`` for each split.
    * ``W`` -- ``{item: {other_item: similarity}}``, filled by
      :meth:`item_similarity`.
    """

    def __init__(self, path, pivot=0.7, seed=None):
        """Load ``path`` and randomly split its ratings into train and test.

        Args:
            path: Ratings file with one ``user::item::rating::timestamp``
                record per line.
            pivot: Probability that a record goes to the training split.
            seed: Optional seed for a private random generator so the split
                is reproducible. ``None`` keeps using the global ``random``
                module state.
        """
        self.train = {}
        self.test = {}
        # Defined up front so recommend() can tell "not computed yet" apart
        # from a missing attribute.
        self.W = {}
        self.generate_dataset(path, pivot=pivot, seed=seed)

    def load_file(self, path):
        """Yield every line of ``path`` with surrounding whitespace stripped."""
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                yield line.strip()

    def generate_dataset(self, path, pivot=0.7, seed=None):
        """Parse the ratings file and fill ``self.train`` / ``self.test``.

        Each record is assigned to the training split with probability
        ``pivot`` and to the test split otherwise. The first 11 records are
        echoed to stdout as a quick sanity check of the parsing.

        Args:
            path: Ratings file to read.
            pivot: Probability of a record landing in the training split.
            seed: Optional seed for a private generator; ``None`` draws from
                the global ``random`` module.

        Raises:
            ValueError: If a non-blank line does not have exactly four
                ``::``-separated fields or its rating is not an integer.
        """
        rng = random if seed is None else random.Random(seed)
        parsed = 0
        for line in self.load_file(path):
            if not line:
                # A trailing or stray blank line would otherwise break the
                # four-way unpacking below.
                continue
            user, movie, rating, timestamp = line.split('::')
            if parsed <= 10:
                print(' {}, {}, {}, {}'.format(user, movie, rating, timestamp))
            parsed += 1
            split = self.train if rng.random() < pivot else self.test
            split.setdefault(user, {})[movie] = int(rating)

    def item_similarity(self):
        """Compute item-item similarity from the training split.

        For items ``i`` and ``j`` the similarity is
        ``C[i][j] / sqrt(N[i] * N[j])`` where ``C[i][j]`` is the number of
        training users who rated both and ``N[i]`` the number who rated ``i``.

        Returns:
            tuple: ``(W, C, N)`` -- the similarity matrix (also stored on
            ``self.W``), the co-occurrence counts and the per-item user
            counts.
        """
        C = dict()
        N = dict()
        for user, items in self.train.items():
            for i in items:
                N[i] = N.get(i, 0) + 1
                row = C.setdefault(i, {})
                for j in items:
                    if i == j:
                        continue
                    row[j] = row.get(j, 0) + 1
        self.W = dict()
        for i, related_items in C.items():
            sims = self.W.setdefault(i, {})
            for j, cij in related_items.items():
                sims[j] = cij / (math.sqrt(N[i] * N[j]))
        return self.W, C, N

    def recommend(self, u, k=3, n=10):
        """Recommend up to ``n`` unseen items for user ``u``.

        For every item the user rated in the training split, its ``k`` most
        similar items contribute ``rating * similarity`` to their score;
        items the user already rated are skipped.

        The similarity matrix is built on first use if
        :meth:`item_similarity` has not been called yet.

        Args:
            u: User id (same type as the keys of ``self.train``).
            k: Number of nearest neighbours considered per rated item.
            n: Maximum number of recommendations returned.

        Returns:
            dict: ``{item: score}`` ordered by descending score. Empty when
            ``u`` has no training ratings.
        """
        if not getattr(self, 'W', None):
            self.item_similarity()
        rank = dict()
        # A user that only landed in the test split has no history here;
        # return nothing instead of raising KeyError.
        action_item = self.train.get(u, {})
        for i, score in action_item.items():
            neighbours = sorted(self.W.get(i, {}).items(),
                                key=lambda x: x[1], reverse=True)[:k]
            for j, wj in neighbours:
                if j in action_item:
                    continue
                rank[j] = rank.get(j, 0) + score * wj
        return dict(sorted(rank.items(), key=lambda x: x[1], reverse=True)[:n])

    def recall_precision(self, k=8, nitem=10):
        """Evaluate recommendations against the test split.

        Args:
            k: Neighbour count forwarded to :meth:`recommend`.
            nitem: Recommendation list length forwarded to :meth:`recommend`;
                it is also what every test user adds to the precision
                denominator.

        Returns:
            tuple: ``(recall, precision)``. Either value is ``0.0`` when its
            denominator is zero (e.g. an empty test split).
        """
        hit = 0
        relevant = 0
        recommended = 0
        for user, items in self.test.items():
            rank = self.recommend(user, k=k, n=nitem)
            hit += len(rank.keys() & items.keys())
            relevant += len(items)
            recommended += nitem
        recall = hit / relevant if relevant else 0.0
        precision = hit / recommended if recommended else 0.0
        return recall, precision

In [4]:
def print_2_dim_dic(dic, n=3):
    """Print a small preview of a nested ``{outer: {inner: value}}`` dict.

    Args:
        dic: Mapping whose values are themselves mappings.
        n: Maximum number of outer keys to show, and the maximum number of
            inner entries shown for each of them.

    Each shown entry is printed as ``outer inner value`` on its own line.
    """
    for outer_seen, (u, v_cnt) in enumerate(dic.items()):
        if outer_seen >= n:
            break
        for inner_seen, (v, cnt) in enumerate(v_cnt.items()):
            if inner_seen >= n:
                break
            print(u, v, cnt)

In [5]:
def print_1_dim_dic(dic, n=3):
    """Print the first ``n`` ``key value`` pairs of ``dic``, one per line.

    Args:
        dic: Mapping to preview (iteration order decides what is shown).
        n: Maximum number of entries to print.
    """
    for seen, (u, i_cnt) in enumerate(dic.items()):
        if seen >= n:
            break
        print(u, i_cnt)

In [6]:
def sort_2_dim_dic(dic, k, n=5):
    """Return the ``n`` largest ``(key, value)`` pairs of the inner dict ``dic[k]``.

    Pairs are ordered by value, highest first; ties keep the inner dict's
    iteration order.
    """
    pairs = list(dic[k].items())
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs[:n]

def sort_1_dim_dic(dic, n=5):
    """Return the ``n`` largest ``(key, value)`` pairs of ``dic``.

    Pairs are ordered by value, highest first; ties keep the dict's
    iteration order.
    """
    pairs = list(dic.items())
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs[:n]

def trans_dic_2_dim(dic):
    """Turn a nested ``{row: {col: value}}`` dict into a DataFrame.

    Outer keys become the index, inner keys become the columns, and any
    missing (row, col) combination is filled with 0.
    """
    by_column = pd.DataFrame(dic)
    by_row = by_column.transpose()
    return by_row.fillna(0)

In [11]:
# Directory holding ratings.dat (presumably the MovieLens 1M dump, judging by
# the folder name -- confirm). Set the ML1M_DIR environment variable to point
# somewhere else; the fallback is the original author's local directory.
data_dir = os.environ.get("ML1M_DIR", r"C:\Users\cmj\Documents\Codes\Dataset\ml-1m")
path = os.path.join(data_dir, "ratings.dat")
# Loading the file also performs the random train/test split.
icf = ItemBasedCF(path)

 1, 1193, 5, 978300760
 1, 661, 3, 978302109
 1, 914, 3, 978301968
 1, 3408, 4, 978300275
 1, 2355, 5, 978824291
 1, 1197, 3, 978302268
 1, 1287, 5, 978302039
 1, 2804, 5, 978300719
 1, 594, 4, 978302268
 1, 919, 4, 978301368
 1, 595, 5, 978824268


In [12]:
# Build the item-item model from the training split:
# i_W = similarity matrix, i_C = co-occurrence counts, i_N = users per item.
i_W, i_C, i_N = icf.item_similarity()

In [13]:
# Co-occurrence counts as an item x item DataFrame (missing pairs become 0).
df_ic = trans_dic_2_dim(i_C)

In [14]:
# Peek at the first 10 rows and 10 columns of the co-occurrence table.
df_ic.iloc[:10,:10]

Unnamed: 0,661,914,3408,2355,1197,1287,2804,594,919,938
1193,92.0,157.0,219.0,275.0,386.0,181.0,309.0,169.0,426.0,40.0
661,0.0,61.0,74.0,158.0,145.0,44.0,104.0,134.0,152.0,30.0
914,61.0,0.0,89.0,140.0,210.0,110.0,133.0,168.0,241.0,72.0
3408,74.0,89.0,0.0,265.0,269.0,104.0,192.0,106.0,211.0,28.0
2355,158.0,140.0,265.0,0.0,432.0,155.0,276.0,228.0,335.0,35.0
1197,145.0,210.0,269.0,432.0,0.0,236.0,444.0,252.0,515.0,57.0
1287,44.0,110.0,104.0,155.0,236.0,0.0,161.0,128.0,232.0,34.0
2804,104.0,133.0,192.0,276.0,444.0,161.0,0.0,178.0,380.0,37.0
594,134.0,168.0,106.0,228.0,252.0,128.0,178.0,0.0,311.0,62.0
919,152.0,241.0,211.0,335.0,515.0,232.0,380.0,311.0,0.0,74.0


In [15]:
# (rows, columns) of the co-occurrence table.
df_ic.shape

(3657, 3657)

In [16]:
# Show a few item -> number-of-training-users entries.
print_1_dim_dic(i_N)

1193 1203
661 358
914 446


In [17]:
# Similarity scores as an item x item DataFrame (missing pairs become 0).
df_iw = trans_dic_2_dim(i_W)

In [18]:
# (rows, columns) of the similarity table.
df_iw.shape

(3657, 3657)

In [19]:
# Peek at the first 10 rows and 10 columns of the similarity table.
df_iw.iloc[:10, :10]

Unnamed: 0,661,914,3408,2355,1197,1287,2804,594,919,938
1193,0.140189,0.214338,0.211887,0.230422,0.278224,0.231306,0.287985,0.208908,0.350921,0.101935
661,0.0,0.152658,0.131245,0.242683,0.191587,0.103075,0.177679,0.303643,0.229527,0.140144
914,0.152658,0.0,0.141422,0.192657,0.248595,0.230869,0.203577,0.341069,0.326048,0.301342
3408,0.131245,0.141422,0.0,0.258442,0.225676,0.154692,0.208276,0.152511,0.202306,0.083051
2355,0.242683,0.192657,0.258442,0.0,0.313869,0.199663,0.259285,0.284092,0.278164,0.089906
1197,0.191587,0.248595,0.225676,0.313869,0.0,0.261513,0.358812,0.27011,0.367857,0.125953
1287,0.103075,0.230869,0.154692,0.199663,0.261513,0.0,0.230681,0.243249,0.293806,0.133203
2804,0.177679,0.203577,0.208276,0.259285,0.358812,0.230681,0.0,0.246697,0.350962,0.105716
594,0.303643,0.341069,0.152511,0.284092,0.27011,0.243249,0.246697,0.0,0.380972,0.234956
919,0.229527,0.326048,0.202306,0.278164,0.367857,0.293806,0.350962,0.380972,0.0,0.186878


In [20]:
# Recommendations for user '520' with the method defaults (k=3 neighbours per
# rated item, at most n=10 results). User ids are strings, as read from the file.
recomend = icf.recommend('520')

In [21]:
# Display the item -> score mapping, highest score first.
recomend

{'50': 13.613752267730005,
 '594': 12.291936632211,
 '1196': 10.384611751964847,
 '1265': 9.436108021147595,
 '2396': 8.76532089936368,
 '595': 8.372550660618792,
 '3421': 7.846968326901465,
 '260': 6.402299652221059,
 '914': 6.166774460148543,
 '1252': 5.852933977220801}

In [None]:
# Evaluate on the test split with the method defaults (k=8, nitem=10);
# the result is a (recall, precision) tuple.
icf.recall_precision()