In [1]:
import math
import requests
import pandas as pd
import collections
def get_path(movie_id):
    """Return the TMDB poster URL for ``movie_id``, or ``"-1"`` if unavailable.

    Parameters
    ----------
    movie_id : int or str
        TMDB movie identifier, interpolated into the request URL.

    Returns
    -------
    str
        ``"https://image.tmdb.org/t/p/w500/" + poster_path`` when the API
        response contains a non-empty ``poster_path``; otherwise ``"-1"``
        (this includes error payloads that carry no ``poster_path`` key).

    Raises
    ------
    requests.RequestException
        On network failure or when the request exceeds the timeout.
    """
    import os

    # SECURITY: the API key used to be embedded in the URL. It can now be
    # supplied through TMDB_API_KEY; the historical value is kept only as a
    # fallback so existing runs keep working and should be rotated/removed.
    api_key = os.environ.get("TMDB_API_KEY", "8265bd1679663a7ea12ac168da84d2e8")
    url = "https://api.themoviedb.org/3/movie/{}".format(movie_id)
    response = requests.get(
        url,
        params={"api_key": api_key, "language": "en-US"},
        timeout=10,  # never hang forever on an unresponsive server
    )
    payload = response.json()
    # Error payloads (unknown id, bad key, ...) have no 'poster_path' key, so
    # use .get() instead of indexing to avoid a KeyError.
    poster_path = payload.get('poster_path') if isinstance(payload, dict) else None
    if poster_path:
        return "https://image.tmdb.org/t/p/w500/" + poster_path
    return "-1"


def load_dataset(file_name, k_to_test, max_rows=26024289, sample_rate=10000):
    """Load a ratings CSV and split each user's items into train and test sets.

    Parameters
    ----------
    file_name : str
        Path of a CSV file with at least ``userId`` and ``movieId`` columns.
        (Previously this argument was silently replaced by a hard-coded path.)
    k_to_test : int
        Number of items per user to hold out for testing. ``0`` keeps every
        item in the training set.
    max_rows : int or None, optional
        Only the first ``max_rows`` rows of the file are used; ``None`` uses
        all rows. Defaults to the value that was hard-coded before.
    sample_rate : int, optional
        Per-user cap: once a user has more than ``sample_rate`` distinct
        items, that user's further rows are skipped.

    Returns
    -------
    tuple of collections.defaultdict(set)
        ``(train_item_2_user_set, test_item_2_user_set,
        test_uer_2_item_set, train_user_2_item_set)``.

    Raises
    ------
    ValueError
        If ``k_to_test`` is negative.

    Notes
    -----
    The held-out items are the last ``k_to_test`` elements of
    ``list(item_set)``; because a set is unordered this is an arbitrary
    selection, not necessarily the user's most recent ratings.
    """
    if k_to_test < 0:
        raise ValueError("k_to_test must be non-negative")

    # step 1: read only the two columns that are actually used
    df = pd.read_csv(file_name, usecols=["userId", "movieId"])
    if max_rows is not None:
        df = df.head(max_rows)
    user_list = df["userId"].tolist()
    item_list = df["movieId"].tolist()
    print("len(set(item_list)),len(df),len(set(user_list))",len(set(item_list)),len(df),len(set(user_list)))

    # step 2: pre process -- collect each user's distinct items, capped per user
    user_2_item_set = collections.defaultdict(set)
    rating_cnt = 0
    total_user_set = set()
    total_item_set = set()
    for user, item in zip(user_list, item_list):
        if len(user_2_item_set[user]) > sample_rate:
            continue
        user_2_item_set[user].add(item)
        rating_cnt += 1
        total_user_set.add(user)
        total_item_set.add(item)
    print("len(set(item_list)),len(df),len(set(user_list))",len(total_item_set),rating_cnt,len(total_user_set))

    # long tail distribution for drawing: [items per user, number of such users]
    cnt2cnt = collections.Counter(len(item_set) for item_set in user_2_item_set.values())
    tmp_item = sorted([cnt, user_cnt] for cnt, user_cnt in cnt2cnt.items())
    print(tmp_item)

    # step 3: split train and test, the last k_to_test listed items are test labels
    train_user_2_item_set = collections.defaultdict(set)
    test_uer_2_item_set = collections.defaultdict(set)
    train_item_2_user_set = collections.defaultdict(set)
    test_item_2_user_set = collections.defaultdict(set)
    for user, item_set in user_2_item_set.items():
        tmp_list = list(item_set)
        # An explicit split index avoids the [:-0] / [-0:] pitfall, which sent
        # every item to the test set when k_to_test was 0.
        split = max(len(tmp_list) - k_to_test, 0)
        train_items = tmp_list[:split]
        test_items = tmp_list[split:]
        train_user_2_item_set[user] = set(train_items)
        test_uer_2_item_set[user] = set(test_items)

        for item in train_items:
            train_item_2_user_set[item].add(user)
        for item in test_items:
            test_item_2_user_set[item].add(user)

    return train_item_2_user_set, test_item_2_user_set, test_uer_2_item_set,train_user_2_item_set


def get_item_sim(item_2_user_set):
    """Compute pairwise item similarity from the users each item shares.

    For every ordered pair of distinct items whose user sets are both
    non-empty, the score is ``|A & B| / sqrt(|A| * |B|)``. Pairs with no
    common user are still recorded, with score ``0.0``.

    Parameters
    ----------
    item_2_user_set : mapping
        item -> set of users associated with that item.

    Returns
    -------
    collections.defaultdict(list)
        item -> list of ``[score, other_item]`` in the mapping's iteration
        order. Items that produce no pair get no key.

    Notes
    -----
    Every item is compared with every other item, so the work and the size of
    the result grow quadratically with the number of items. A progress line
    is printed every 100 outer items.
    """
    similarities = collections.defaultdict(list)
    for position, (left_item, left_users) in enumerate(item_2_user_set.items(), start=1):
        if position % 100 == 0:
            print("Training ====",position)
        left_size = len(left_users)
        for right_item, right_users in item_2_user_set.items():
            if left_item == right_item:
                continue
            right_size = len(right_users)
            # Either side empty -> the denominator would be zero, skip the pair.
            if left_size == 0 or right_size == 0:
                continue
            overlap = len(left_users & right_users)
            similarities[left_item].append(
                [overlap / math.sqrt(left_size * right_size), right_item]
            )
    return similarities

def predict_itemcf(train_user_2_item_set, user, item_2_item_sim, top_n ):
    """Recommend up to ``top_n`` unseen items for ``user`` with item-based CF.

    Each candidate's score is the sum of its similarities to the items in the
    user's training set. Items the user already has are never recommended.

    Parameters
    ----------
    train_user_2_item_set : mapping
        user -> set of items in that user's training data.
    user : hashable
        The user to recommend for. An unknown user yields an empty list.
    item_2_item_sim : mapping
        item -> iterable of ``[score, other_item]`` pairs.
    top_n : int
        Maximum number of recommendations; ``0`` yields an empty list.

    Returns
    -------
    list
        ``[score, item]`` pairs sorted from highest to lowest score (ties
        broken by item, descending).
    """
    # .get() rather than indexing: an unknown user must not add an empty entry
    # to a caller-owned defaultdict, and plain dicts must not raise KeyError.
    behavior = train_user_2_item_set.get(user, set())
    item2score = collections.defaultdict(float)
    for i in behavior:
        for j_score, j_item in item_2_item_sim.get(i, ()):
            if j_item not in behavior: # can not recommend movie which ever been seen
                item2score[j_item] += j_score

    if top_n <= 0:
        return []
    # Sorting descending and slicing from the front avoids the `[-top_n:]`
    # idiom, which returned the whole list for top_n == 0.
    ranked = sorted(([value, key] for key, value in item2score.items()), reverse=True)
    return ranked[:top_n]



def get_user_sim(user_2_item_set):
    """Compute pairwise user similarity from the items each user shares.

    For every ordered pair of distinct users whose item sets are both
    non-empty, the score is ``|A & B| / sqrt(|A| * |B|)``. Pairs with no
    common item are still recorded, with score ``0.0``.

    Parameters
    ----------
    user_2_item_set : mapping
        user -> set of items associated with that user.

    Returns
    -------
    collections.defaultdict(list)
        user -> list of ``[score, other_user]`` in the mapping's iteration
        order. Users that produce no pair get no key.

    Notes
    -----
    Every user is compared with every other user, so the work and the size of
    the result grow quadratically with the number of users. A progress line
    is printed every 100 outer users.
    """
    similarities = collections.defaultdict(list)
    for position, (left_user, left_items) in enumerate(user_2_item_set.items(), start=1):
        if position % 100 == 0:
            print("Training ====",position)
        left_size = len(left_items)
        for right_user, right_items in user_2_item_set.items():
            if left_user == right_user:
                continue
            right_size = len(right_items)
            # Either side empty -> the denominator would be zero, skip the pair.
            if left_size == 0 or right_size == 0:
                continue
            shared = len(left_items & right_items)
            similarities[left_user].append(
                [shared / math.sqrt(left_size * right_size), right_user]
            )
    return similarities



def predict_usercf(train_user_2_item_set, user, user_2_user_sim, top_n ):
    """Recommend up to ``top_n`` unseen items for ``user`` with user-based CF.

    Each candidate's score is the sum of the similarities of all neighbours
    that have the item in their training set. Items already in the target
    user's own training set are excluded, matching ``predict_itemcf``
    (previously they were ranked too and could fill the top-N slots).

    Parameters
    ----------
    train_user_2_item_set : mapping
        user -> set of items in that user's training data.
    user : hashable
        The user to recommend for. A user without neighbours yields ``[]``.
    user_2_user_sim : mapping
        user -> iterable of ``[score, neighbour_user]`` pairs.
    top_n : int
        Maximum number of recommendations; ``0`` yields an empty list.

    Returns
    -------
    list
        ``[score, item]`` pairs sorted from highest to lowest score (ties
        broken by item, descending).
    """
    # .get() keeps caller-owned defaultdicts free of accidental empty entries.
    seen = train_user_2_item_set.get(user, set())
    item2score_sum = collections.defaultdict(float)
    for u_score, nei in user_2_user_sim.get(user, ()):
        for item in train_user_2_item_set.get(nei, ()):
            if item not in seen: # can not recommend movie which ever been seen
                item2score_sum[item] += u_score

    if top_n <= 0:
        return []
    # Sorting descending and slicing from the front avoids the `[-top_n:]`
    # idiom, which returned the whole list for top_n == 0.
    ranked = sorted(([value, key] for key, value in item2score_sum.items()), reverse=True)
    return ranked[:top_n]


def _evaluate_top_n(predict, test_user_2_item_set, top_n_list):
    """Print recall and precision of ``predict`` at every cut-off in ``top_n_list``.

    ``predict(user, n)`` must return a ranked list of ``[score, item]`` pairs
    of length at most ``n``. It is called once per test user with the largest
    cut-off, and the result is sliced for the smaller ones.

    Recall is hits / total held-out items and precision is
    hits / total recommended items, both summed over all test users. A zero
    denominator (no test items, or no recommendations at all) reports ``0.0``
    instead of raising ZeroDivisionError.
    """
    max_n = max(top_n_list)
    hits = [0] * len(top_n_list)
    relevant = [0] * len(top_n_list)
    recommended = [0] * len(top_n_list)

    for cnt, (user, test_item_set) in enumerate(test_user_2_item_set.items(), start=1):
        if cnt % 1000 == 0:
            print(cnt)
        score2item = predict(user, max_n)
        for i, n in enumerate(top_n_list):
            top = score2item[:n]
            hits[i] += sum(1 for _score, item in top if item in test_item_set)
            recommended[i] += len(top)
            relevant[i] += len(test_item_set)

    for i, n in enumerate(top_n_list):
        recall = hits[i] / relevant[i] if relevant[i] else 0.0
        precision = hits[i] / recommended[i] if recommended[i] else 0.0
        print("n,recall,prescsion", n, recall, precision)


def user_cf():
    """Train user-based CF on ``archive/ratings.csv`` and print recall/precision.

    Holds out ``k = 3`` items per user, builds the user-user similarity lists
    from the training split, and evaluates at Top-N cut-offs 1, 3, 5, 8, 10
    and 20.
    """
    k = 3
    #file_name = "archive/ratings_small.csv"
    file_name = "archive/ratings.csv"
    train_item_2_user_set, test_item_2_user_set, test_uer_2_item_set, train_user_2_item_set = load_dataset(file_name, k)

    # 1. Train model: user similarity
    user_2_user_sim = get_user_sim(train_user_2_item_set)

    # 2. Predict based on user CF and evaluate
    top_n_list = [1,3,5,8,10,20]
    _evaluate_top_n(
        lambda user, n: predict_usercf(train_user_2_item_set, user, user_2_user_sim, n),
        test_uer_2_item_set,
        top_n_list,
    )


def item_cf():
    """Train item-based CF on ``archive/ratings.csv`` and print recall/precision.

    Holds out ``k = 3`` items per user, builds the item-item similarity lists
    from the training split, and evaluates at Top-N cut-offs 1, 3, 5, 8, 10
    and 20.
    """
    k = 3 # number of items held out per user
    # file_name = "archive/ratings_small.csv"
    file_name = "archive/ratings.csv"
    # step 1: load data and get train and test
    train_item_2_user_set, test_item_2_user_set, test_uer_2_item_set, train_user_2_item_set = load_dataset(file_name, k)

    # step 2: train model -- item similarity
    item_2_item_sim = get_item_sim(train_item_2_user_set)

    # step 3: predict and evaluate
    top_n_list = [1,3,5,8,10,20]
    _evaluate_top_n(
        lambda user, n: predict_itemcf(train_user_2_item_set, user, item_2_item_sim, n),
        test_uer_2_item_set,
        top_n_list,
    )

def draw_picture():
    """Plot previously recorded Item-CF recall values against the Top-N cut-off.

    The numbers are hard-coded results from earlier runs. The User-CF series
    is kept for reference but its plot call is disabled.
    NOTE(review): relies on a module-level ``plt`` (presumably
    ``matplotlib.pyplot``) that is not imported in the visible cells -- confirm.
    """
    top_n_values = [1,3,5,8,10,20]
    item_cf_recall = [0.019662921348314606,0.047752808988764044,0.05898876404494382,0.09269662921348315,0.10112359550561797,0.15168539325842698]
    user_cf_recall = [0.0,0.0028089887640449437,0.011235955056179775,0.033707865168539325,0.03651685393258427,0.06741573033707865]

    plt.plot(top_n_values, item_cf_recall, label='Item-CF', marker='o')
    # plt.plot(top_n_values, user_cf_recall, label='User-CF', marker='+')
    plt.xlabel('Top-N Items')
    plt.ylabel('Recall')
    plt.legend()
    plt.show()

In [4]:
# Number of items per user passed to load_dataset as the test hold-out size.
k = 3
file_name = "archive/ratings.csv"
# Step 1: load the ratings and build the train/test mappings. They are kept as
# notebook globals because later cells (similarity, websocket server) use them.
train_item_2_user_set, test_item_2_user_set, test_uer_2_item_set, train_user_2_item_set = load_dataset(file_name, k)

len(set(item_list)),len(df),len(set(user_list)) 45115 26024289 270896
len(set(item_list)),len(df),len(set(user_list)) 44268 26016014 270896
[[1, 5084], [2, 3609], [3, 3247], [4, 2849], [5, 7324], [6, 5383], [7, 4003], [8, 3008], [9, 2486], [10, 6795], [11, 4412], [12, 3649], [13, 3157], [14, 2868], [15, 19289], [16, 10105], [17, 7012], [18, 5581], [19, 4591], [20, 4947], [21, 3599], [22, 3259], [23, 3055], [24, 2788], [25, 2816], [26, 2558], [27, 2385], [28, 2373], [29, 2302], [30, 2270], [31, 2088], [32, 2053], [33, 1941], [34, 1842], [35, 1890], [36, 1688], [37, 1708], [38, 1618], [39, 1571], [40, 1693], [41, 1546], [42, 1450], [43, 1378], [44, 1382], [45, 1314], [46, 1311], [47, 1315], [48, 1238], [49, 1227], [50, 1460], [51, 1267], [52, 1201], [53, 1167], [54, 1163], [55, 1202], [56, 1090], [57, 1051], [58, 970], [59, 1079], [60, 1052], [61, 990], [62, 938], [63, 939], [64, 872], [65, 883], [66, 831], [67, 856], [68, 841], [69, 820], [70, 852], [71, 785], [72, 827], [73, 730], [74,

NameError: name 'item_2_item_sim' is not defined

In [5]:
# Build the item-item similarity lists from the training item -> users map.
# get_item_sim compares every item with every other item, so on the full
# dataset this cell runs for a very long time (it prints progress every 100 items).
item_2_item_sim = get_item_sim(train_item_2_user_set)

Training ==== 100
Training ==== 200
Training ==== 300
Training ==== 400
Training ==== 500
Training ==== 600
Training ==== 700
Training ==== 800
Training ==== 900
Training ==== 1000
Training ==== 1100
Training ==== 1200


KeyboardInterrupt: 

In [18]:
def processed(itemlist):
    """Join the item ids of a recommendation list into one space-separated string.

    Parameters
    ----------
    itemlist : sequence
        ``[score, item]`` pairs, e.g. the result of ``predict_itemcf``.

    Returns
    -------
    str
        The items (second element of each pair) converted with ``str`` and
        separated by single spaces; ``''`` for an empty list.
    """
    # The original indexed an undefined global `a` instead of `itemlist`, and
    # concatenated the raw item to a string, which fails for integer ids.
    return ' '.join(str(entry[1]) for entry in itemlist)

In [None]:
import asyncio
import websockets
import nest_asyncio
nest_asyncio.apply()

# __import__('IPython').embed()

async def echo(websocket):
    """Answer each incoming websocket message with item-CF recommendations.

    Every message is treated as a user id. The reply is the top-10
    recommended item ids for that user as one space-separated string (empty
    when there is nothing to recommend).

    Relies on the notebook globals ``train_user_2_item_set`` and
    ``item_2_item_sim`` and on the helpers ``predict_itemcf`` / ``processed``.
    """
    async for message in websocket:
        # The iterated `message` already is the received payload; the previous
        # extra `websocket.recv()` was never awaited and would have consumed a
        # second message.
        print(message)
        # Messages arrive as text/bytes while user ids loaded from the CSV
        # are integers -- presumably clients send the numeric id; fall back to
        # the raw message if it is not a number. TODO confirm client protocol.
        try:
            user = int(message)
        except (TypeError, ValueError):
            user = message
        score2item = predict_itemcf(train_user_2_item_set, user, item_2_item_sim, 10)
        result = processed(score2item)
        # send() is a coroutine: without `await` nothing is transmitted.
        await websocket.send(result)

async def main():
    """Serve the ``echo`` handler on ws://localhost:8766 until cancelled."""
    server = websockets.serve(echo, "localhost", 8766)
    async with server:
        # A future that is never resolved keeps the coroutine, and therefore
        # the server context, alive: run forever.
        never_done = asyncio.Future()
        await never_done

# Start the websocket server. main() waits on a future that never completes,
# so this cell keeps running until it is interrupted.
asyncio.run(main())