In [2]:
import numpy

# Read the ratings file, skipping the header row. Each remaining line is split
# on commas and its last field is dropped (presumably the timestamp column --
# TODO confirm against the CSV header), leaving [user id, movie id, rating].
# Blank lines are skipped so a trailing newline cannot break the int() parsing.
with open("ratings_small.csv", "r") as ratings_file:
    next(ratings_file, None)  # header row
    raw = [line.split(',')[:-1] for line in ratings_file if line.strip()]

# Ids are used as 1-based positions in the matrix, so the largest id seen
# gives the size of each dimension.
N = max(int(row[0]) for row in raw)  # N: num of User
M = max(int(row[1]) for row in raw)  # M: num of Movie
K = 5  # Num of Features

# Dense rating matrix allocated directly as float64; a 0 entry means the
# (user, movie) pair has no rating in the file.
R = numpy.zeros((N, M))
for row in raw:
    R[int(row[0]) - 1, int(row[1]) - 1] = float(row[2])

# Random initial factor matrices.
# NOTE(review): the generator is not seeded, so every run starts from a
# different initialisation and produces different predictions.
P = numpy.random.rand(N, K)
Q = numpy.random.rand(M, K)

In [3]:
def matrix_factorization(R, P, Q, K, steps=50, alpha=0.0002, beta=0.02):
    """Factorize the rating matrix R into user and item feature matrices.

    Runs stochastic gradient descent over the observed ratings only: an entry
    counts as observed when R[i][j] > 0, every other entry is ignored both for
    the updates and for the error.

    R: rating matrix, |U| * |D|
    P: |U| * K (User features matrix), updated in place
    Q: |D| * K (Item features matrix), updated in place for ndarray inputs,
       because the function works on the transposed view of Q
    K: latent features (the first K columns of P and Q are updated)
    steps: iterations (maximum number of passes over the observed ratings)
    alpha: learning rate
    beta: regularization parameter

    Returns the tuple (P, Q) with Q back in |D| * K orientation.
    Prints a progress percentage at the start of every step.
    """
    Q = Q.T

    ratings = numpy.asarray(R)
    # The set of observed entries never changes, so locate it once instead of
    # rescanning the whole matrix on every step. numpy.nonzero returns indices
    # in row-major order, i.e. the same visiting order as nested i/j loops.
    rows, cols = numpy.nonzero(ratings > 0)
    observed = list(zip(rows.tolist(), cols.tolist()))

    for step in range(steps):
        print(100 * step / steps, "%")
        for i, j in observed:
            # calculate error
            eij = ratings[i, j] - numpy.dot(P[i, :], Q[:, j])

            # Gradient step with learning rate alpha and regularization beta.
            # The order matters: P is updated first and the Q update then
            # reads the already-updated P values.
            P[i, :K] = P[i, :K] + alpha * (2 * eij * Q[:K, j] - beta * P[i, :K])
            Q[:K, j] = Q[:K, j] + alpha * (2 * eij * P[i, :K] - beta * Q[:K, j])

        # Regularized squared error over the observed entries. Rows/columns
        # are repeated once per rating, so a user or item with several ratings
        # contributes its penalty once per rating.
        residual = ratings[rows, cols] - numpy.sum(P[rows, :] * Q[:, cols].T, axis=1)
        e = numpy.sum(residual ** 2) + (beta / 2) * (
            numpy.sum(P[rows, :K] ** 2) + numpy.sum(Q[:K, cols] ** 2)
        )
        # Stop early once the error is small enough.
        if e < 0.001:
            break

    return P, Q.T


# Train the factor matrices on the rating matrix with the default settings.
nP, nQ = matrix_factorization(R, P, Q, K)

# Predicted rating for every (user, movie) pair: user factors times the
# transposed item factors.
nR = nP.dot(nQ.T)

# Show the predictions for the last user as a quick sanity check.
print(nR[-1])


0.0 %
2.0 %
4.0 %
6.0 %
8.0 %
10.0 %
12.0 %
14.0 %
16.0 %
18.0 %
20.0 %
22.0 %
24.0 %
26.0 %
28.0 %
30.0 %
32.0 %
34.0 %
36.0 %
38.0 %
40.0 %
42.0 %
44.0 %
46.0 %
48.0 %
50.0 %
52.0 %
54.0 %
56.0 %
58.0 %
60.0 %
62.0 %
64.0 %
66.0 %
68.0 %
70.0 %
72.0 %
74.0 %
76.0 %
78.0 %
80.0 %
82.0 %
84.0 %
86.0 %
88.0 %
90.0 %
92.0 %
94.0 %
96.0 %
98.0 %
[3.98147004 3.39077434 3.26532486 ... 2.15604133 2.3418918  3.02668902]


# Déploiement

In [9]:
import pickle
import pandas as pd

In [10]:
# Persist the predicted-rating matrix so it can be reloaded without retraining.
with open('nR.pickle', 'wb') as out_file:
    pickle.dump(nR, out_file, pickle.HIGHEST_PROTOCOL)

In [11]:
# Reload the predicted-rating matrix from disk and display it.
# NOTE(review): unpickling can execute arbitrary code; only load a file that
# this notebook (or another trusted source) produced.
with open('nR.pickle', 'rb') as in_file:
    nR = pickle.load(in_file)
print(nR)

[[2.41536335 1.98513236 1.83456782 ... 1.5228441  1.33001473 1.40575449]
 [3.8167391  3.15191909 2.97337332 ... 1.9668331  2.10118526 2.44966591]
 [3.41600894 3.15955171 2.9085284  ... 2.01407837 2.30148015 2.76949743]
 ...
 [3.42009625 3.11330694 2.95811246 ... 1.81651396 2.14733112 2.78864921]
 [3.48869194 3.07600582 2.79473768 ... 2.11276733 2.22257448 2.46079256]
 [3.98147004 3.39077434 3.26532486 ... 2.15604133 2.3418918  3.02668902]]


In [12]:
# Movie metadata table. recommandation() joins on its 'id' column and reads
# its 'original_title' column, so both must be present in this file.
meta = pd.read_csv('meta.csv')

In [16]:
def recommandation(user_id, top_n=20):
    """Return the titles of the movies with the highest predicted rating.

    Ranks every column of the global prediction matrix ``nR`` for the given
    user, keeps the ``top_n`` best 1-based movie ids, and looks their titles
    up in the global ``meta`` table.

    user_id: 1-based user id (row ``user_id - 1`` of ``nR``)
    top_n: how many of the best-ranked movie ids to look up (default 20)

    Returns a pandas Series of ``original_title`` values, best first.
    Raises IndexError when ``user_id`` is outside 1..len(nR); without this
    check an id of 0 or below would wrap around through negative indexing and
    silently return another user's recommendations.

    Notes:
    - The lookup is an inner join applied after the top-N cut, so fewer than
      ``top_n`` titles are returned when some ids are absent from ``meta``.
    - The ranking covers every movie column, so movies the user has already
      rated are not excluded.
    """
    n_users = len(nR)
    if not 1 <= user_id <= n_users:
        raise IndexError(
            "user_id %r is out of range (expected 1..%d)" % (user_id, n_users)
        )

    scores = numpy.asarray(nR[user_id - 1], dtype=float)
    # A stable sort on the negated scores gives descending order while keeping
    # tied movies in ascending id order.
    best_columns = numpy.argsort(-scores, kind="stable")[:top_n]

    # Column index -> 1-based movie id.
    id_ = pd.DataFrame({'id': (best_columns + 1).astype(numpy.int64)})
    # The inner merge keeps the order of the left keys, so the ranking survives.
    result = pd.merge(id_, meta, on="id")
    return result['original_title']
# Display the recommended titles for user 50.
recommandation(50)

0              黃飛鴻之西域雄獅
1    The Gerson Miracle
2                PiCNiC
Name: original_title, dtype: object