## 案例：给用户推荐电影

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import scipy.io as sio
from scipy.optimize import minimize

In [2]:
# Load the ratings data file and list the variables stored in it.
mat = sio.loadmat('data/ex8_movies.mat')
mat.keys()

dict_keys(['__header__', '__version__', '__globals__', 'Y', 'R'])

In [3]:
# Y holds the ratings (rows are movies, columns are users); R has the same
# shape and is used later as a mask selecting the entries that were rated.
Y,R = mat['Y'],mat['R']
Y.shape,R.shape

((1682, 943), (1682, 943))

In [4]:
# Load the second data file, which carries parameter matrices and size counts.
param_mat =sio.loadmat('data/ex8_movieParams.mat')
param_mat.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'Theta', 'num_users', 'num_movies', 'num_features'])

In [5]:
# Pull out the movie-feature matrix X, the user-parameter matrix Theta and
# the user / movie / feature counts (the counts are still 1x1 arrays here).
X,Theta,nu,nm,nf = param_mat['X'],param_mat['Theta'],param_mat['num_users'],param_mat['num_movies'],param_mat['num_features']

In [6]:
# Show the matrix shapes together with the raw count values.
X.shape,Theta.shape,nu,nm,nf

((1682, 10),
 (943, 10),
 array([[943]], dtype=uint16),
 array([[1682]], dtype=uint16),
 array([[10]], dtype=uint8))

In [7]:
# The counts are loaded as 1x1 arrays. Extract the single element explicitly:
# calling int() directly on an array with ndim > 0 is deprecated in recent
# NumPy releases. Going through np.asarray keeps this cell re-runnable once
# the names already hold plain Python ints.
nu = int(np.asarray(nu).item())
nm = int(np.asarray(nm).item())
nf = int(np.asarray(nf).item())
nu,nm,nf

(943, 1682, 10)

## 1. 序列化参数

In [8]:
def serialize(X,Theta):
    """Pack both parameter matrices into one flat 1-D vector.

    The elements of X come first (unrolled row by row), followed by the
    elements of Theta. The result is a new array.
    """
    flat_parts = (X.ravel(), Theta.ravel())
    return np.concatenate(flat_parts)

## 2. 反序列化参数

In [9]:
def deserialize(params,nm,nu,nf):
    """Split a flat parameter vector back into the (X, Theta) matrices.

    The first nm*nf entries are reshaped to (nm, nf) and returned as X;
    everything after that is reshaped to (nu, nf) and returned as Theta.
    """
    split_at = nm * nf
    movie_part, user_part = params[:split_at], params[split_at:]
    return movie_part.reshape(nm, nf), user_part.reshape(nu, nf)

## 3. 代价函数

In [10]:
def costFunction(params,Y,R,nm,nu,nf,lamda):
    """Regularized squared-error cost for the flat parameter vector.

    params is unpacked into X (nm x nf) and Theta (nu x nf). The data term
    is half the sum of squared differences between X @ Theta.T and Y,
    counted only where the mask R is non-zero. Each of X and Theta adds a
    penalty of 0.5 * lamda times the sum of its squared entries.
    """
    X,Theta = deserialize(params,nm,nu,nf)
    # Multiplying by R zeroes the residual wherever no rating exists.
    masked_residual = (X @ Theta.T - Y) * R
    data_term = 0.5 * np.square(masked_residual).sum()
    penalty_X = 0.5 * lamda * np.square(X).sum()
    penalty_Theta = 0.5 * lamda * np.square(Theta).sum()
    return data_term + penalty_X + penalty_Theta

In [11]:
# Evaluate the cost on a small slice of the loaded data (first 5 movies,
# first 4 users, first 3 features) with regularization switched off.
users = 4
movies = 5
features = 3
X_sub = X[:movies,:features]
Theta_sub = Theta[:users,:features]
Y_sub = Y[:movies,:users]
R_sub = R[:movies,:users]
cost1 = costFunction(serialize(X_sub,Theta_sub),Y_sub,R_sub,movies,users,features,lamda = 0)
cost1

22.224603725685675

In [12]:
# Same small slice as before, now with the regularization weight set to 1.5.
cost2 = costFunction(serialize(X_sub,Theta_sub),Y_sub,R_sub,movies,users,features,lamda = 1.5)
cost2

31.344056244274217

## 4.梯度

In [13]:
def costGradient(params,Y,R,nm,nu,nf,lamda):
    """Gradient of the regularized cost with respect to the flat parameters.

    params is unpacked into X (nm x nf) and Theta (nu x nf). With
    E = (X @ Theta.T - Y) * R, the gradients are
        dX     = E @ Theta + lamda * X
        dTheta = E.T @ X   + lamda * Theta
    They are returned packed in the same flat layout as params (X first,
    then Theta).
    """
    X,Theta = deserialize(params,nm,nu,nf)
    # Compute the masked residual once; it is shared by both gradients and is
    # the most expensive term (a full movies-by-users matrix product).
    masked_residual = (X@Theta.T - Y) * R
    X_grad = masked_residual@Theta + lamda * X
    Theta_grad = masked_residual.T@X + lamda * Theta
    return serialize(X_grad,Theta_grad)

## 5.添加一个新用户

In [14]:
# Ratings column for the new user: one row per movie, 0 meaning "not rated".
# Entries are filled in grouped by the score given.
my_ratings = np.zeros((nm,1))
my_ratings[[9, 66, 96, 958], 0] = 5
my_ratings[[121, 148, 490, 599, 643], 0] = 4
my_ratings[[285, 1117], 0] = 3

In [15]:
# Append the new user's column to the matrices as loaded from disk rather
# than to Y/R themselves, so that re-running this cell does not keep adding
# extra columns. The mask column is True wherever a rating was entered.
Y = np.c_[mat['Y'],my_ratings]
R = np.c_[mat['R'],my_ratings!=0]

In [16]:
# Only the last expression of a cell is displayed, so verify the ratings
# matrix against the mask with an assertion instead of a discarded expression.
assert Y.shape == R.shape
R.shape

(1682, 944)

In [17]:
# Refresh the movie / user counts so they include the newly appended user column.
nm,nu = Y.shape

## 6.均值归一化

In [18]:
def normalizeRatings(Y,R):
    """Mean-normalize the ratings row by row.

    Parameters
    ----------
    Y : 2-D array of ratings.
    R : array of the same shape; non-zero/True marks an entry as rated.

    Returns
    -------
    Y_norm : Y with each row's mean subtracted, zeroed where R is zero.
    Y_mean : column vector (n_rows, 1) of per-row means over rated entries.
        A row with no rated entries gets a mean of 0 instead of NaN.
    """
    rated_counts = R.sum(axis=1)
    # Mask before summing so values stored at unrated positions cannot leak
    # into the mean.
    rated_totals = (Y * R).sum(axis=1)
    # Avoid 0/0 for rows without any rating: their total is 0, so dividing
    # by 1 yields a mean of 0.
    safe_counts = np.where(rated_counts == 0, 1, rated_counts)
    Y_mean = (rated_totals / safe_counts).reshape(-1,1)
    Y_norm = (Y - Y_mean) * R
    return Y_norm,Y_mean

In [19]:
# Normalize the ratings (including the new user's column) and keep the
# per-movie means so they can be added back to the predictions.
Y_norm,Y_mean = normalizeRatings(Y,R)

## 7.参数初始化

In [20]:
# Seed the global NumPy generator so the random starting point, and therefore
# the trained model and the recommendations, are reproducible across runs.
np.random.seed(0)
# Random starting values in [0, 1) for the movie features and user parameters,
# packed into the flat vector expected by the optimizer.
X = np.random.random((nm,nf))
Theta = np.random.random((nu,nf))
params = serialize(X,Theta)
# Regularization weight used for training.
lamda = 10

## 8. 模型训练

In [21]:
# Fit X and Theta jointly on the mean-normalized ratings, supplying the
# analytic gradient. `minimize` is imported in the notebook's import cell.
# NOTE(review): confirm the installed SciPy still honors 'maxiter' for TNC;
# newer releases may expect 'maxfun' as the evaluation cap instead.
res = minimize(fun = costFunction,
        x0 = params,
        args = (Y_norm,R,nm,nu,nf,lamda),
        method = 'TNC',
        jac = costGradient,
        options = {'maxiter':100})

In [22]:
# Flat parameter vector returned by the optimizer.
params_fit = res.x

In [23]:
# Unpack the fitted vector back into the movie-feature and user-parameter matrices.
fit_X,fit_Theta = deserialize(params_fit,nm,nu,nf)

## 9.预测

In [30]:
# Predicted (mean-normalized) rating for every movie/user pair.
Y_pred = fit_X@fit_Theta.T

In [31]:
# The last column belongs to the user appended earlier; add back the
# per-movie mean that normalization removed.
y_pred = Y_pred[:,-1] + Y_mean.flatten()

In [32]:
# Movie indices ordered from highest to lowest predicted rating.
index = np.argsort(-y_pred)

In [33]:
# Indices of the ten highest predictions.
index[:10]

array([1466, 1292, 1200, 1499, 1121, 1652,  813, 1598, 1188, 1535])

In [35]:
# Read the movie titles: each line is split at its first space and only the
# part after it (the title) is kept.
with open('data/movie_ids.txt','r',encoding='latin 1') as f:
    movies = [line.strip().partition(' ')[2] for line in f]

In [36]:
# Number of titles read from the file.
len(movies)

1682

In [38]:
# Print index, title and predicted rating for the ten best-scoring movies.
for i in range(10):
    print(f"{index[i]} {movies[index[i]]} {y_pred[index[i]]}")

1466 Saint of Fort Washington, The (1993) 5.001810464524898
1292 Star Kid (1997) 5.001694543358842
1200 Marlene Dietrich: Shadow and Light (1996) 5.00149967272587
1499 Santa with Muscles (1996) 5.001334854576942
1121 They Made Me a Criminal (1939) 5.000999967820586
1652 Entertaining Angels: The Dorothy Day Story (1996) 5.000975637072753
813 Great Day in Harlem, A (1994) 5.000799086576183
1598 Someone Else's America (1995) 5.0006016104689905
1188 Prefontaine (1997) 5.000467537343162
1535 Aiqing wansui (1994) 5.000239077425466
