In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/netflix-prize-data/combined_data_3.txt
/kaggle/input/netflix-prize-data/movie_titles.csv
/kaggle/input/netflix-prize-data/combined_data_4.txt
/kaggle/input/netflix-prize-data/combined_data_1.txt
/kaggle/input/netflix-prize-data/README
/kaggle/input/netflix-prize-data/probe.txt
/kaggle/input/netflix-prize-data/combined_data_2.txt
/kaggle/input/netflix-prize-data/qualifying.txt


In [2]:
from collections import deque
# Load the raw ratings file. Rows come in two flavours:
#   * movie header rows  - only the first field is present, so Rating is NaN;
#   * rating rows        - User, Rating, Date for the movie named by the header above them.
ds_raw = pd.read_csv('../input/netflix-prize-data/combined_data_1.txt', header=None, names=['User', 'Rating', 'Date'], usecols=[0, 1, 2])

# Header rows are exactly the rows without a rating.
is_header = ds_raw['Rating'].isna()

# Movie id for every row: parse it on header rows (dropping the trailing character of the
# header field, then converting to int) and carry it forward onto the rating rows below.
# This replaces slicing and copying one DataFrame per movie.
movie_of_row = (
    ds_raw.loc[is_header, 'User']
    .str[:-1]
    .astype('int64')
    .reindex(ds_raw.index)
    .ffill()
)

# Keep rating rows only; rows that precede the first header have no movie and are dropped.
is_rating = ~is_header & movie_of_row.notna()

# Work on a small sample: only the first 10000 ratings are used by the rest of the notebook.
# Truncate first so the Movie column is built for the retained rows only.
ds = ds_raw.loc[is_rating].head(10000)
ds = ds.assign(Movie=movie_of_row.loc[ds.index].astype('int64'))

# Free the large intermediates; only `ds` is needed from here on.
del ds_raw, is_header, movie_of_row, is_rating

In [3]:
# Preview the first ten parsed rating rows.
ds.iloc[:10]

Unnamed: 0,User,Rating,Date,Movie
1,1488844,3.0,2005-09-06,1
2,822109,5.0,2005-05-13,1
3,885013,4.0,2005-10-19,1
4,30878,4.0,2005-12-26,1
5,823519,3.0,2004-05-03,1
6,893988,3.0,2005-11-17,1
7,124105,4.0,2004-08-05,1
8,1248029,3.0,2004-04-22,1
9,1842128,4.0,2004-05-09,1
10,2238063,3.0,2005-05-11,1


In [4]:
# Sanity check of the sample size: (rows, columns) of the truncated ratings frame.
ds.shape

(10000, 4)

In [5]:
# Distinct movie ids present in the sample, in order of first appearance.
pd.unique(ds['Movie'])

array([1, 2, 3, 4, 5, 6, 7, 8])

In [6]:
# Distinct user ids in order of first appearance; position in this array = row in the rating matrix.
users = pd.unique(ds['User'])

In [7]:
# User x movie rating matrix, 0 meaning "not rated".
# Movie ids are used directly as column indices, so the width is (largest id + 1)
# rather than a hard-coded 9; the column for id 0 stays empty when ids start at 1.
rate_matrix = np.zeros((len(users), int(ds['Movie'].max()) + 1))

In [12]:
# Fill the user x movie matrix in a single pass over the ratings.
# Each user id is mapped to its row once, instead of re-filtering `ds` for every user
# and rebuilding Python lists inside the inner loop.
user_row = {user_id: row for row, user_id in enumerate(users)}
for user_id, movie_id, rating in zip(ds['User'], ds['Movie'], ds['Rating']):
    # Movie ids double as column indices; a later rating for the same pair overwrites an earlier one.
    rate_matrix[user_row[user_id], movie_id] = rating
print(rate_matrix[:5])

[[0. 3. 0. 0. 0. 0. 0. 0. 4.]
 [0. 5. 0. 0. 0. 0. 0. 0. 0.]
 [0. 4. 0. 0. 0. 5. 0. 0. 0.]
 [0. 4. 0. 0. 0. 1. 0. 0. 0.]
 [0. 3. 0. 0. 0. 0. 0. 0. 1.]]


In [51]:
from scipy.linalg import svd 
# Reduced SVD: rate_matrix = U @ diag(s) @ V, where V is returned already transposed
# (its rows are the right singular vectors, ordered by decreasing singular value).
# full_matrices=False avoids allocating an (n_users x n_users) U; V keeps the same
# shape as long as there are at least as many users as matrix columns.
U, s, V = svd(rate_matrix, full_matrices=False)

In [52]:
# Take the third row of the rating matrix (index 2) as the user to recommend for,
# then show the announcement and the rating vector on separate lines.
user = rate_matrix[2]
print("Выберем 3-го пользователя:", user, sep="\n")

Выберем 3-го пользователя:
[0. 4. 0. 0. 0. 5. 0. 0. 0.]


In [53]:
# Rank-5 approximation: keep the 5 leading singular directions.
# V comes from svd already transposed, so its leading components are its first ROWS,
# while the matching components of U are its first COLUMNS (U[:5] would keep 5 users instead).
U = U[:, :5]  # not used below; truncated along the component axis for consistency with V
V = V[:5]

# Project the user's ratings onto the 5 components, then map back to movie space
# to obtain a reconstructed score for every movie column.
lowdim = np.dot(user, V.transpose())
inversed_transformation = np.dot(lowdim, V)
print("Вектор оценок пользователя 3:", inversed_transformation)

# Recommend the unrated movie (rating 0) with the highest positive reconstructed score;
# ties resolve to the lowest index. Index 0 is the fallback when nothing qualifies.
maximum = 0
maximum_i = 0
candidates = np.flatnonzero((user == 0) & (inversed_transformation > 0))
if candidates.size:
    maximum_i = int(candidates[np.argmax(inversed_transformation[candidates])])
    maximum = inversed_transformation[maximum_i]
print("Индекс рекомендуемого фильма для пользователя 3:", maximum_i)

Вектор оценок пользователя 3: [ 0.00000000e+00  3.99819293e+00  5.88965393e-02 -3.28617682e-04
  5.61300975e-02  4.99930255e+00 -8.60336160e-04  6.39652803e-02
 -6.93954789e-05]
Индекс рекомендуемого фильма для пользователя 3: 7


In [54]:
# Recompute the decomposition (U and V may have been truncated by an earlier cell) and
# keep 7 components this time. The reduced form avoids an (n_users x n_users) U.
U, s, V = svd(rate_matrix, full_matrices=False)
# V is returned already transposed: leading components are the first ROWS of V
# and the first COLUMNS of U (U[:7] would keep 7 users instead).
U = U[:, :7]  # not used below; truncated along the component axis for consistency with V
V = V[:7]

# A user that is not in the matrix, given as one rating per matrix column (0 = not rated).
new_user = np.array((0,0,3,4,0,5,0,0,1))
print("Новый пользователь:", new_user)

# Project onto the 7 components, then map back to movie space.
lowdim = np.dot(new_user, V.transpose())
print("Представление new_user в сниженной размерности:", lowdim)
inversed_transformation = np.dot(lowdim, V)
print("Вектор оценок new_user:", inversed_transformation)

# Recommend the unrated movie with the highest positive reconstructed score; ties resolve
# to the lowest index. maximum_i is initialised here so that the result never depends on a
# value left over from a previous cell; index 0 is the fallback when nothing qualifies.
maximum = 0
maximum_i = 0
candidates = np.flatnonzero((new_user == 0) & (inversed_transformation > 0))
if candidates.size:
    maximum_i = int(candidates[np.argmax(inversed_transformation[candidates])])
    maximum = inversed_transformation[maximum_i]
print("Индекс рекомендуемого фильма для нового пользователя:", maximum_i)

Новый пользователь: [0 0 3 4 0 5 0 0 1]
Представление new_user в сниженной размерности: [ 1.13185339 -4.05157529  4.94033693 -0.20867621  0.13585678  2.96774209
  0.13512097]
Вектор оценок new_user: [ 0.00000000e+00 -9.06081762e-04  2.99792225e+00  3.99985723e+00
 -2.68301720e-03  4.99956871e+00 -3.21335400e-04  9.46061448e-02
  9.99997822e-01]
Индекс рекомендуемого фильма для нового пользователя: 7
