# Подготовка к работе

In [1]:
# скачиваем данные

import requests
import zipfile
import io

url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%205/data/moviedataset.zip"

response = requests.get(url)

with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
    zip_ref.extractall(".")

In [2]:
# подключаем нужные нас библиотеки
# необходимый минимум

import pandas as pd
from math import sqrt
import numpy as np

In [3]:
# загружаем датасеты
# 'ml-latest'
# папка может измениться, проверяйте перед запуском

movies_df = pd.read_csv('ml-latest/movies.csv')
ratings_df = pd.read_csv('ml-latest/ratings.csv')

# Предпросмотр имеющихся данных

In [4]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


# Предобработка данных

In [6]:
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})\)')
movies_df['title'] = movies_df['title'].str.replace(r'\s*\(\d{4}\)', '', regex=True)
movies_df['title'] = movies_df['title'].str.strip()
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [7]:
# удаляем жанры для упрощения

movies_df = movies_df.drop('genres', axis=1)

In [8]:
movies_df.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


# Создаем входные данные пользователя

In [9]:
userInput = [
    {'title':'Breakfast Club, The', 'rating':5},
    {'title':'Toy Story', 'rating':3.5},
    {'title':'Jumanji', 'rating':2},
    {'title':"Pulp Fiction", 'rating':5},
    {'title':'Akira', 'rating':4.5}
] 

inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


# Теперь переходим к основной работе

In [10]:
# найдём id фильмов

inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
inputId

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
293,296,Pulp Fiction,1994
1246,1274,Akira,1988
1885,1968,"Breakfast Club, The",1985


In [11]:
# находим доп информацию о фильмах полученных от пользователя 

inputMovies = pd.merge(inputId, inputMovies)
inputMovies = inputMovies.drop('year', axis=1)
inputMovies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0


In [12]:
# находим пользователей, которые смотрели такие же фильмы

userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
userSubset.head()

Unnamed: 0,userId,movieId,rating,timestamp
19,4,296,4.0,1037741922
441,12,1968,3.0,953778911
479,13,2,2.0,974867742
531,13,1274,5.0,974870639
681,14,296,2.0,845469280


In [13]:
# группируем пользователей
userSubsetGroup = userSubset.groupby(['userId'])

# Посмотрим на одного пользователя
userSubsetGroup.get_group(1130)

  userSubsetGroup.get_group(1130)


Unnamed: 0,userId,movieId,rating,timestamp
104167,1130,1,0.5,1246793380
104168,1130,2,4.0,1293623928
104214,1130,296,4.0,1246788859
104363,1130,1274,4.5,1292762481
104443,1130,1968,4.5,1396902829


In [14]:
# вычисляем корреляцию Пирсона между пользователями
pearsonCorrelationDict = {}

for name, group in userSubsetGroup:
    # сортируем данные
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')
    
    # получаем количество оценок
    nRatings = len(group)
    
    # получаем рейтинги для общих фильмов
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    tempRatingList = temp_df['rating'].tolist()
    tempGroupList = group['rating'].tolist()
    
    # вычисляем корреляцию Пирсона
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum(i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    
    # проверяем деление на ноль
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [15]:
# преобразуем словарь в DataFrame

pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,0.0,"(4,)"
1,0.0,"(12,)"
2,1.0,"(13,)"
3,0.0,"(14,)"
4,-1.0,"(15,)"


In [16]:
# аолучаем топ пользователей с наибольшей корреляцией

topUsers = pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
topUsers.head()

Unnamed: 0,similarityIndex,userId
89860,1.0,"(191754,)"
79741,1.0,"(170013,)"
93744,1.0,"(199759,)"
100340,1.0,"(213732,)"
88420,1.0,"(188698,)"


In [17]:
# преобразуем кортежи в обычные значения
topUsers['userId'] = topUsers['userId'].apply(lambda x: x[0] if isinstance(x, tuple) else x)

# теперь преобразуем к целочисленному типу
topUsers['userId'] = topUsers['userId'].astype(int)
ratings_df['userId'] = ratings_df['userId'].astype(int)

# Теперь объединяем
topUsersRating = topUsers.merge(ratings_df, on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,timestamp
0,1.0,191754,1,3.5,1171513067
1,1.0,191754,6,3.5,1171512388
2,1.0,191754,10,3.0,1171513124
3,1.0,191754,16,4.0,1171512559
4,1.0,191754,47,4.0,1171512270


In [18]:
# умножаем рейтинг на коэффициент схожести

topUsersRating['weightedRating'] = topUsersRating['similarityIndex'] * topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,timestamp,weightedRating
0,1.0,191754,1,3.5,1171513067,3.5
1,1.0,191754,6,3.5,1171512388,3.5
2,1.0,191754,10,3.0,1171513124,3.0
3,1.0,191754,16,4.0,1171512559,4.0
4,1.0,191754,47,4.0,1171512270,4.0


In [19]:
# суммируем взвешенные рейтинги по movieId

tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex', 'weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex', 'sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,48.0,177.0
2,2.0,6.5
3,4.0,14.0
4,1.0,2.0
5,6.0,12.5


In [20]:
# создаем DataFrame рекомендаций

recommendation_df = pd.DataFrame()
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating'] / tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.6875,1
2,3.25,2
3,3.5,3
4,2.0,4
5,2.083333,5


In [21]:
# сортируем по убыванию рейтинга

recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
7767,5.0,7767
8254,5.0,8254
2024,5.0,2024
26326,5.0,26326
140816,5.0,140816
9010,5.0,9010
2166,5.0,2166
8772,5.0,8772
26736,5.0,26736
6477,5.0,6477


In [None]:
# кдалим дублирующий столбец movieId перед сбросом индекса
recommendation_df_fixed = recommendation_df.copy()
recommendation_df_fixed = recommendation_df_fixed.reset_index(drop=True)  # сбрасываем индекс без сохранения

# теперь объединяем
recommended_movies = recommendation_df_fixed.merge(movies_df, on='movieId', how='inner')
recommended_movies.head(10)

Unnamed: 0,weighted average recommendation score,movieId,title,year
0,5.0,7767,"Best of Youth, The (La meglio gioventù)",2003
1,5.0,8254,Arizona Dream,1993
2,5.0,2024,"Rapture, The",1991
3,5.0,26326,"Holy Mountain, The (Montaña sagrada, La)",1973
4,5.0,140816,Tangerine,2015
5,5.0,9010,Love Me If You Dare (Jeux d'enfants),2003
6,5.0,2166,Return to Paradise,1998
7,5.0,8772,"Spy Who Came in from the Cold, The",1965
8,5.0,26736,Riki-Oh: The Story of Ricky (Lik Wong),1991
9,5.0,6477,"Song of Bernadette, The",1943
