# Домашнее задание "Введение и классификация рекомендательных систем"

Владимир Никифоров

Постройте топ фильмов в категориях Action и Comedy (или любых других).

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook

In [2]:
# Load the movie catalogue and the user ratings from the ml-latest-small CSV files.
movies = pd.read_csv('../data/ml-latest-small/movies.csv')
ratings = pd.read_csv('../data/ml-latest-small/ratings.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# Attach each movie's title and genres to its ratings by looking the
# rating's movieId up in the movies table.
movies_by_id = movies.set_index('movieId')
joined_ratings = ratings.join(movies_by_id, on='movieId')

In [6]:
# Sanity check: a genre can be detected with a plain substring test on the
# pipe-separated genres string.
'Adventure' in 'Adventure|Animation|Children|Comedy|Fantasy'

True

In [7]:
# Flag every rating whose movie lists Action / Comedy among its genres.
# `genres` is a pipe-separated string, so a literal (non-regex) substring
# test is enough. na=False marks rows with a missing genres value as
# "not this genre" instead of raising on a non-string value.
joined_ratings['IsAction'] = joined_ratings['genres'].str.contains('Action', regex=False, na=False)
joined_ratings['IsComedy'] = joined_ratings['genres'].str.contains('Comedy', regex=False, na=False)

In [8]:
joined_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,IsAction,IsComedy
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,False,True
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,False,True
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,True,False
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,False,False
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,False,False


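### Идея: найти "боевых кинокритиков" (кто очень много смотрит, т.е. поставил больше всего оценок) и сортировать фильмы по следующей метрике, посчитанной только по рейтингам этих кинокритиков: средняя оценка фильма, умноженная на нормированное количество рейтингов, $score = \bar{r} \cdot \frac{n - \operatorname{mean}(n)}{\max(n) - \min(n)}$, где $\bar{r}$ — средняя оценка фильма, а $n$ — число кинокритиков, оценивших фильм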

In [9]:
# Pick the 50 users who have rated the most movies.
ratings_per_user = joined_ratings.groupby('userId')[['rating']].count()
ratings_per_user = ratings_per_user.sort_values('rating', ascending=False)
# After the groupby the index holds the userIds, already in ranking order.
top_users = ratings_per_user.index[:50].values

In [10]:
# These are our "film critic" power users: the userIds with the most ratings.
top_users

array([414, 599, 474, 448, 274, 610,  68, 380, 606, 288, 249, 387, 182,
       307, 603, 298, 177, 318, 232, 480, 608, 600, 483, 590, 105,  19,
       305, 489, 111, 438, 217, 140, 477, 555,  91,  28, 219, 534,  89,
        64, 226, 561,  18, 525,  57, 381, 368, 509, 469, 560])

In [11]:
# Keep only the ratings given by the selected power users to Action movies.
rated_by_top_user = joined_ratings['userId'].isin(top_users)
joined_ratings = joined_ratings[rated_by_top_user & joined_ratings['IsAction']]

In [12]:
# For every film title, count how many distinct users rated it.
title_num_ratings = {
    title: group.userId.unique().shape[0]
    for title, group in tqdm_notebook(joined_ratings.groupby('title'))
}

HBox(children=(IntProgress(value=0, max=1692), HTML(value='')))




In [13]:
# Basic statistics of the per-film rating counts: smallest, largest, average.
num_ratings_values = list(title_num_ratings.values())
min_num_ratings = np.min(num_ratings_values)
max_num_ratings = np.max(num_ratings_values)
mean_num_ratings = np.mean(num_ratings_values)

In [14]:
# Average rating of every film title.
title_mean_rating = {
    title: group.rating.mean()
    for title, group in tqdm_notebook(joined_ratings.groupby('title'))
}

HBox(children=(IntProgress(value=0, max=1692), HTML(value='')))




In [15]:
# Score every film: its mean rating multiplied by its rating count, where the
# count is centred on the average count and scaled by the max-min range.
count_range = max_num_ratings - min_num_ratings
film_with_our_mark = [
    (title, title_mean_rating[title] * (num_ratings - mean_num_ratings) / count_range)
    for title, num_ratings in title_num_ratings.items()
]

In [16]:
# Show the 20 best-scoring films; the result already looks quite reasonable.
sorted(film_with_our_mark, key=lambda scored_film: scored_film[1], reverse=True)[:20]

[('Matrix, The (1999)', 3.6045608033402488),
 ('Star Wars: Episode IV - A New Hope (1977)', 3.532927690592358),
 ('Star Wars: Episode V - The Empire Strikes Back (1980)', 3.4659937964676057),
 ('Star Wars: Episode VI - Return of the Jedi (1983)', 3.2774748817966906),
 ('Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',
  3.227950773246651),
 ('Fight Club (1999)', 2.9583071685382194),
 ('Jurassic Park (1993)', 2.878628732160056),
 ('Die Hard (1988)', 2.856822743167491),
 ('Kill Bill: Vol. 1 (2003)', 2.7965715334236676),
 ('Terminator 2: Judgment Day (1991)', 2.762570663716389),
 ('Lord of the Rings: The Return of the King, The (2003)', 2.6970220911377822),
 ('Saving Private Ryan (1998)', 2.6633050980361666),
 ('Terminator, The (1984)', 2.6520678371677335),
 ('Indiana Jones and the Last Crusade (1989)', 2.638208525287426),
 ('Ghostbusters (a.k.a. Ghost Busters) (1984)', 2.569501083530339),
 ('Pirates of the Caribbean: The Curse of the Black Pearl (2003)',
 

In [17]:
# Обернем все в функцию для удобного поиска топовых фильмов по топ Х киноманов любого жанра :)
def get_top_films(df,
                  genre='Action',
                  top_users=100,
                  top_films=20
                  ):
    """Rank the films of one genre using only the most active users' ratings.

    The score of a film is its mean rating multiplied by its normalised
    rating count::

        score = mean_rating * (n - mean(n)) / (max(n) - min(n))

    where ``n`` is the number of distinct selected users who rated the film
    and mean/max/min are taken over all films that passed the filter. Films
    rated by fewer selected users than average therefore get a negative
    weight.

    Args:
        df: ratings joined with movie data; the columns ``userId``,
            ``rating``, ``title`` and ``genres`` are read. The frame is
            not modified.
        genre: text looked up as a literal substring of ``genres``.
        top_users: how many users with the most ratings are treated as
            the trusted "critics".
        top_films: maximum number of films to return.

    Returns:
        A list of ``(title, score)`` tuples sorted by score, best first.
        The list is empty when no rating passes the filter. When every
        film has the same rating count the normalised count is undefined,
        so every score is 0.0.
    """
    # Literal substring test on the genres string. The mask stays local, so
    # the caller's frame does not receive a helper column, and rows with a
    # missing genres value simply do not match.
    is_genre = df['genres'].str.contains(genre, regex=False, na=False)

    # The `top_users` users with the largest number of ratings.
    ratings_per_user = df.groupby('userId')[['rating']].count().sort_values('rating', ascending=False)
    top_users_list = ratings_per_user.index[:top_users].values

    # Only the selected users' ratings of films of the requested genre.
    selected = df[df['userId'].isin(top_users_list) & is_genre]

    per_title = selected.groupby('title')
    # Number of distinct selected users that rated each film.
    num_ratings = per_title['userId'].nunique()
    if num_ratings.empty:
        return []
    mean_rating = per_title['rating'].mean()

    count_range = num_ratings.max() - num_ratings.min()
    if count_range == 0:
        # All films were rated equally often: avoid 0/0 and give every film
        # a zero weight.
        scores = mean_rating * 0.0
    else:
        scores = mean_rating * (num_ratings - num_ratings.mean()) / count_range

    film_with_our_mark = [(title, float(score)) for title, score in scores.items()]
    # sorted() is stable, so films with equal scores stay in title order.
    return sorted(film_with_our_mark, key=lambda scored_film: scored_film[1], reverse=True)[:top_films]

In [18]:
# Rebuild the full ratings table with each movie's title and genres attached,
# looked up by the rating's movieId.
movies_by_id = movies.set_index('movieId')
joined_ratings = ratings.join(movies_by_id, on='movieId')

In [19]:
# Top 20 Action films as ranked by the 50 most active users.
get_top_films(joined_ratings, genre='Action', top_users=50, top_films=20)

HBox(children=(IntProgress(value=0, max=1692), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1692), HTML(value='')))




[('Matrix, The (1999)', 3.6045608033402488),
 ('Star Wars: Episode IV - A New Hope (1977)', 3.532927690592358),
 ('Star Wars: Episode V - The Empire Strikes Back (1980)', 3.4659937964676057),
 ('Star Wars: Episode VI - Return of the Jedi (1983)', 3.2774748817966906),
 ('Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',
  3.227950773246651),
 ('Fight Club (1999)', 2.9583071685382194),
 ('Jurassic Park (1993)', 2.878628732160056),
 ('Die Hard (1988)', 2.856822743167491),
 ('Kill Bill: Vol. 1 (2003)', 2.7965715334236676),
 ('Terminator 2: Judgment Day (1991)', 2.762570663716389),
 ('Lord of the Rings: The Return of the King, The (2003)', 2.6970220911377822),
 ('Saving Private Ryan (1998)', 2.6633050980361666),
 ('Terminator, The (1984)', 2.6520678371677335),
 ('Indiana Jones and the Last Crusade (1989)', 2.638208525287426),
 ('Ghostbusters (a.k.a. Ghost Busters) (1984)', 2.569501083530339),
 ('Pirates of the Caribbean: The Curse of the Black Pearl (2003)',
 

In [20]:
# Top 20 Comedy films as ranked by the 100 most active users.
get_top_films(joined_ratings, genre='Comedy', top_users=100, top_films=20)

HBox(children=(IntProgress(value=0, max=3579), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3579), HTML(value='')))




[('Forrest Gump (1994)', 3.7831057251660605),
 ('Pulp Fiction (1994)', 3.6618024450292146),
 ('Back to the Future (1985)', 2.9482724292484126),
 ('Monty Python and the Holy Grail (1975)', 2.9089763045882226),
 ('Groundhog Day (1993)', 2.8724301308488047),
 ('Fargo (1996)', 2.8691194474477393),
 ('Toy Story (1995)', 2.7985254105010653),
 ('Truman Show, The (1998)', 2.6293613815502317),
 ('Men in Black (a.k.a. MIB) (1997)', 2.5871585243236908),
 ("Ferris Bueller's Day Off (1986)", 2.534442297060925),
 ('Pirates of the Caribbean: The Curse of the Black Pearl (2003)',
  2.4996737862024068),
 ('Princess Bride, The (1987)', 2.4916077774864833),
 ('Shrek (2001)', 2.485027260111377),
 ('Ghostbusters (a.k.a. Ghost Busters) (1984)', 2.3457718099168225),
 ('Breakfast Club, The (1985)', 2.2407240591499833),
 ('Big Lebowski, The (1998)', 2.215846783156855),
 ('Finding Nemo (2003)', 2.211822377238221),
 ('Fifth Element, The (1997)', 2.211811490644822),
 ('Monsters, Inc. (2001)', 2.211811490644822),
