In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook

In [2]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [44]:
# Функци по формированию рейтинга фильмов, с выбором жанра
# При параметре use_tag = True выводит рейтинг с учетом тэгов !!!

def score(genre, use_tag = False):

    movies_g = movies[movies['genres'].str.contains(genre, na=False)]
    movies_g.set_index('movieId')
    
    # соединим таблицу с рейтингами и названиями фильмов
    joined_ratings = movies_g.join(ratings.set_index('movieId'), on='movieId')
    if use_tag:
        joined_ratings = joined_ratings.join(tags.set_index('movieId'), on='movieId', lsuffix='_left', rsuffix='')
    
    # достанем по каждому фильму количество рейтингов
    title_num_ratings = {}

    for title, group in tqdm_notebook(joined_ratings.groupby('title')):
        title_num_ratings[title] = group.userId.unique().shape[0]
    
    # считаем средний рейтинг на каждый фильм
    title_mean_rating = {}

    for title, group in tqdm_notebook(joined_ratings.groupby('title')):
        title_mean_rating[title] = group.rating.mean()
    
    min_num_ratings = np.min([title_num_ratings[f] for f in title_num_ratings.keys()])
    max_num_ratings = np.max([title_num_ratings[f] for f in title_num_ratings.keys()])
    mean_num_ratings = np.mean([title_num_ratings[f] for f in title_num_ratings.keys()])
    median_num_ratings = np.median([title_num_ratings[f] for f in title_num_ratings.keys()])
    
    film_with_our_mark = {}

    # посчитаем нашу метрику для каждого фильма из датасета
    for f in title_num_ratings.keys():
        film_with_our_mark[f] = title_mean_rating[f] * (title_num_ratings[f] - min_num_ratings) / (max_num_ratings - min_num_ratings)
    
    film_with_our_mark_list = [(k,film_with_our_mark[k]) for k in film_with_our_mark.keys()]
    
    return list(sorted(film_with_our_mark_list, key=lambda x: x[1], reverse=True))[:20]

    

In [45]:
score('Comedy', True)

HBox(children=(IntProgress(value=0, max=3755), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3755), HTML(value='')))




[('Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)',
  4.268041237113402),
 ('Anchorman: The Legend of Ron Burgundy (2004)', 2.8289473684210527),
 ('Forrest Gump (1994)', 2.082066869300912),
 ('Toy Story (1995)', 1.9604651162790698),
 ('Step Brothers (2008)', 1.7767857142857142),
 ('Corpse Bride (2005)', 1.7670454545454546),
 ('Happy Gilmore (1996)', 1.7196969696969697),
 ('Monty Python and the Holy Grail (1975)', 1.0404411764705883),
 ('Life Is Beautiful (La Vita è bella) (1997)', 1.0369318181818181),
 ('Fargo (1996)', 1.0290055248618784),
 ('Kiss Kiss Bang Bang (2005)', 1.0178571428571428),
 ('Trainspotting (1996)', 1.0098039215686274),
 ('Lost in Translation (2003)', 1.008445945945946),
 ('Three Colors: White (Trzy kolory: Bialy) (1994)', 1.00625),
 ('This Is Spinal Tap (1984)', 1.003787878787879),
 ('Finding Nemo (2003)', 0.9902482269503546),
 ('Big Lebowski, The (1998)', 0.9811320754716981),
 ('Wolf of Wall Street, The (2013)', 0.9791666666666666),
 ('T

In [46]:
score('Action', True)

HBox(children=(IntProgress(value=0, max=1827), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1827), HTML(value='')))




[('Star Wars: Episode IV - A New Hope (1977)', 4.231075697211155),
 ('Fight Club (1999)', 1.4243119266055047),
 ('Blade Runner (1982)', 1.3669354838709675),
 ('Inception (2010)', 1.3554778554778555),
 ('Matrix, The (1999)', 0.9316546762589929),
 ('City of God (Cidade de Deus) (2002)', 0.9214814814814815),
 ('Léon: The Professional (a.k.a. The Professional) (Léon) (1994)',
  0.8930659983291562),
 ('Terminator 2: Judgment Day (1991)', 0.8824404761904762),
 ('Dark Knight, The (2008)', 0.47091722595078295),
 ('Star Wars: Episode V - The Empire Strikes Back (1980)',
  0.46840442338072674),
 ('Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',
  0.46749999999999997),
 ('North by Northwest (1959)', 0.4649122807017544),
 ('John Wick: Chapter Two (2017)', 0.46031746031746035),
 ('Lord of the Rings: The Return of the King, The (2003)',
  0.45765765765765765),
 ('Braveheart (1995)', 0.4479606188466948),
 ('Avengers: Infinity War - Part I (2018)', 0.4444444444444444),

In [47]:
score('Comedy')

HBox(children=(IntProgress(value=0, max=3755), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3755), HTML(value='')))




[('Forrest Gump (1994)', 4.164133738601824),
 ('Aladdin (1992)', 2.104291616686659),
 ('Back to the Future (1985)', 2.0928719155612607),
 ("Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)", 1.5177337398373985),
 ('Ace Ventura: Pet Detective (1994)', 1.4831086199060748),
 ('Breakfast Club, The (1985)', 1.2903086552989425),
 ('Big Lebowski, The (1998)', 1.2563276576161988),
 ('Clerks (1994)', 1.2108055816135084),
 ('Batman Forever (1995)', 1.2090973829446323),
 ('Being John Malkovich (1999)', 1.1815410199556542),
 ('Austin Powers: The Spy Who Shagged Me (1999)', 1.1701269905261036),
 ('As Good as It Gets (1997)', 1.07104293699187),
 ('Austin Powers: International Man of Mystery (1997)', 1.0669664634146343),
 ('American Pie (1999)', 1.0506748756807955),
 ("Bug's Life, A (1998)", 0.9755600477200425),
 ('Blues Brothers, The (1980)', 0.9639953542392565),
 ('Blazing Saddles (1974)', 0.731904012588513),
 ('Bruce Almighty (2003)', 0.7078753005839917),
 ("Bridget Jones's Diary (2001)", 0.70

In [48]:
score('Action')

HBox(children=(IntProgress(value=0, max=1827), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1827), HTML(value='')))




[('Matrix, The (1999)', 4.192446043165468),
 ('Star Wars: Episode IV - A New Hope (1977)', 3.8186603765443636),
 ('Braveheart (1995)', 3.434903806607869),
 ('Fight Club (1999)', 3.3473901235385686),
 ('Jurassic Park (1993)', 3.2084837545126352),
 ('Terminator 2: Judgment Day (1991)', 3.19685566013409),
 ('Star Wars: Episode V - The Empire Strikes Back (1980)', 3.195972419456944),
 ('Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',
  3.0227166064981947),
 ('Star Wars: Episode VI - Return of the Jedi (1983)', 2.912860089884329),
 ('Saving Private Ryan (1998)', 2.7991109148168056),
 ('Lord of the Rings: The Return of the King, The (2003)', 2.736032783686213),
 ('Independence Day (a.k.a. ID4) (1996)', 2.5001965900561176),
 ('Gladiator (2000)', 2.4027500530898283),
 ('Batman (1989)', 2.3269726663228467),
 ('Dark Knight, The (2008)', 2.264482833813873),
 ('True Lies (1994)', 2.2346671804648524),
 ('Speed (1994)', 2.1659594232271413),
 ('Princess Bride, The (19