In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook

In [2]:
# Read the four CSV tables; paths are relative to the working directory.
links, movies, ratings, tags = map(
    pd.read_csv,
    ('links.csv', 'movies.csv', 'ratings.csv', 'tags.csv'),
)

In [3]:
# Preview the first five rows of the links table.
links.head(n=5)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [4]:
# Preview the first five rows of the movies table.
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Preview the first five rows of the ratings table.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
# Preview the first five rows of the tags table.
tags.head(n=5)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [7]:
# Attach each movie's title and genres to its rating rows.
# Left join on movieId, so every rating row is kept.
joined_ratings = ratings.merge(movies, how='left', on='movieId')

In [8]:
# Preview the first five rows of the ratings joined with movie metadata.
joined_ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [17]:
# Keep only the ratings of movies whose genre string contains 'Comedy'.
# Vectorised literal substring match (regex=False); na=False treats a missing
# genre value as "no match" instead of raising TypeError the way
# `'Comedy' in x` does on NaN.
comedy_ratings = joined_ratings[
    joined_ratings.genres.str.contains('Comedy', regex=False, na=False)
]

In [18]:
# Preview the first five comedy rating rows.
comedy_ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
5,1,70,3.0,964982400,From Dusk Till Dawn (1996),Action|Comedy|Horror|Thriller
6,1,101,5.0,964980868,Bottle Rocket (1996),Adventure|Comedy|Crime|Romance
9,1,157,5.0,964984100,Canadian Bacon (1995),Comedy|War


In [19]:
# Number of distinct users who rated each comedy title.
# One grouped aggregation replaces the per-group Python loop (and the
# deprecated tqdm_notebook wrapper); the result is still a plain
# {title: count} dict, with keys in sorted title order as before.
title_num_ratings = comedy_ratings.groupby('title')['userId'].nunique().to_dict()

HBox(children=(IntProgress(value=0, max=3752), HTML(value='')))




In [20]:
# Simple summary statistics of the per-title rating counts.
# The counts are materialised once and reused for every statistic.
num_ratings_per_title = list(title_num_ratings.values())
min_num_ratings = np.min(num_ratings_per_title)
max_num_ratings = np.max(num_ratings_per_title)
mean_num_ratings = np.mean(num_ratings_per_title)
median_num_ratings = np.median(num_ratings_per_title)

In [21]:
# Show min, max, mean and median of the per-title rating counts, one per line.
print(min_num_ratings, max_num_ratings, mean_num_ratings, median_num_ratings, sep='\n')

1
329
10.408315565031982
3.0


In [22]:
# Mean rating of each comedy title.
# One grouped aggregation replaces the per-group Python loop (and the
# deprecated tqdm_notebook wrapper); the result is still a plain
# {title: mean_rating} dict keyed by title.
title_mean_rating = comedy_ratings.groupby('title')['rating'].mean().to_dict()

HBox(children=(IntProgress(value=0, max=3752), HTML(value='')))




In [23]:
# Compute our score for every title in the dataset:
#     mean_rating * (num_ratings - mean_num_ratings) / (max_num_ratings - min_num_ratings)
# NOTE(review): the count term is negative for titles rated less often than
# average, so among those a higher (positive) mean rating yields a lower
# score; the denominator is zero if every title has the same count.
# Confirm both are intended.
film_with_our_mark = [
    (
        title,
        title_mean_rating[title] * (num_ratings - mean_num_ratings) / (max_num_ratings - min_num_ratings),
    )
    for title, num_ratings in title_num_ratings.items()
]

In [24]:
# Show the top 20 comedies by score, best first.
sorted(film_with_our_mark, key=lambda scored: scored[1], reverse=True)[:20]

[('Forrest Gump (1994)', 4.044690189004988),
 ('Pulp Fiction (1994)', 3.7951694744027207),
 ('Toy Story (1995)', 2.44570036838738),
 ('Fargo (1996)', 2.1407290948667828),
 ('Aladdin (1992)', 1.995512278386442),
 ('Back to the Future (1985)', 1.9770460366271827),
 ('Shrek (2001)', 1.8818424052365759),
 ('True Lies (1994)', 1.7868906474508017),
 ('Princess Bride, The (1987)', 1.6980119019541347),
 ('Men in Black (a.k.a. MIB) (1997)', 1.6438934662291962),
 ('Pirates of the Caribbean: The Curse of the Black Pearl (2003)',
  1.5965607778868676),
 ('Groundhog Day (1993)', 1.5943567717320903),
 ('Monty Python and the Holy Grail (1975)', 1.5935458525419635),
 ('Finding Nemo (2003)', 1.5770510239778939),
 ('Monsters, Inc. (2001)', 1.4350829348269738),
 ('Mask, The (1994)', 1.4233307871967535),
 ("Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)", 1.397739471198017),
 ('Ace Ventura: Pet Detective (1994)', 1.3958989079479782),
 ('Mrs. Doubtfire (1993)', 1.380266387285408),
 ('Incredibles, The

In [25]:
# Keep only the ratings of movies whose genre string contains 'Action'.
# Vectorised literal substring match (regex=False); na=False treats a missing
# genre value as "no match" instead of raising TypeError the way
# `'Action' in x` does on NaN.
action_ratings = joined_ratings[
    joined_ratings.genres.str.contains('Action', regex=False, na=False)
]

In [26]:
# Preview the first five action rating rows.
action_ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
5,1,70,3.0,964982400,From Dusk Till Dawn (1996),Action|Comedy|Horror|Thriller
7,1,110,4.0,964982176,Braveheart (1995),Action|Drama|War
8,1,151,5.0,964984041,Rob Roy (1995),Action|Drama|Romance|War
10,1,163,5.0,964983650,Desperado (1995),Action|Romance|Western


In [27]:
# Number of distinct users who rated each action title.
# One grouped aggregation replaces the per-group Python loop (and the
# deprecated tqdm_notebook wrapper); the result is still a plain
# {title: count} dict, with keys in sorted title order as before.
title_num_ratings = action_ratings.groupby('title')['userId'].nunique().to_dict()

HBox(children=(IntProgress(value=0, max=1827), HTML(value='')))




In [28]:
# Simple summary statistics of the per-title rating counts.
# The counts are materialised once and reused for every statistic.
num_ratings_per_title = list(title_num_ratings.values())
min_num_ratings = np.min(num_ratings_per_title)
max_num_ratings = np.max(num_ratings_per_title)
mean_num_ratings = np.mean(num_ratings_per_title)
median_num_ratings = np.median(num_ratings_per_title)

In [29]:
# Show min, max, mean and median of the per-title rating counts, one per line.
print(min_num_ratings, max_num_ratings, mean_num_ratings, median_num_ratings, sep='\n')

1
278
16.766830870279147
5.0


In [30]:
# Mean rating of each action title.
# One grouped aggregation replaces the per-group Python loop (and the
# deprecated tqdm_notebook wrapper); the result is still a plain
# {title: mean_rating} dict keyed by title.
title_mean_rating = action_ratings.groupby('title')['rating'].mean().to_dict()

HBox(children=(IntProgress(value=0, max=1827), HTML(value='')))




In [31]:
# Compute our score for every title in the dataset:
#     mean_rating * (num_ratings - mean_num_ratings) / (max_num_ratings - min_num_ratings)
# NOTE(review): the count term is negative for titles rated less often than
# average, so among those a higher (positive) mean rating yields a lower
# score; the denominator is zero if every title has the same count.
# Confirm both are intended.
film_with_our_mark = [
    (
        title,
        title_mean_rating[title] * (num_ratings - mean_num_ratings) / (max_num_ratings - min_num_ratings),
    )
    for title, num_ratings in title_num_ratings.items()
]

In [32]:
# Show the top 20 action movies by score, best first.
sorted(film_with_our_mark, key=lambda scored: scored[1], reverse=True)[:20]

[('Matrix, The (1999)', 3.9538121525684975),
 ('Star Wars: Episode IV - A New Hope (1977)', 3.5778276873123183),
 ('Braveheart (1995)', 3.205422673665224),
 ('Fight Club (1999)', 3.1041747597843043),
 ('Jurassic Park (1993)', 2.9950338781099397),
 ('Terminator 2: Judgment Day (1991)', 2.9708274874433775),
 ('Star Wars: Episode V - The Empire Strikes Back (1980)', 2.9560183403919225),
 ('Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',
  2.78322584517437),
 ('Star Wars: Episode VI - Return of the Jedi (1983)', 2.6773392058535994),
 ('Saving Private Ryan (1998)', 2.5631049876453145),
 ('Lord of the Rings: The Return of the King, The (2003)', 2.5015840545779544),
 ('Independence Day (a.k.a. ID4) (1996)', 2.304076307579185),
 ('Gladiator (2000)', 2.178585829950056),
 ('Batman (1989)', 2.1318184936118105),
 ('True Lies (1994)', 2.035607183370203),
 ('Dark Knight, The (2008)', 2.023241496993645),
 ('Speed (1994)', 1.9650752120629666),
 ('Princess Bride, The (1