In [1]:
import pandas as pd
import matplotlib.pyplot as plt

Постройте топ фильмов в категориях Action и Comedy

In [68]:
# Load the three tables used below: the movie catalogue, user ratings and user tags.
movies, ratings, tags = (
    pd.read_csv(csv_name)
    for csv_name in ('movies.csv', 'ratings.csv', 'tags.csv')
)

In [69]:
movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [70]:
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247


In [71]:
tags.head(2)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996


In [72]:
# Distinct movieId values found in each table, kept as sets for the size and
# subset comparisons that follow.
rating_set = {*ratings['movieId'].unique()}
movie_set = {*movies['movieId'].unique()}
tags_set = {*tags['movieId'].unique()}

In [73]:
len(rating_set), len(movie_set), len(tags_set)

(9724, 9742, 1572)

In [74]:
rating_set.issubset(movie_set)

True

In [75]:
tags_set.issubset(movie_set)

True

Почти у всех фильмов из каталога есть хотя бы одна оценка (9724 из 9742), без оценок остаются только 18 фильмов. Фильмов, которым проставили тэги, гораздо меньше: 1572 из 9742.

In [76]:
def normed_user_rate_func(x):
    """
    Min-max scale one user's ratings to the [0, 1] range.

    Parameters
    ----------
    x : pandas.Series
        Ratings belonging to a single group (here: one user).

    Returns
    -------
    pandas.Series
        ``(x - x.min()) / (x.max() - x.min())``, aligned with ``x.index``.
        When all ratings are identical (including a single rating) the range
        is zero, so every rating is mapped to 1.0 instead of dividing by zero.
    """
    lowest, highest = x.min(), x.max()
    if highest != lowest:
        return (x - lowest) / (highest - lowest)
    # Degenerate range: return a Series rather than a bare scalar so both
    # branches yield the same type, dtype and index.
    return pd.Series(1.0, index=x.index)
    
# Add every rating rescaled within the rating range of the user who gave it.
ratings['normed_user_rating'] = (
    ratings
    .groupby('userId', sort=False)['rating']
    .transform(normed_user_rate_func)
)

In [77]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,normed_user_rating
0,1,1,4.0,964982703,0.75
1,1,3,4.0,964981247,0.75
2,1,6,4.0,964982224,0.75
3,1,47,5.0,964983815,1.0
4,1,50,5.0,964982931,1.0


In [78]:
# Number of ratings each movie received.
movies_rating_count = (
    ratings.groupby(by='movieId', sort=False)['rating'].count()
)
# Per-movie mean of the user-normalised ratings.
movies_mean_rating = (
    ratings.groupby(by='movieId', sort=False)['normed_user_rating'].mean()
)

In [79]:
# Summary statistics of the per-movie rating counts:
# mean and median first, then the extremes, then the standard deviation.
mean_num_rate, median_num_rate = (
    movies_rating_count.mean(),
    movies_rating_count.median(),
)
min_num_rate, max_num_rate = (
    movies_rating_count.min(),
    movies_rating_count.max(),
)
std_num_rate = movies_rating_count.std()

In [80]:
mean_num_rate, median_num_rate, min_num_rate, max_num_rate, std_num_rate

(10.369806663924312, 3.0, 1, 329, 22.401004809608246)

In [81]:
# Popularity-weighted movie score: the mean user-normalised rating multiplied
# by the z-score of the movie's rating count.
# NOTE(review): the z-score is negative for movies with fewer ratings than the
# average, so among those a higher mean rating gives a lower score — confirm
# this is the intended ranking.
# `.to_frame(name)` names the column explicitly; `pd.DataFrame(series,
# columns=[...])` only does so when the Series happens to be unnamed.
movies_mean_normed_rating = (
    movies_mean_rating
    .mul(movies_rating_count - mean_num_rate)
    .div(std_num_rate)
    .to_frame('movie_normed_mean_rating')
)

In [82]:
movies_mean_normed_rating.head()

Unnamed: 0_level_0,movie_normed_mean_rating
movieId,Unnamed: 1_level_1
1,6.584792
3,0.977352
6,2.961041
47,6.282596
50,6.902174


In [83]:
# Attach the popularity-weighted score to the movie table.
# The result is assigned back to `movies`, so first drop a score column left
# over from an earlier run of this cell; otherwise re-running it would create
# `movie_normed_mean_rating_x` / `_y` columns and break the cells below.
# The default inner join keeps only movies that have at least one rating.
movies = pd.merge(
    movies.drop(columns='movie_normed_mean_rating', errors='ignore'),
    movies_mean_normed_rating,
    left_on='movieId',
    right_index=True,
)

In [84]:
movies.head()

Unnamed: 0,movieId,title,genres,movie_normed_mean_rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6.584792
1,2,Jumanji (1995),Adventure|Children|Fantasy,2.674611
2,3,Grumpier Old Men (1995),Comedy|Romance,0.977352
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,-0.042383
4,5,Father of the Bride Part II (1995),Comedy,0.748352


In [85]:
# Top 20 movies regardless of genre, ranked by the popularity-weighted score.
movies.sort_values(by='movie_normed_mean_rating', ascending=False).iloc[:20]

Unnamed: 0,movieId,title,genres,movie_normed_mean_rating
277,318,"Shawshank Redemption, The (1994)",Crime|Drama,11.620828
314,356,Forrest Gump (1994),Comedy|Drama|Romance|War,11.159282
257,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,10.37557
1939,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,9.50525
510,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,9.455111
224,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,8.634377
2226,2959,Fight Club (1999),Action|Crime|Drama|Thriller,7.568651
97,110,Braveheart (1995),Action|Drama|War,7.554489
461,527,Schindler's List (1993),Drama|War,7.489698
898,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,7.155559


In [98]:
# Number of tags attached to each movie, as a one-column frame indexed by movieId.
movies_tag_count = (
    tags.groupby('movieId')['tag']
    .count()
    .to_frame('num_of_tags')
)

In [99]:
movies_tag_count.head()

Unnamed: 0_level_0,num_of_tags
movieId,Unnamed: 1_level_1
1,3
2,4
3,2
5,2
7,1


In [100]:
# Summary statistics of the per-movie tag counts.
# The column is selected by name instead of taking element `[0]` of the
# frame-wide reduction: an integer key on a label-indexed Series works only
# through a deprecated positional fallback and depends on column order.
mean_tags_num = movies_tag_count['num_of_tags'].mean()
median_tags_num = movies_tag_count['num_of_tags'].median()
min_tags_num = movies_tag_count['num_of_tags'].min()
max_tags_num = movies_tag_count['num_of_tags'].max()
std_tags_num = movies_tag_count['num_of_tags'].std()

In [101]:
mean_tags_num, median_tags_num, min_tags_num, max_tags_num, std_tags_num

(2.3428753180661577, 1.0, 1, 181, 5.562342135655902)

In [102]:
# Put each movie's mean raw rating next to its tag count.
# `movies_tag_count` is reassigned here, so first drop a `rating` column left
# by an earlier run of this cell; otherwise re-running it would produce
# `rating_x` / `rating_y`. The inner join on the index keeps only tagged
# movies that also have ratings.
movies_tag_count = pd.merge(
    movies_tag_count.drop(columns='rating', errors='ignore'),
    ratings.groupby('movieId', sort=False)['rating'].mean().to_frame(),
    left_index=True,
    right_index=True,
)

In [103]:
movies_tag_count.head()

Unnamed: 0_level_0,num_of_tags,rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3,3.92093
2,4,3.431818
3,2,3.259615
5,2,3.071429
7,1,3.185185


In [104]:
movies_tag_count.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1554 entries, 1 to 193565
Data columns (total 2 columns):
num_of_tags    1554 non-null int64
rating         1554 non-null float64
dtypes: float64(1), int64(1)
memory usage: 36.4 KB


In [111]:
# Tag-weighted movie score: the mean rating multiplied by the z-score of the
# movie's tag count.
movies_tag_count['normed_by_tags_movie_rating'] = (
    movies_tag_count['rating']
    .mul(movies_tag_count['num_of_tags'] - mean_tags_num)
    .div(std_tags_num)
)

In [106]:
movies_tag_count.head()

Unnamed: 0_level_0,num_of_tags,rating,normed_by_tags_movie_rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3,3.92093,0.463211
2,4,3.431818,1.022402
3,2,3.259615,-0.20093
5,2,3.071429,-0.18933
7,1,3.185185,-0.768976


In [107]:
# Join the movie table with the per-movie tag statistics on movieId
# (default inner join, so only movies present in both remain).
movies_tags = movies.merge(
    movies_tag_count,
    left_on='movieId',
    right_index=True,
)

In [108]:
movies_tags.head()

Unnamed: 0,movieId,title,genres,movie_normed_mean_rating,num_of_tags,rating,normed_by_tags_movie_rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6.584792,3,3.92093,0.463211
1,2,Jumanji (1995),Adventure|Children|Fantasy,2.674611,4,3.431818,1.022402
2,3,Grumpier Old Men (1995),Comedy|Romance,0.977352,2,3.259615,-0.20093
4,5,Father of the Bride Part II (1995),Comedy,0.748352,2,3.071429,-0.18933
6,7,Sabrina (1995),Comedy|Romance,0.947953,1,3.185185,-0.768976


In [112]:
# Top 20 movies regardless of genre, ranked by the tag-weighted score.
movies_tags.sort_values(by='normed_by_tags_movie_rating', ascending=False).iloc[:20]

Unnamed: 0,movieId,title,genres,movie_normed_mean_rating,num_of_tags,rating,normed_by_tags_movie_rating
257,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,10.37557,181,4.197068,134.805834
2226,2959,Fight Club (1999),Action|Crime|Drama|Thriller,7.568651,54,4.272936,39.682488
706,924,2001: A Space Odyssey (1968),Adventure|Drama|Sci-Fi,3.230359,41,3.894495,27.065936
4909,7361,Eternal Sunshine of the Spotless Mind (2004),Drama|Romance|Sci-Fi,4.28152,34,4.160305,23.677671
254,293,Léon: The Professional (a.k.a. The Professiona...,Action|Crime|Drama|Thriller,4.061539,35,4.018797,23.594801
1298,1732,"Big Lebowski, The (1998)",Comedy|Crime,3.145089,32,3.924528,20.924679
3562,4878,Donnie Darko (2001),Drama|Mystery|Sci-Fi|Thriller,3.313038,29,3.981651,19.081778
224,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,8.634377,26,4.231076,17.995133
7372,79132,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX,4.453425,26,4.066434,17.294895
3087,4144,In the Mood For Love (Fa yeung nin wa) (2000),Drama|Romance,0.133599,18,4.214286,11.862556


При использовании оценки, нормированной по количеству тэгов, в топ попадают не только популярные фильмы, но и артхаусное кино, которое понравится далеко не всем (например, «Голова-ластик», «Донни Дарко», «2001: Космическая одиссея» и др.).

Теперь посмотрим на топ фильмов по жанрам экшн и комедия.

In [113]:
def _filter_by_genre(frame, genre):
    """Return the rows of ``frame`` whose ``genres`` string mentions ``genre``.

    The match is a case-insensitive, literal substring test on the ``genres``
    column. Rows with a missing ``genres`` value count as non-matching, so the
    boolean mask never contains NaN.
    """
    mentions_genre = (
        frame['genres']
        .str.lower()
        .str.contains(genre.lower(), regex=False, na=False)
    )
    return frame[mentions_genre]


# Genre subsets of the table that carries the rating-count-weighted score.
actions = _filter_by_genre(movies, 'action')
comedies = _filter_by_genre(movies, 'comedy')

# Genre subsets of the table that also carries the tag-weighted score.
actions_tags = _filter_by_genre(movies_tags, 'action')
comedies_tags = _filter_by_genre(movies_tags, 'comedy')

### Топ 20 экшн фильмов согласно нормированному рейтингу

In [115]:
# Twenty highest-scoring Action movies by the popularity-weighted score.
actions.sort_values(by='movie_normed_mean_rating', ascending=False).iloc[:20]

Unnamed: 0,movieId,title,genres,movie_normed_mean_rating
1939,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,9.50525
224,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,8.634377
2226,2959,Fight Club (1999),Action|Crime|Drama|Thriller,7.568651
97,110,Braveheart (1995),Action|Drama|War,7.554489
898,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,7.155559
507,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,6.95611
418,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,6.860897
900,1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,6.803965
911,1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi,6.494378
1503,2028,Saving Private Ryan (1998),Action|Drama|War,6.176242


### Топ 20 комедийных фильмов согласно нормированному рейтингу

In [116]:
# Twenty highest-scoring Comedy movies by the popularity-weighted score.
comedies.sort_values(by='movie_normed_mean_rating', ascending=False).iloc[:20]

Unnamed: 0,movieId,title,genres,movie_normed_mean_rating
314,356,Forrest Gump (1994),Comedy|Drama|Romance|War,11.159282
257,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,10.37557
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6.584792
520,608,Fargo (1996),Comedy|Crime|Drama|Thriller,5.908061
969,1270,Back to the Future (1985),Adventure|Comedy|Sci-Fi,5.466645
506,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,5.269003
3194,4306,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...,5.169215
899,1197,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance,4.74926
337,380,True Lies (1994),Action|Adventure|Comedy|Romance|Thriller,4.464202
863,1136,Monty Python and the Holy Grail (1975),Adventure|Comedy|Fantasy,4.445469


### Топ 20 экшн фильмов согласно нормированному по количеству тэгов рейтингу

In [117]:
# Twenty highest-scoring Action movies by the tag-weighted score.
actions_tags.sort_values(by='normed_by_tags_movie_rating', ascending=False).iloc[:20]

Unnamed: 0,movieId,title,genres,movie_normed_mean_rating,num_of_tags,rating,normed_by_tags_movie_rating
2226,2959,Fight Club (1999),Action|Crime|Drama|Thriller,7.568651,54,4.272936,39.682488
254,293,Léon: The Professional (a.k.a. The Professiona...,Action|Crime|Drama|Thriller,4.061539,35,4.018797,23.594801
224,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,8.634377,26,4.231076,17.995133
7372,79132,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX,4.453425,26,4.066434,17.294895
7212,72998,Avatar (2009),Action|Adventure|Sci-Fi|IMAX,2.523566,18,3.603093,10.142144
8693,122912,Avengers: Infinity War - Part I (2018),Action|Adventure|Sci-Fi,0.087383,15,4.0,9.102011
8915,135536,Suicide Squad (2016),Action|Crime|Sci-Fi,0.038998,19,2.916667,8.734321
474,541,Blade Runner (1982),Action|Sci-Fi|Thriller,3.962231,13,4.100806,7.856907
8063,99114,Django Unchained (2012),Action|Drama|Western,1.962576,12,3.943662,6.846834
898,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,7.155559,10,4.21564,5.803253


### Топ 20 комедийных фильмов согласно нормированному по количеству тэгов рейтингу

In [118]:
# Twenty highest-scoring Comedy movies by the tag-weighted score.
comedies_tags.sort_values(by='normed_by_tags_movie_rating', ascending=False).iloc[:20]

Unnamed: 0,movieId,title,genres,movie_normed_mean_rating,num_of_tags,rating,normed_by_tags_movie_rating
257,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,10.37557,181,4.197068,134.805834
1298,1732,"Big Lebowski, The (1998)",Comedy|Crime,3.145089,32,3.924528,20.924679
7166,71899,Mary and Max (2009),Animation|Comedy|Drama,-0.01323,13,4.2,8.046956
4015,5673,Punch-Drunk Love (2002),Comedy|Drama|Romance,0.681031,13,3.621212,6.938032
602,750,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War,3.169942,10,4.268041,5.875389
6836,61323,Burn After Reading (2008),Comedy|Crime|Drama,0.831634,11,3.487179,5.427381
314,356,Forrest Gump (1994),Comedy|Drama|Romance|War,11.159282,9,4.164134,4.98372
6676,57669,In Bruges (2008),Comedy|Crime|Drama|Thriller,1.076139,8,4.158537,4.229398
1730,2324,Life Is Beautiful (La Vita è bella) (1997),Comedy|Drama|Romance|War,2.753583,8,4.147727,4.218405
2028,2700,"South Park: Bigger, Longer and Uncut (1999)",Animation|Comedy|Musical,2.145329,8,3.861842,3.927648
