In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [12]:
# Read the Netflix catalogue from the local CSV and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='../data/netflix.csv')
data.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...


In [13]:
# Fraction of rows whose `cast` value is missing.
flags = data['cast'].isnull()
flags.sum() / data.shape[0]

0.09220495697958135

В данных об актерах около 9% пропусков, поэтому строки с пропусками проигнорируем.

In [14]:
# Drop only the rows that have no cast listed: the actor counts below read
# nothing but the `cast` column. Requiring `director` as well would also discard
# every title without a listed director (e.g. show s1 in the preview above),
# which is unrelated to the cast analysis and would skew the per-actor counts.
data = data.dropna(subset=['cast'])

Так как в фильмах и сериалах играют несколько актеров, они указаны вместе, через запятую. Разделим их для подсчета:

In [15]:
# Turn each comma-separated cast string into a list of trimmed actor names.
data['cast'] = (
    data['cast']
    .str.split(',')
    .map(lambda parts: [part.strip() for part in parts])
)

5 самых популярных актеров (в фильмах и сериалах):

In [16]:
def get_actors_count(dataframe):
    """Count how many rows of ``dataframe`` each actor appears in.

    Args:
        dataframe: A DataFrame whose ``cast`` column holds an iterable of
            actor names for every row.

    Returns:
        A Series indexed by actor name with the number of occurrences,
        ordered from most to least frequent (``value_counts`` default).
    """
    # Flatten the per-row cast lists into one long list of names.
    all_actors = []
    for movie_cast in dataframe['cast']:
        all_actors.extend(movie_cast)
    return pd.Series(all_actors).value_counts()

In [17]:
# Five most frequent cast members across all titles.
get_actors_count(data).head(5)

Anupam Kher         41
Shah Rukh Khan      35
Om Puri             30
Naseeruddin Shah    30
Akshay Kumar        29
dtype: int64

5 самых популярных актеров сериалов:

In [150]:
# Keep only the TV shows, then list their five most frequent cast members.
shows_only = data.loc[data['type'].eq('TV Show')]
get_actors_count(shows_only).head(5)

Takahiro Sakurai    22
Yuki Kaji           17
Ai Kayano           16
Daisuke Ono         16
Junichi Suwabe      15
dtype: int64

5 самых популярных актеров фильмов:

In [151]:
# Keep only the movies, then list their five most frequent cast members.
movies_only = data.loc[data['type'].eq('Movie')]
get_actors_count(movies_only).head(5)

Anupam Kher         41
Shah Rukh Khan      35
Naseeruddin Shah    30
Om Puri             30
Akshay Kumar        29
dtype: int64

Посмотрим самых популярных актеров в разных жанрах:

In [152]:
# Turn each comma-separated genre string into a list of trimmed genre names.
data['listed_in'] = (
    data['listed_in']
    .str.split(',')
    .map(lambda parts: [part.strip() for part in parts])
)

In [153]:
# One row per (title, genre) pair: each element of the genre list gets its own row.
unpacked_genres = data.explode(column='listed_in')

In [154]:
# Preview the first five exploded rows.
unpacked_genres.iloc[:5]

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"[João Miguel, Bianca Comparato, Michel Gomes, ...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,International TV Shows,In a future where the elite inhabit an island ...
0,s1,TV Show,3%,,"[João Miguel, Bianca Comparato, Michel Gomes, ...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,TV Dramas,In a future where the elite inhabit an island ...
0,s1,TV Show,3%,,"[João Miguel, Bianca Comparato, Michel Gomes, ...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,TV Sci-Fi & Fantasy,In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"[Demián Bichir, Héctor Bonilla, Oscar Serrano,...",Mexico,"December 23, 2016",2016,TV-MA,93 min,Dramas,After a devastating earthquake hits Mexico Cit...
1,s2,Movie,7:19,Jorge Michel Grau,"[Demián Bichir, Héctor Bonilla, Oscar Serrano,...",Mexico,"December 23, 2016",2016,TV-MA,93 min,International Movies,After a devastating earthquake hits Mexico Cit...


Все жанры, представленные в наших данных:

In [155]:
# Distinct genre labels, in order of first appearance.
unpacked_genres.loc[:, 'listed_in'].unique()

array(['International TV Shows', 'TV Dramas', 'TV Sci-Fi & Fantasy',
       'Dramas', 'International Movies', 'Horror Movies',
       'Action & Adventure', 'Independent Movies', 'Sci-Fi & Fantasy',
       'TV Mysteries', 'Thrillers', 'Crime TV Shows', 'Documentaries',
       'Sports Movies', 'Comedies', 'Anime Series', 'TV Comedies',
       'Romantic Movies', 'Romantic TV Shows', 'Docuseries',
       'Science & Nature TV', 'Movies', 'British TV Shows', 'Reality TV',
       'Korean TV Shows', 'Music & Musicals', 'LGBTQ Movies', "Kids' TV",
       'TV Action & Adventure', 'Spanish-Language TV Shows',
       'Children & Family Movies', 'TV Shows', 'Classic Movies',
       'Cult Movies', 'TV Horror', 'Stand-Up Comedy & Talk Shows',
       'Teen TV Shows', 'Stand-Up Comedy', 'Anime Features',
       'TV Thrillers', 'Faith & Spirituality', 'Classic & Cult TV'],
      dtype=object)

Например, 5 самых популярных актеров в жанре комедий:

In [156]:
# Five most frequent cast members among titles tagged 'Comedies'.
get_actors_count(
    unpacked_genres.loc[unpacked_genres['listed_in'].eq('Comedies')]
).head(5)

Anupam Kher       19
Paresh Rawal      17
Adam Sandler      17
Shah Rukh Khan    16
Boman Irani       15
dtype: int64

Или научная фантастика:

In [157]:
# Five most frequent cast members among titles tagged 'Sci-Fi & Fantasy'.
get_actors_count(
    unpacked_genres.loc[unpacked_genres['listed_in'].eq('Sci-Fi & Fantasy')]
).head(5)

Nicolas Cage          6
Laurence Fishburne    6
Fred Tatasciore       6
Paul Bettany          6
Hugo Weaving          5
dtype: int64