In [117]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from itertools import combinations

In [118]:
# Load the movie dataset; the CSV is read from the current working directory.
data = pd.read_csv('movie_bd_v5.csv')
# A fixed seed keeps the preview identical across "Restart & Run All" runs.
data.sample(5, random_state=42)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year
684,tt0800240,25000000,17741298,Deception,Hugh Jackman|Ewan McGregor|Michelle Williams|B...,Marcel Langenegger,"When you're in this world, no one is who they ...",As a corporate auditor who works in a number o...,107,Drama|Thriller|Crime|Mystery,Rifkin-Eberts|Dune Entertainment III|Seed Prod...,4/24/2008,5.8,2008
1675,tt0465234,130000000,457363168,National Treasure: Book of Secrets,Nicolas Cage|Jon Voight|Harvey Keitel|Ed Harri...,Jon Turteltaub,The Greatest Adventure History Has Ever Revealed,Benjamin Franklin Gates and Dr. Abigail Chase ...,124,Action|Adventure|Mystery|Thriller,Jerry Bruckheimer Films|Saturn Films|Walt Disn...,12/13/2007,6.0,2007
1686,tt0822854,61000000,95696996,Shooter,Mark Wahlberg|Michael PeÃ±a|Danny Glover|Kate ...,Antoine Fuqua,Yesterday was about honor. Today is about just...,A marksman living in exile is coaxed back into...,124,Action|Drama|Mystery|Thriller|Crime,Paramount Pictures|Di Bonaventura Pictures|Gro...,3/22/2007,6.9,2007
1312,tt0330373,150000000,895921036,Harry Potter and the Goblet of Fire,Daniel Radcliffe|Rupert Grint|Emma Watson|Ralp...,Mike Newell,Dark And Difficult Times Lie Ahead.,"Harry starts his fourth year at Hogwarts, comp...",157,Adventure|Fantasy|Family,Patalex IV Productions Limited|Warner Bros.|He...,11/5/2005,7.3,2005
758,tt1568911,66000000,177584879,War Horse,Tom Hiddleston|Benedict Cumberbatch|Toby Kebbe...,Steven Spielberg,Separated by War. Tested by Battle. Bound by F...,Follows a young man named Albert and his horse...,146,Drama|War,DreamWorks SKG|Amblin Entertainment|Reliance E...,12/25/2011,6.9,2011


In [119]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
data.describe()

Unnamed: 0,budget,revenue,runtime,vote_average,release_year
count,1889.0,1889.0,1889.0,1889.0,1889.0
mean,54310830.0,155365300.0,109.658549,6.140762,2007.860773
std,48587210.0,214669800.0,18.017041,0.764763,4.468841
min,5000000.0,2033165.0,63.0,3.3,2000.0
25%,20000000.0,34560580.0,97.0,5.6,2004.0
50%,38000000.0,83615410.0,107.0,6.1,2008.0
75%,72000000.0,178262600.0,120.0,6.6,2012.0
max,380000000.0,2781506000.0,214.0,8.1,2015.0


# Предобработка / Data Preprocessing

In [120]:
# Container collecting the answer to each question, keyed by question number.
answers = dict()

In [121]:
def _split_names(value):
    """Split a '|'-separated string into a list of names.

    Values that are not strings (for example lists produced by an earlier
    run of this cell) are returned unchanged, so re-running the cell does
    not fail on already converted columns.
    """
    return value.split('|') if isinstance(value, str) else value


# Cells which may contain several names are converted from a single
# '|'-separated string to a list of names.
for column in ('genres', 'cast', 'director', 'production_companies'):
    data[column] = data[column].apply(_split_names)

# Release date is given as a month/day/year string (e.g. 4/24/2008), hence
# it is converted to datetime. The format is stated explicitly so parsing
# does not depend on format inference.
data['release_date'] = pd.to_datetime(data['release_date'], format='%m/%d/%Y')

# Actors in the cast column are sorted alphabetically before the
# combinations function is applied. This is required since combinations
# treats elements as unique based on their position, not on their value,
# so the same two actors would otherwise yield differently ordered pairs.
data['cast'] = data['cast'].apply(sorted)

In [122]:
# Derived columns used by the questions below.
# Profit is box-office revenue minus production budget.
data['profit'] = data['revenue'].sub(data['budget'])
# Calendar month (1-12) of the release date.
data['release_month'] = data['release_date'].dt.month
# Title length in characters.
data['title_length'] = data['original_title'].map(len)
# Overview length in whitespace-separated words.
data['overview_length'] = data['overview'].map(lambda text: len(text.split()))
# Every unordered pair of actors appearing in the film's cast list.
data['pairs'] = data['cast'].map(lambda actors: list(combinations(actors, 2)))

In [123]:
# Sample of the updated dataset with amended and new columns; the fixed
# seed keeps the preview reproducible between runs.
data.sample(5, random_state=42)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month,title_length,overview_length,pairs
491,tt1320239,10000000,4349187,Burke & Hare,"[Andy Serkis, Isla Fisher, Simon Pegg, Tim Cur...",[John Landis],No Job Too Small. No Body Too Big. No Question...,"Two 19th-century opportunists (Simon Pegg, And...",91,"[Comedy, Thriller]","[Ealing Studios, Fragile Films, Quickfire Film...",2010-10-29,6.0,2010,-5650813,10,12,25,"[(Andy Serkis, Isla Fisher), (Andy Serkis, Sim..."
1781,tt0841046,35000000,18317151,Walk Hard: The Dewey Cox Story,"[Jenna Fischer, John C. Reilly, Kristen Wiig, ...",[Jake Kasdan],Life made him tough. Love made him strong. Mus...,Singer Dewey Cox overcomes adversity to become...,96,"[Comedy, Music]","[Columbia Pictures Corporation, Apatow Product...",2007-12-21,6.5,2007,-16682849,12,30,10,"[(Jenna Fischer, John C. Reilly), (Jenna Fisch..."
269,tt1114740,26000000,183293131,Paul Blart: Mall Cop,"[Jayma Mays, Keir O'Donnell, Kevin James, Rain...",[Steve Carr],Safety Never Takes A Holiday.,Mild-mannered Paul Blart (Kevin James) has alw...,91,"[Action, Adventure, Comedy, Family]","[Columbia Pictures, Happy Madison Productions,...",2009-01-15,5.4,2009,157293131,1,20,75,"[(Jayma Mays, Keir O'Donnell), (Jayma Mays, Ke..."
665,tt0852713,25000000,70442940,The House Bunny,"[Anna Faris, Emma Stone, Kat Dennings, Kathari...",[Fred Wolf],For the girls of ZETA house college life was n...,Shelley is living a carefree life until a riva...,97,"[Romance, Comedy]","[Columbia Pictures, Happy Madison Productions,...",2008-08-22,5.6,2008,45442940,8,15,108,"[(Anna Faris, Emma Stone), (Anna Faris, Kat De..."
1587,tt0356634,50000000,200804534,Garfield,"[Bill Murray, Breckin Meyer, Evan Arnold, Jenn...",[Peter Hewitt],Get ready for frisky business.,"Garfield, the fat, lazy, lasagna lover, has ev...",80,"[Animation, Comedy, Family]","[Twentieth Century Fox Film Corporation, Davis...",2004-06-10,5.2,2004,150804534,6,8,49,"[(Bill Murray, Breckin Meyer), (Bill Murray, E..."


In [124]:
# Column dtypes and non-null counts after preprocessing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1889 entries, 0 to 1888
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   imdb_id               1889 non-null   object        
 1   budget                1889 non-null   int64         
 2   revenue               1889 non-null   int64         
 3   original_title        1889 non-null   object        
 4   cast                  1889 non-null   object        
 5   director              1889 non-null   object        
 6   tagline               1889 non-null   object        
 7   overview              1889 non-null   object        
 8   runtime               1889 non-null   int64         
 9   genres                1889 non-null   object        
 10  production_companies  1889 non-null   object        
 11  release_date          1889 non-null   datetime64[ns]
 12  vote_average          1889 non-null   float64       
 13  release_year      

# 1. У какого фильма из списка самый большой бюджет?
### Which movie has the biggest budget?

In [125]:
# Title and IMDb id taken from the maximum-budget lookup in the next cell.
answers['1'] = 'Pirates of the Caribbean: On Stranger Tides (tt1298650)'

In [126]:
# Row(s) whose budget equals the largest budget in the dataset.
data.loc[data['budget'].eq(data['budget'].max())]

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month,title_length,overview_length,pairs
723,tt1298650,380000000,1021683000,Pirates of the Caribbean: On Stranger Tides,"[Geoffrey Rush, Ian McShane, Johnny Depp, Kevi...",[Rob Marshall],Live Forever Or Die Trying.,Captain Jack Sparrow crosses paths with a woma...,136,"[Adventure, Action, Fantasy]","[Walt Disney Pictures, Jerry Bruckheimer Films...",2011-05-11,6.3,2011,641683000,5,43,75,"[(Geoffrey Rush, Ian McShane), (Geoffrey Rush,..."


# 2. Какой из фильмов самый длительный (в минутах)?
### Which movie is the longest (in minutes)?

In [127]:
# Title and IMDb id taken from the maximum-runtime lookup in the next cell.
answers['2'] = 'Gods and Generals (tt0279111)'

In [128]:
# Row(s) whose runtime equals the longest runtime in the dataset.
data.loc[data['runtime'].eq(data['runtime'].max())]

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month,title_length,overview_length,pairs
1157,tt0279111,56000000,12923936,Gods and Generals,"[C. Thomas Howell, Jeff Daniels, Kevin Conway,...",[Ronald F. Maxwell],The nations heart was touched by...,The film centers mostly around the personal an...,214,"[Drama, History, War]","[Turner Pictures, Antietam Filmworks]",2003-02-21,5.8,2003,-43076064,2,17,48,"[(C. Thomas Howell, Jeff Daniels), (C. Thomas ..."


# 3. Какой из фильмов самый короткий (в минутах)?
### Which movie is the shortest (in minutes)?

In [129]:
# Title and IMDb id taken from the minimum-runtime lookup in the next cell.
answers['3'] = 'Winnie the Pooh (tt1449283)'

In [130]:
# Row(s) whose runtime equals the shortest runtime in the dataset.
data.loc[data['runtime'].eq(data['runtime'].min())]

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month,title_length,overview_length,pairs
768,tt1449283,30000000,14460000,Winnie the Pooh,"[Bud Luckey, Craig Ferguson, Jim Cummings, Jim...","[Stephen Anderson, Don Hall]",Oh Pooh.,"During an ordinary day in Hundred Acre Wood, W...",63,"[Animation, Family]","[Walt Disney Pictures, Walt Disney Animation S...",2011-04-13,6.8,2011,-15540000,4,15,52,"[(Bud Luckey, Craig Ferguson), (Bud Luckey, Ji..."


# 4. Какова средняя длительность фильмов?
### What is the average movie length?

In [131]:
# Mean runtime in minutes, rounded; value taken from the next cell's output.
answers['4'] = 110

In [132]:
# Average runtime in minutes, rounded to the nearest integer.
round(data['runtime'].mean())

110

# 5. Каково медианное значение длительности фильмов? 
### What is the median movie length?

In [133]:
# Median runtime in minutes, rounded; value taken from the next cell's output.
answers['5'] = 107

In [134]:
# Median runtime in minutes, rounded to the nearest integer.
round(data['runtime'].median())

107

# 6. Какой самый прибыльный фильм?
### What is the most profitable movie?

In [135]:
# Title and IMDb id taken from the maximum-profit lookup in the next cell.
answers['6'] = 'Avatar (tt0499549)'

In [136]:
# Row(s) whose profit equals the highest profit in the dataset.
data.loc[data['profit'].eq(data['profit'].max())]

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month,title_length,overview_length,pairs
239,tt0499549,237000000,2781505847,Avatar,"[Michelle Rodriguez, Sam Worthington, Sigourne...",[James Cameron],Enter the World of Pandora.,"In the 22nd century, a paraplegic Marine is di...",162,"[Action, Adventure, Fantasy, Science Fiction]","[Ingenious Film Partners, Twentieth Century Fo...",2009-12-10,7.1,2009,2544505847,12,6,28,"[(Michelle Rodriguez, Sam Worthington), (Miche..."


# 7. Какой фильм самый убыточный? 
### What is the least profitable movie?

In [137]:
# Title and IMDb id taken from the minimum-profit lookup in the next cell.
answers['7'] = 'The Lone Ranger (tt1210819)'

In [138]:
# Row(s) whose profit equals the lowest (most negative) profit in the dataset.
data.loc[data['profit'].eq(data['profit'].min())]

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month,title_length,overview_length,pairs
1245,tt1210819,255000000,89289910,The Lone Ranger,"[Armie Hammer, Helena Bonham Carter, James Bad...",[Gore Verbinski],Never Take Off the Mask,The Texas Rangers chase down a gang of outlaws...,149,"[Action, Adventure, Western]","[Walt Disney Pictures, Jerry Bruckheimer Films...",2013-07-03,6.0,2013,-165710090,7,15,68,"[(Armie Hammer, Helena Bonham Carter), (Armie ..."


# 8. У скольких фильмов из датасета объем сборов оказался выше бюджета?
### How many movies earned more at the box office than their budget?

In [139]:
# Number of films with revenue above budget; value taken from the next cell's output.
answers['8'] = 1478

In [140]:
# Count the films whose revenue exceeds their budget by summing the boolean mask.
int(data['revenue'].gt(data['budget']).sum())

1478

# 9. Какой фильм оказался самым кассовым в 2008 году?
### Which movie had the biggest box office in 2008?

In [141]:
# Title and IMDb id taken from the 2008 top-revenue lookup in the next cell.
answers['9'] = 'The Dark Knight (tt0468569)'

In [142]:
# Restrict to 2008 releases first, then pick the highest revenue inside that
# subset. Comparing the 2008 maximum against the whole dataset could also
# return a film from another year that happens to have the same revenue.
released_2008 = data[data['release_year'] == 2008]
released_2008[released_2008['revenue'] == released_2008['revenue'].max()]

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month,title_length,overview_length,pairs
599,tt0468569,185000000,1001921825,The Dark Knight,"[Aaron Eckhart, Christian Bale, Gary Oldman, H...",[Christopher Nolan],Why So Serious?,Batman raises the stakes in his war on crime. ...,152,"[Drama, Action, Crime, Thriller]","[DC Comics, Legendary Pictures, Warner Bros., ...",2008-07-16,8.1,2008,816921825,7,15,67,"[(Aaron Eckhart, Christian Bale), (Aaron Eckha..."


# 10. Самый убыточный фильм за период с 2012 по 2014 г. (включительно)?
### Which movie was the most unprofitable for the period 2012-2014 inclusive?

In [143]:
# Title and IMDb id taken from the 2012-2014 minimum-profit lookup in the next cell.
answers['10'] = 'The Lone Ranger (tt1210819)'

In [144]:
# Restrict to releases from 2012 to 2014 (both ends inclusive) first, then
# pick the lowest profit inside that subset. Comparing the period minimum
# against the whole dataset could also return a film from outside the period
# that happens to have the same profit.
released_2012_2014 = data[data['release_year'].between(2012, 2014)]
released_2012_2014[
    released_2012_2014['profit'] == released_2012_2014['profit'].min()]

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month,title_length,overview_length,pairs
1245,tt1210819,255000000,89289910,The Lone Ranger,"[Armie Hammer, Helena Bonham Carter, James Bad...",[Gore Verbinski],Never Take Off the Mask,The Texas Rangers chase down a gang of outlaws...,149,"[Action, Adventure, Western]","[Walt Disney Pictures, Jerry Bruckheimer Films...",2013-07-03,6.0,2013,-165710090,7,15,68,"[(Armie Hammer, Helena Bonham Carter), (Armie ..."


# 11. Какого жанра фильмов больше всего?
### Which genre is the most popular?

In [145]:
# Most frequent genre according to the counts computed in the cells below.
answers['11'] = 'Drama'

VARIANT 1

In [146]:
# A movie may carry several genres, so the 'genres' lists are exploded
# into one genre per row before counting.
# The five most common genres are shown so that a tie for first place
# would be visible.
Counter(data['genres'].explode()).most_common(5)

[('Drama', 782),
 ('Comedy', 683),
 ('Thriller', 596),
 ('Action', 582),
 ('Adventure', 415)]

VARIANT 2

In [147]:
# Alternative: count with pandas' value_counts instead of Counter.
data['genres'].explode().value_counts().head(5)

Drama        782
Comedy       683
Thriller     596
Action       582
Adventure    415
Name: genres, dtype: int64

# 12. Фильмы какого жанра чаще всего становятся прибыльными? 
### Movies of which genre are most often profitable?

In [148]:
# Most frequent genre among profitable films, per the counts in the cells below.
answers['12'] = 'Drama'

VARIANT 1

In [149]:
# Only films with a positive profit count as profitable.
# Their genre lists are exploded (one genre per row) because a film can
# be assigned several genres.
Counter(data.loc[data['profit'].gt(0), 'genres'].explode()).most_common(5)

[('Drama', 560),
 ('Comedy', 551),
 ('Thriller', 446),
 ('Action', 444),
 ('Adventure', 337)]

VARIANT 2

In [150]:
# Alternative: count with value_counts instead of Counter.
data.loc[data['profit'].gt(0), 'genres'].explode().value_counts().head(5)

Drama        560
Comedy       551
Thriller     446
Action       444
Adventure    337
Name: genres, dtype: int64

# 13. У какого режиссера самые большие суммарные кассовые сборы?
### Who is the highest grossing director?

In [151]:
# Director with the largest summed revenue, per the ranking in the next cell.
answers['13'] = 'Peter Jackson'

In [152]:
# A film can have several directors, so the director lists are exploded
# to one director per row; each co-director is credited with the film's
# full revenue.
# Revenue is then summed per director and ranked; the first entry is the
# highest grossing director.
(
    data.explode('director')
    .groupby('director')['revenue']
    .sum()
    .sort_values(ascending=False)
    .head(5)
)

director
Peter Jackson        6490593685
Christopher Nolan    4167548502
David Yates          4154295625
Michael Bay          3886938960
J.J. Abrams          3579169916
Name: revenue, dtype: int64

# 14. Какой режиссер снял больше всего фильмов в стиле Action?
### Which director filmed the largest number of action genre movies?

In [153]:
# Director with the most Action films, per the counts in the cells below.
answers['14'] = 'Robert Rodriguez'

VARIANT 1

In [154]:
# Director-focused dataset: one (director, genre) combination per row.
data_dir_genre = data.explode('director').explode('genres')

# Keep only the Action rows, then count how often each director appears.
Counter(
    data_dir_genre.loc[data_dir_genre['genres'].eq('Action'), 'director']
).most_common(5)

[('Robert Rodriguez', 9),
 ('Michael Bay', 7),
 ('Paul W.S. Anderson', 7),
 ('Antoine Fuqua', 6),
 ('Ridley Scott', 6)]

VARIANT 2

In [155]:
# One (director, genre) combination per row.
data_dir_genre = data.explode('director').explode('genres')

# Alternative using value_counts instead of Counter:
# (1) keep the Action rows, (2) count occurrences of each director.
(
    data_dir_genre
    .loc[data_dir_genre['genres'].eq('Action'), 'director']
    .value_counts()
    .head(5)
)

Robert Rodriguez      9
Paul W.S. Anderson    7
Michael Bay           7
Antoine Fuqua         6
Ridley Scott          6
Name: director, dtype: int64

# 15. Фильмы с каким актером принесли самые высокие кассовые сборы в 2012 году? 
### Who was the highest grossing actor in 2012?

In [156]:
# Actor with the largest summed 2012 revenue, per the ranking in the next cell.
answers['15'] = 'Chris Hemsworth'

In [157]:
# Cast-focused dataset (one actor per row) for the year 2012.
# The year filter is applied before exploding, so the frame is exploded
# only once and only for the rows that are actually needed.
cast_2012 = data[data['release_year'] == 2012].explode('cast')

# The actor whose films have the highest summed revenue (box office) is
# considered the highest grossing one.
cast_2012.groupby('cast')['revenue'].sum().sort_values(
    ascending=False).head(5)

cast
Chris Hemsworth      2027450773
Denis Leary          1629460639
Anne Hathaway        1522851057
Chris Evans          1519557910
Robert Downey Jr.    1519557910
Name: revenue, dtype: int64

# 16. Какой актер снялся в большем количестве высокобюджетных фильмов?
### Which actor has starred in the largest number of big budget films?

In [158]:
# Actor appearing in the most above-average-budget films, per the next cell.
answers['16'] = 'Matt Damon'

In [159]:
# For the purpose of this task, films with a budget higher than the
# dataset average are considered big budget.
# The budget filter is applied before exploding, so the cast lists are
# exploded once and only for the big-budget films.
big_budget_films = data[data['budget'] > data['budget'].mean()]
big_budget_films['cast'].explode().value_counts().head(5)

Matt Damon           18
Adam Sandler         17
Angelina Jolie       16
Tom Cruise           15
Samuel L. Jackson    15
Name: cast, dtype: int64

# 17. В фильмах какого жанра больше всего снимался Nicolas Cage? 
### What is the top genre for movies with Nicolas Cage?

In [160]:
# Most frequent genre among Nicolas Cage films, per the counts in the next cell.
answers['17'] = 'Action'

In [161]:
# One (actor, genre) combination per row.
data_cast_genre = data.explode('cast').explode('genres')

# Keep the rows for Nicolas Cage and count how often each genre occurs.
(
    data_cast_genre
    .loc[data_cast_genre['cast'].eq('Nicolas Cage'), 'genres']
    .value_counts()
    .head(5)
)

Action      17
Thriller    15
Drama       12
Crime       10
Fantasy      8
Name: genres, dtype: int64

# 18. Самый убыточный фильм от Paramount Pictures
### Which film by Paramount Pictures is the least profitable?

In [162]:
# Title and IMDb id taken from the Paramount minimum-profit lookup in the next cell.
answers['18'] = 'K-19: The Widowmaker (tt0267626)'

In [163]:
# Dataset with one production company per row, exploded once and then
# filtered to the rows credited exactly to 'Paramount Pictures'.
data_pp = data.explode('production_companies')
data_pp = data_pp[data_pp['production_companies'] == 'Paramount Pictures']

# Film(s) with the lowest profit among the Paramount Pictures rows.
data_pp[data_pp['profit'] == data_pp['profit'].min()]

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month,title_length,overview_length,pairs
925,tt0267626,100000000,35168966,K-19: The Widowmaker,"[Harrison Ford, John Shrapnel, Joss Ackland, L...",[Kathryn Bigelow],Fate has found its hero.,When Russia's first nuclear submarine malfunct...,138,"[Thriller, Drama, History]",Paramount Pictures,2002-07-19,6.0,2002,-64831034,7,20,23,"[(Harrison Ford, John Shrapnel), (Harrison For..."


# 19. Какой год стал самым успешным по суммарным кассовым сборам?
### Which year yielded the highest revenue?

In [164]:
# Year with the highest summed revenue, per the ranking in the next cell.
answers['19'] = 2015

In [165]:
# Revenue is summed per release year and ranked; the first entry is the
# most successful year.
(
    data.groupby('release_year')['revenue']
    .sum()
    .sort_values(ascending=False)
    .head(5)
)

release_year
2015    25449202382
2014    23405862953
2013    23213799791
2012    23079001687
2011    22676791872
Name: revenue, dtype: int64

# 20. Какой самый прибыльный год для студии Warner Bros?
### Which year was the most profitable for Warner Bros?

In [166]:
# Most profitable year for Warner Bros, per the ranking in the next cell.
answers['20'] = 2014

In [167]:
# Since Warner Bros comprises several business entities (like
# Warner Bros. Animation or Warner Bros. Pictures), the new dataset
# includes every production-company row containing 'Warner Bros'.
companies = data.explode('production_companies')
data_wb = companies[companies['production_companies'].str.contains(
    'Warner Bros', regex=False)]

# A film that lists several Warner Bros entities appears once per entity in
# data_wb; explode keeps the original row label, so repeated labels mark the
# same film. Each film is kept once so its profit is not counted twice.
wb_films = data_wb[~data_wb.index.duplicated()]

wb_films.groupby('release_year')['profit'].sum().sort_values(
    ascending=False).head(5)

release_year
2014    2292949646
2007    2201675217
2008    2134595031
2010    1974712985
2011    1871393682
Name: profit, dtype: int64

In [168]:
# For information only: the distinct Warner Bros entity names that matched.
data_wb['production_companies'].unique()

array(['Warner Bros.', 'Warner Bros. Animation',
       'Warner Bros. Interactive Entertainment', 'Warner Bros. Pictures'],
      dtype=object)

# 21. В каком месяце за все годы суммарно вышло больше всего фильмов?
### Which month has the largest number of movies released?

In [169]:
# Month number with the most releases, per the counts in the next cell.
answers['21'] = 9

In [170]:
# The release_month column was derived from release_date during
# preprocessing; count the films per month and show the top five.
data['release_month'].value_counts().head(5)

9     227
12    190
10    186
8     161
3     156
Name: release_month, dtype: int64

# 22. Сколько суммарно вышло фильмов летом? (за июнь, июль, август)
### How many movies were released during summer months (June, July, August)?

In [171]:
# Number of films released in June-August; value taken from the next cell's output.
answers['22'] = 450

In [172]:
# Count the films released in months 6-8 (June to August) by summing the
# membership mask.
int(data['release_month'].isin([6, 7, 8]).sum())

450

# 23. Для какого режиссера зима – самое продуктивное время года? 
### For which director is winter the most productive season?

In [173]:
# Director with the most winter releases, per the counts in the next cell.
answers['23'] = 'Peter Jackson'

In [174]:
# Dataset with one director per row.
data_dir = data.explode('director')

# Keep the winter releases (December, January, February) and count how
# often each director appears.
(
    data_dir
    .loc[data_dir['release_month'].isin([12, 1, 2]), 'director']
    .value_counts()
    .head(5)
)

Peter Jackson        7
Steven Soderbergh    6
Clint Eastwood       6
Nancy Meyers         4
Martin Scorsese      4
Name: director, dtype: int64

# 24. Какая студия дает самые длинные названия своим фильмам по количеству символов?
### Which production company gives its films the longest titles (in characters)?

In [175]:
# Company with the longest average title, per the ranking in the next cell.
answers['24'] = 'Four By Two Productions'

In [176]:
# Dataset with one production company per row.
data_prod_co = data.explode('production_companies')

# The title_length column (characters per title) was created during
# preprocessing; average it per company and rank in descending order.
(
    data_prod_co.groupby('production_companies')['title_length']
    .mean()
    .sort_values(ascending=False)
    .head(5)
)

production_companies
Four By Two Productions      83.0
Jim Henson Company, The      59.0
Dos Corazones                47.0
Museum Canada Productions    46.0
Polsky Films                 46.0
Name: title_length, dtype: float64

# 25. Описание фильмов какой студии в среднем самые длинные по количеству слов?
### Which production company gives its films the longest overview in words?

In [177]:
# Company with the longest average overview, per the ranking in the next cell.
answers['25'] = 'Midnight Picture Show'

In [178]:
# Dataset with one production company per row.
data_prod_co = data.explode('production_companies')

# The overview_length column (words per overview) was created during
# preprocessing; average it per company and rank in descending order.
(
    data_prod_co.groupby('production_companies')['overview_length']
    .mean()
    .sort_values(ascending=False)
    .head(5)
)

production_companies
Midnight Picture Show               175.0
Room 9 Entertainment                161.0
98 MPH Productions                  159.0
Heineken Branded Entertainment      159.0
Brookwell-McNamara Entertainment    156.0
Name: overview_length, dtype: float64

# 26. Какие фильмы входят в 1 процент лучших по рейтингу? 
### Which films are in the top 1 percent by average rating?

In [179]:
# All three titles appear in the top-rated listing produced by the next cell.
# NOTE(review): the leading '1.' is presumably the number of the chosen
# multiple-choice option — confirm against the assignment.
answers['26'] = '1. Inside Out, The Dark Knight, 12 Years a Slave'

In [180]:
# (1) Rank the films by vote_average, best first;
# (2) keep the first len(data) // 100 rows, i.e. one percent of the films.
data.sort_values(by='vote_average', ascending=False).head(len(data) // 100)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month,title_length,overview_length,pairs
599,tt0468569,185000000,1001921825,The Dark Knight,"[Aaron Eckhart, Christian Bale, Gary Oldman, H...",[Christopher Nolan],Why So Serious?,Batman raises the stakes in his war on crime. ...,152,"[Drama, Action, Crime, Thriller]","[DC Comics, Legendary Pictures, Warner Bros., ...",2008-07-16,8.1,2008,816921825,7,15,67,"[(Aaron Eckhart, Christian Bale), (Aaron Eckha..."
118,tt0816692,165000000,621752480,Interstellar,"[Anne Hathaway, Casey Affleck, Jessica Chastai...",[Christopher Nolan],Mankind was born on Earth. It was never meant ...,Interstellar chronicles the adventures of a gr...,169,"[Adventure, Drama, Science Fiction]","[Paramount Pictures, Legendary Pictures, Warne...",2014-11-05,8.0,2014,456752480,11,12,35,"[(Anne Hathaway, Casey Affleck), (Anne Hathawa..."
125,tt2084970,14000000,233555708,The Imitation Game,"[Allen Leech, Benedict Cumberbatch, Keira Knig...",[Morten Tyldum],The true enigma was the man who cracked the code.,Based on the real life story of legendary cryp...,113,"[History, Drama, Thriller, War]","[Black Bear Pictures, Bristol Automotive]",2014-11-14,8.0,2014,219555708,11,18,46,"[(Allen Leech, Benedict Cumberbatch), (Allen L..."
9,tt2096673,175000000,853708609,Inside Out,"[Amy Poehler, Bill Hader, Lewis Black, Phyllis...",[Pete Docter],Meet the little voices inside your head.,"Growing up can be a bumpy road, and it's no ex...",94,"[Comedy, Animation, Family]","[Walt Disney Pictures, Pixar Animation Studios...",2015-06-09,8.0,2015,678708609,6,10,112,"[(Amy Poehler, Bill Hader), (Amy Poehler, Lewi..."
34,tt3170832,6000000,35401758,Room,"[Brie Larson, Jacob Tremblay, Joan Allen, Sean...",[Lenny Abrahamson],Love knows no boundaries,Jack is a young boy of 5 years old who has liv...,117,"[Drama, Thriller]","[Element Pictures, No Trace Camping, A24, Dupe...",2015-10-16,8.0,2015,29401758,10,4,49,"[(Brie Larson, Jacob Tremblay), (Brie Larson, ..."
1183,tt0993846,100000000,392000694,The Wolf of Wall Street,"[Jonah Hill, Kyle Chandler, Leonardo DiCaprio,...",[Martin Scorsese],EARN. SPEND. PARTY.,A New York stockbroker refuses to cooperate in...,180,"[Crime, Drama, Comedy]","[Paramount Pictures, Appian Way, EMJAG Product...",2013-12-25,7.9,2013,292000694,12,23,29,"[(Jonah Hill, Kyle Chandler), (Jonah Hill, Leo..."
128,tt2267998,61000000,369330363,Gone Girl,"[Ben Affleck, Carrie Coon, Neil Patrick Harris...",[David Fincher],You don't know what you've got 'til it's...,With his wife's disappearance having become th...,145,"[Mystery, Thriller, Drama]","[Twentieth Century Fox Film Corporation, Regen...",2014-10-01,7.9,2014,308330363,10,9,30,"[(Ben Affleck, Carrie Coon), (Ben Affleck, Nei..."
1191,tt2024544,20000000,187000000,12 Years a Slave,"[Benedict Cumberbatch, Chiwetel Ejiofor, Lupit...",[Steve McQueen],The extraordinary true story of Solomon Northup,"In the pre-Civil War United States, Solomon No...",134,"[Drama, History]","[Plan B Entertainment, Regency Enterprises, Ri...",2013-10-18,7.9,2013,167000000,10,16,61,"[(Benedict Cumberbatch, Chiwetel Ejiofor), (Be..."
119,tt2015381,170000000,773312399,Guardians of the Galaxy,"[Bradley Cooper, Chris Pratt, Dave Bautista, V...",[James Gunn],All heroes start somewhere.,"Light years from Earth, 26 years after being a...",121,"[Action, Science Fiction, Adventure]","[Marvel Studios, Moving Picture Company (MPC),...",2014-07-30,7.9,2014,603312399,7,23,28,"[(Bradley Cooper, Chris Pratt), (Bradley Coope..."
1081,tt0167260,94000000,1118888979,The Lord of the Rings: The Return of the King,"[Elijah Wood, Ian McKellen, Liv Tyler, Orlando...",[Peter Jackson],The eye of the enemy is moving.,Aragorn is revealed as the heir to the ancient...,201,"[Adventure, Fantasy, Action]","[WingNut Films, New Line Cinema]",2003-12-01,7.9,2003,1024888979,12,45,45,"[(Elijah Wood, Ian McKellen), (Elijah Wood, Li..."


# 27. Какие актеры чаще всего снимаются в одном фильме вместе?
### Which pairs of actors star in the same movies most often?

In [181]:
# The three pairs tied for first place in the pair counts of the next cell.
answers['27'] = 'Daniel Radcliffe & Rupert Grint / Daniel Radcliffe & Emma Watson / Emma Watson & Rupert Grint'

In [182]:
# The cast lists were sorted alphabetically during preprocessing, so the
# same two actors always form the same (first, second) tuple.
# The 'pairs' column (all actor pairs of each film, also built during
# preprocessing) is exploded to one pair per row and counted by frequency.
Counter(data['pairs'].explode()).most_common(5)

[(('Daniel Radcliffe', 'Emma Watson'), 8),
 (('Daniel Radcliffe', 'Rupert Grint'), 8),
 (('Emma Watson', 'Rupert Grint'), 8),
 (('Ben Stiller', 'Owen Wilson'), 6),
 (('Helena Bonham Carter', 'Johnny Depp'), 6)]

# Submission

In [183]:
# Display the collected answers for submission.
answers

{'1': 'Pirates of the Caribbean: On Stranger Tides (tt1298650)',
 '2': 'Gods and Generals (tt0279111)',
 '3': 'Winnie the Pooh (tt1449283)',
 '4': 110,
 '5': 107,
 '6': 'Avatar (tt0499549)',
 '7': 'The Lone Ranger (tt1210819)',
 '8': 1478,
 '9': 'The Dark Knight (tt0468569)',
 '10': 'The Lone Ranger (tt1210819)',
 '11': 'Drama',
 '12': 'Drama',
 '13': 'Peter Jackson',
 '14': 'Robert Rodriguez',
 '15': 'Chris Hemsworth',
 '16': 'Matt Damon',
 '17': 'Action',
 '18': 'K-19: The Widowmaker (tt0267626)',
 '19': 2015,
 '20': 2014,
 '21': 9,
 '22': 450,
 '23': 'Peter Jackson',
 '24': 'Four By Two Productions',
 '25': 'Midnight Picture Show',
 '26': '1. Inside Out, The Dark Knight, 12 Years a Slave',
 '27': 'Daniel Radcliffe & Rupert Grint / Daniel Radcliffe & Emma Watson / Emma Watson & Rupert Grint'}

In [184]:
# Sanity check: one answer per question (27 questions in this notebook).
len(answers)

27