In [180]:
import numpy as np
import pandas as pd

In [181]:
data = pd.read_csv('movie_bd_v5.csv')
data.sample(5)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year
1044,tt1615065,45000000,47000000,Savages,Blake Lively|Taylor Kitsch|Aaron Taylor-Johnso...,Oliver Stone,Young Beautiful Deadly.,Pot growers Ben and Chon face off against the ...,131,Crime|Drama|Thriller,Ixtlan|Relativity Media|Onda Entertainment,7/6/2012,6.2,2012
779,tt1217613,70000000,202466756,Battle: Los Angeles,Aaron Eckhart|RamÃ³n RodrÃ­guez|Will Rothhaar|...,Jonathan Liebesman,It's not war. It's survival.,The Earth is attacked by unknown forces. As pe...,116,Action|Science Fiction,Columbia Pictures|Original Film|Relativity Media,3/8/2011,5.5,2011
994,tt1764651,100000000,312573423,The Expendables 2,Sylvester Stallone|Jason Statham|Dolph Lundgre...,Simon West,Back for War.,Mr. Church reunites the Expendables for what s...,103,Action|Adventure|Thriller,Nu Image Films|Millennium Films,8/8/2012,6.0,2012
405,tt1037705,80000000,157107755,The Book of Eli,Denzel Washington|Gary Oldman|Michael Gambon|M...,Albert Hughes|Allen Hughes,Some will kill to have it. He will kill to pro...,"A post-apocalyptic tale, in which a lone man f...",118,Action|Thriller|Science Fiction,Alcon Entertainment|Silver Pictures,1/14/2010,6.5,2010
199,tt0787474,60000000,108255770,The Boxtrolls,Ben Kingsley|Isaac Hempstead Wright|Elle Fanni...,Graham Annable|Anthony Stacchi,When troubles strikes friends stack together,An orphaned boy raised by underground creature...,97,Animation|Comedy|Family|Fantasy,Laika Entertainment,9/10/2014,6.6,2014


In [182]:
data.describe()

Unnamed: 0,budget,revenue,runtime,vote_average,release_year
count,1889.0,1889.0,1889.0,1889.0,1889.0
mean,54310830.0,155365300.0,109.658549,6.140762,2007.860773
std,48587210.0,214669800.0,18.017041,0.764763,4.468841
min,5000000.0,2033165.0,63.0,3.3,2000.0
25%,20000000.0,34560580.0,97.0,5.6,2004.0
50%,38000000.0,83615410.0,107.0,6.1,2008.0
75%,72000000.0,178262600.0,120.0,6.6,2012.0
max,380000000.0,2781506000.0,214.0,8.1,2015.0


# Предобработка
\### Preprocessing ###

In [183]:
answers = {} # Dictionary for answers

# String template for movies in format "Movie Title (imdb_id)"
str_movie_title = '{} ({})'

# Function to fetch title and imdb_id from the Pandas Series
# * It is supposed, the Series contains only one line!
# * Returns formatted string
def get_movie(in_df):
  if (in_df.shape[0] == 1):
    return str_movie_title.format(in_df.original_title.iloc[0], in_df.iloc[0].imdb_id)
  else:
    return 'ERROR: DataFrame contains more than one Series!'

# Function to explode DataFrame by specific column
# * Returns transformed DataFrame
def explode_df(in_df, column, separator='|'):
  new_df = in_df.copy()
  new_df[column] = new_df[column].str.split(separator)
  return new_df.explode(column)

# Calculating profit as (revenue - budget) and appending related coulumn:
data['profit'] = data.revenue - data.budget


# Months dictionary :)
months = {1: 'Январь',
          2: 'Февраль',
          3: 'Март',
          4: 'Апрель',
          5: 'Май',
          6: 'Июнь',
          7: 'Июль',
          8: 'Август',
          9: 'Сентябрь',
          10: 'Октябрь',
          11: 'Ноябрь',
          12: 'Декабрь'}

# 1. У какого фильма из списка самый большой бюджет?
<font color="blue">Which movie has the highest budget?</font>

In [184]:
# Filter by max budget value
with_max_budget = data[data.budget == data.budget.max()]
display(with_max_budget)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit
723,tt1298650,380000000,1021683000,Pirates of the Caribbean: On Stranger Tides,Johnny Depp|PenÃ©lope Cruz|Geoffrey Rush|Ian M...,Rob Marshall,Live Forever Or Die Trying.,Captain Jack Sparrow crosses paths with a woma...,136,Adventure|Action|Fantasy,Walt Disney Pictures|Jerry Bruckheimer Films|M...,5/11/2011,6.3,2011,641683000


In [185]:
answers['1'] = get_movie(with_max_budget)

<font color="blue">OPTION 2</font>

In [186]:
# Sort descending, then take first
data.sort_values('budget', ascending=False).head(1)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit
723,tt1298650,380000000,1021683000,Pirates of the Caribbean: On Stranger Tides,Johnny Depp|PenÃ©lope Cruz|Geoffrey Rush|Ian M...,Rob Marshall,Live Forever Or Die Trying.,Captain Jack Sparrow crosses paths with a woma...,136,Adventure|Action|Fantasy,Walt Disney Pictures|Jerry Bruckheimer Films|M...,5/11/2011,6.3,2011,641683000


# 2. Какой из фильмов самый длительный (в минутах)?

<font color="blue">Which movie has the highest running length?</font>

In [187]:
with_max_runtime = data[data.runtime == data.runtime.max()]
display(with_max_runtime)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit
1157,tt0279111,56000000,12923936,Gods and Generals,Stephen Lang|Jeff Daniels|Robert Duvall|Kevin ...,Ronald F. Maxwell,The nations heart was touched by...,The film centers mostly around the personal an...,214,Drama|History|War,Turner Pictures|Antietam Filmworks,2/21/2003,5.8,2003,-43076064


In [188]:
answers['2'] = get_movie(with_max_runtime)

# 3. Какой из фильмов самый короткий (в минутах)?

<font color="blue">Which movie has the lowest running length?</font>



In [189]:
with_min_runtime = data[data.runtime == data.runtime.min()]
display(with_min_runtime)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit
768,tt1449283,30000000,14460000,Winnie the Pooh,Jim Cummings|Travis Oates|Jim Cummings|Bud Luc...,Stephen Anderson|Don Hall,Oh Pooh.,"During an ordinary day in Hundred Acre Wood, W...",63,Animation|Family,Walt Disney Pictures|Walt Disney Animation Stu...,4/13/2011,6.8,2011,-15540000


In [190]:
answers['3'] = get_movie(with_min_runtime)

# 4. Какова средняя длительность фильмов?

<font color="blue">What is the average movies running length?</font>

In [191]:
# Call mean by runtime and round (to match the suggested answers)
runtime_mean = data.runtime.mean()
print(runtime_mean)

109.6585494970884


In [192]:
answers['4'] = round(runtime_mean)

# 5. Каково медианное значение длительности фильмов?

<font color="blue">What is the median value of movies running length?</font>

In [193]:
runtime_median = data.runtime.agg('median')
print(runtime_median)

107.0


In [194]:
answers['5'] = round(runtime_median)

# ------------------------------------------------------------------------------

# 6. Какой самый прибыльный фильм?
#### Внимание! Здесь и далее под «прибылью» или «убытками» понимается разность между сборами и бюджетом фильма. (прибыль = сборы - бюджет) в нашем датасете это будет (profit = revenue - budget)

<font color="blue">Which movie is the most profitable?</font>

In [195]:
most_profitable = data[data.profit == data.profit.max()]
display(most_profitable)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit
239,tt0499549,237000000,2781505847,Avatar,Sam Worthington|Zoe Saldana|Sigourney Weaver|S...,James Cameron,Enter the World of Pandora.,"In the 22nd century, a paraplegic Marine is di...",162,Action|Adventure|Fantasy|Science Fiction,Ingenious Film Partners|Twentieth Century Fox ...,12/10/2009,7.1,2009,2544505847


In [196]:
answers['6'] = get_movie(most_profitable)

# 7. Какой фильм самый убыточный?

<font color="blue">Which movie is the most unprofitable</font>

In [197]:
most_unprofitable = data[data.profit == data.profit.min()]
display(most_unprofitable)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit
1245,tt1210819,255000000,89289910,The Lone Ranger,Johnny Depp|Armie Hammer|William Fichtner|Hele...,Gore Verbinski,Never Take Off the Mask,The Texas Rangers chase down a gang of outlaws...,149,Action|Adventure|Western,Walt Disney Pictures|Jerry Bruckheimer Films|I...,7/3/2013,6.0,2013,-165710090


In [198]:
answers['7'] = get_movie(most_unprofitable)

# 8. У скольких фильмов из датасета объем сборов оказался выше бюджета?

<font color="blue">How many movies' revenue appeared to be higher than budget?</font>

In [199]:
# Filter by condition and count ids
count_revenue_gt_budget = data[data.revenue > data.budget].imdb_id.count()
print(count_revenue_gt_budget)

1478


In [200]:
answers['8'] = count_revenue_gt_budget

# 9. Какой фильм оказался самым кассовым в 2008 году?

<font color="blue">Which was the highest grossing movie in 2008?</font>

In [201]:
# Get movies released in between 2008
released_2008 = data[data.release_year == 2008]
# Take with highest revenue
highest_box_office_in_2008 = released_2008[released_2008.revenue == released_2008.revenue.max()]
display(highest_box_office_in_2008)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit
599,tt0468569,185000000,1001921825,The Dark Knight,Christian Bale|Michael Caine|Heath Ledger|Aaro...,Christopher Nolan,Why So Serious?,Batman raises the stakes in his war on crime. ...,152,Drama|Action|Crime|Thriller,DC Comics|Legendary Pictures|Warner Bros.|Syncopy,7/16/2008,8.1,2008,816921825


In [202]:
answers['9'] = get_movie(highest_box_office_in_2008)

# 10. Самый убыточный фильм за период с 2012 по 2014 г. (включительно)?

<font color="blue">Which was the most unprofitable movie in between 2012 and 2014 (inclusive)?</font>

In [203]:
# Get movies released in between 2012 and 2014
released_2012_2014 = data[(data.release_year >= 2012) & (data.release_year <= 2014)]
# Take with least profit
most_unprofitable_in_2012_2014 = released_2012_2014[released_2012_2014.profit == released_2012_2014.profit.min()]
display(most_unprofitable_in_2012_2014)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit
1245,tt1210819,255000000,89289910,The Lone Ranger,Johnny Depp|Armie Hammer|William Fichtner|Hele...,Gore Verbinski,Never Take Off the Mask,The Texas Rangers chase down a gang of outlaws...,149,Action|Adventure|Western,Walt Disney Pictures|Jerry Bruckheimer Films|I...,7/3/2013,6.0,2013,-165710090


In [204]:
answers['10'] = get_movie(most_unprofitable_in_2012_2014)

# ------------------------------------------------------------------------------

# 11. Какого жанра фильмов больше всего?

<font color="blue">What is the most common genre of movies?</font>

In [205]:
# Take just genres and split all by vertical line (results in series of lists),
# then explode to series of all (even repeating) genres with replicated ids
# finally just count occurrences of each, answer is at the top
genres_count = data.genres.str.split('|').explode().value_counts()
display(genres_count)

Drama              782
Comedy             683
Thriller           596
Action             582
Adventure          415
Crime              315
Romance            308
Family             260
Science Fiction    248
Fantasy            222
Horror             176
Mystery            168
Animation          139
Music               64
History             62
War                 58
Western             19
Documentary          8
Foreign              2
Name: genres, dtype: int64

In [206]:
answers['11'] = genres_count.idxmax()

<font color="blue">OPTION 2</font>

In [207]:
# Create series from flat list, then count 
pd.Series(data.genres.str.cat(sep='|').split('|')).value_counts()

Drama              782
Comedy             683
Thriller           596
Action             582
Adventure          415
Crime              315
Romance            308
Family             260
Science Fiction    248
Fantasy            222
Horror             176
Mystery            168
Animation          139
Music               64
History             62
War                 58
Western             19
Documentary          8
Foreign              2
dtype: int64

# 12. Фильмы какого жанра чаще всего становятся прибыльными?

<font color="blue">Movies of which genre are the most profitable?</font>

In [208]:
# Same as previous, but take only movies with positive profit
profitable_genres = data[data.profit > 0].genres.str.split('|').explode().value_counts()
display(profitable_genres)

Drama              560
Comedy             551
Thriller           446
Action             444
Adventure          337
Romance            242
Crime              231
Family             226
Science Fiction    195
Fantasy            188
Horror             150
Animation          120
Mystery            119
Music               47
History             46
War                 41
Western             12
Documentary          7
Name: genres, dtype: int64

In [209]:
answers['12'] = profitable_genres.idxmax()

# 13. У какого режиссера самые большие суммарные кассовые сборы?

<font color="blue">Which movie director has the highest sum of revenue?</font>

In [210]:
# Using explode_df(...) function defined in ### Preprocessing ### section above,
# then group, summarize revenue and sort, the answer is at the top
directors_with_most_revenue = explode_df(data,'director').groupby('director').revenue.sum().sort_values(ascending=False)
display(directors_with_most_revenue)

director
Peter Jackson        6490593685
Christopher Nolan    4167548502
David Yates          4154295625
Michael Bay          3886938960
J.J. Abrams          3579169916
                        ...    
David MichÃ´d           2295423
Steven Shainberg        2281089
Paul Schrader           2062066
Keanu Reeves            2054941
Simon Hunter            2033165
Name: revenue, Length: 997, dtype: int64

In [211]:
answers['13'] = directors_with_most_revenue.idxmax()

# 14. Какой режиссер снял больше всего фильмов в стиле Action?

<font color="blue">Which movie director filmed the most movies of Action genre?</font>

In [212]:
# First, filter action movies only
action_movies = data[data.genres.str.contains('Action')]
# Now explode them by director, count and sort
action_directors = explode_df(action_movies,'director').groupby('director').size().sort_values(ascending=False)
display(action_directors)

director
Robert Rodriguez      9
Paul W.S. Anderson    7
Michael Bay           7
Ridley Scott          6
Antoine Fuqua         6
                     ..
Joe Cornish           1
Roger Spottiswoode    1
Jimmy Hayward         1
Jim Gillespie         1
Jon Avnet             1
Length: 364, dtype: int64

In [213]:
answers['14'] = action_directors.idxmax()

# 15. Фильмы с каким актером принесли самые высокие кассовые сборы в 2012 году?

<font color="blue">Which actor brought the highest box-office gains in 2012?</font>

In [214]:
# Filter by 2012 relesae year, explode by cast, then group and sum revenue
revenue_with_actor_in_2012 = explode_df(data[data.release_year == 2012],'cast').groupby('cast').revenue.sum().sort_values(ascending=False)
display(revenue_with_actor_in_2012)

cast
Chris Hemsworth      2027450773
Denis Leary          1629460639
Anne Hathaway        1522851057
Chris Evans          1519557910
Robert Downey Jr.    1519557910
                        ...    
Jason Bateman           3428048
Danny Huston            2106557
Sami Gayle              2106557
Josh Lucas              2106557
Nicolas Cage            2106557
Name: revenue, Length: 466, dtype: int64

In [215]:
answers['15'] = revenue_with_actor_in_2012.idxmax()

# 16. Какой актер снялся в большем количестве высокобюджетных фильмов?

<font color="blue">Which actor has starred in the most big-budget films?<br>
(note: in movies with budget higher than mid)
</font>

In [216]:
# First, get "big-budget" movies
bigbudget_movies = data[data.budget > data.budget.mean()]
# Explode by cast, then group and count
actors_counts_in_bigbudget_movies = explode_df(bigbudget_movies,'cast').groupby('cast').size().sort_values(ascending=False)
display(actors_counts_in_bigbudget_movies)

cast
Matt Damon           18
Adam Sandler         17
Angelina Jolie       16
Eddie Murphy         15
Samuel L. Jackson    15
                     ..
Leslie Bibb           1
Leonard Nimoy         1
Lennie James          1
Lena Olin             1
50 Cent               1
Length: 1505, dtype: int64

In [217]:
answers['16'] = actors_counts_in_bigbudget_movies.idxmax()

# 17. В фильмах какого жанра больше всего снимался Nicolas Cage?

<font color="blue">In what genre has Nicolas Cage starred the most?</font>

In [218]:
# First, get movies where Nicolas Cage starred
movies_with_nicolas_cage = data[data.cast.str.contains('Nicolas Cage')]
# Now do as usual
genres_with_nicolas_cage = explode_df(movies_with_nicolas_cage,'genres').groupby('genres').size().sort_values(ascending=False)
display(genres_with_nicolas_cage)

genres
Action             17
Thriller           15
Drama              12
Crime              10
Fantasy             8
Adventure           7
Comedy              6
Science Fiction     4
Mystery             3
Family              3
Animation           3
History             2
War                 1
Romance             1
Horror              1
dtype: int64

In [219]:
answers['17'] = genres_with_nicolas_cage.idxmax()

# ------------------------------------------------------------------------------

# 18. Самый убыточный фильм от Paramount Pictures

<font color="blue">The most unprofitable movie of Paramount Pictures</font>

In [220]:
# Get Paramount Pictures movies
movies_by_paramount = data[data.production_companies.str.contains('Paramount Pictures')]
# Get min profit
most_unprofitable_by_paramount = movies_by_paramount[movies_by_paramount.profit == movies_by_paramount.profit.min()]
display(most_unprofitable_by_paramount)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit
925,tt0267626,100000000,35168966,K-19: The Widowmaker,Harrison Ford|Liam Neeson|Peter Sarsgaard|Joss...,Kathryn Bigelow,Fate has found its hero.,When Russia's first nuclear submarine malfunct...,138,Thriller|Drama|History,Paramount Pictures|Intermedia Films|National G...,7/19/2002,6.0,2002,-64831034


In [221]:
answers['18'] = get_movie(most_unprofitable_by_paramount)

# 19. Какой год стал самым успешным по суммарным кассовым сборам?

<font color="blue">What year is the most successful by box-office gains?</font>

In [222]:
most_sum_revenue_year = data.groupby('release_year').revenue.sum().sort_values(ascending=False)
display(most_sum_revenue_year)

release_year
2015    25449202382
2014    23405862953
2013    23213799791
2012    23079001687
2011    22676791872
2010    21071204961
2009    20261791024
2008    18252781990
2007    18162406801
2004    15663430720
2005    15309425558
2006    14775042320
2003    14346123312
2002    14136361487
2001    13017764865
2000    10664099805
Name: revenue, dtype: int64

In [223]:
answers['19'] = most_sum_revenue_year.idxmax()

# 20. Какой самый прибыльный год для студии Warner Bros?

<font color="blue">What is the most profitable year for warner Bros?</font>

In [224]:
# Get Warner Bros movies
movies_by_warner = data[data.production_companies.str.contains('Warner Bros')]
most_profitable_year_by_warner = movies_by_warner.groupby('release_year').profit.sum().sort_values(ascending=False)
display(most_profitable_year_by_warner)

release_year
2014    2295464519
2007    2201675217
2008    2134595031
2010    1974712985
2011    1871393682
2003    1855493377
2009    1822454136
2013    1636453400
2004    1631933725
2005    1551980298
2001    1343545668
2012    1258020056
2002    1022709901
2015     870368348
2006     620170743
2000     452631386
Name: profit, dtype: int64

In [225]:
answers['20'] = most_profitable_year_by_warner.idxmax()

# 21. В каком месяце за все годы суммарно вышло больше всего фильмов?

<font color="blue">In which month for all the years the most films were released in total?</font>

In [226]:
# Get all release dates from movies, gives series of lists [month, day, year]
movies_dates = data.release_date.str.split('/')
# Keep months only and count
movies_months = movies_dates.map(lambda x: int(x[0]))
movies_months_counts = movies_months.value_counts()
display(movies_months_counts)

9     227
12    190
10    186
8     161
3     156
4     149
6     147
11    146
7     142
5     140
2     135
1     110
Name: release_date, dtype: int64

In [227]:
answers['21'] = months[movies_months_counts.idxmax()]

# 22. Сколько суммарно вышло фильмов летом? (за июнь, июль, август)

<font color="blue">How many films were released in the summer in total? (during June, July, August)</font>

In [228]:
# Month stands first in the date
# June == 6, July == 7, August == 8
# Use map and count
movies_summer = data[data.release_date.map(lambda x: x.split('/')[0] in '678')].imdb_id.count()
print(movies_summer)

450


In [229]:
answers['22'] = movies_summer

<font color="blue">OPTION 2</font>

In [230]:
# Using str, e.g. if have to check "unhandy" months :)
data[(data.release_date.str.startswith('6/')) | (data.release_date.str.startswith('7/')) | (data.release_date.str.startswith('8/'))].imdb_id.count()

450

<font color="blue">OPTION 3</font>

In [231]:
# Universal solution to apply any months conditions the right way
summer_movies_counts = movies_months[(movies_months >= 6) & (movies_months <= 8)].value_counts()
display(summer_movies_counts)

8    161
6    147
7    142
Name: release_date, dtype: int64

In [232]:
summer_movies_counts.sum()

450

# ------------------------------------------------------------------------------

# 23. Для какого режиссера зима – самое продуктивное время года?

<font color="blue">For which director the winter is the most productive time of the year?</font>

In [233]:
# Filter movies released in winter
# December == 12, January == 1, February == 2
movies_winter = data[data.release_date.map(lambda x: x.split('/')[0] in '12')]
# Explode by director and group
winter_movies_directors = explode_df(movies_winter, 'director').groupby('director').size().sort_values(ascending=False)
display(winter_movies_directors)

director
Peter Jackson            7
Steven Soderbergh        6
Clint Eastwood           6
Martin Scorsese          4
Adam Shankman            4
                        ..
Matt Bettinelli-Olpin    1
Matt Reeves              1
Matthew O'Callaghan      1
Matthew Vaughn           1
Jon Avnet                1
Length: 358, dtype: int64

In [234]:
answers['23'] = winter_movies_directors.idxmax()

<font color="blue">OPTION 2</font>

In [235]:
# Using separate column, then filter by specific months
data['release_month'] = data.release_date.str.split('/').str[0]
data[(data.release_month == '1') | (data.release_month == '2') | (data.release_month == '12')].groupby('director').size().sort_values(ascending=False)

director
Peter Jackson                          7
Steven Soderbergh                      6
Clint Eastwood                         6
Nancy Meyers                           4
Adam Shankman                          4
                                      ..
Matt Bettinelli-Olpin|Tyler Gillett    1
Matt Reeves                            1
Matthew O'Callaghan                    1
Matthew Vaughn                         1
Aaron Seltzer|Jason Friedberg          1
Length: 332, dtype: int64

# 24. Какая студия дает самые длинные названия своим фильмам по количеству символов?

<font color="blue">Which studio gives the longest titles to its films based on the number of characters?</font>

In [236]:
# Add column with title length
data['title_length'] = data.original_title.str.len()
# Explode and group by studio, then sort by mean title length
mean_title_lengths = explode_df(data,'production_companies').groupby('production_companies').title_length.mean().sort_values(ascending=False)
display(mean_title_lengths)

production_companies
Four By Two Productions       83.0
Jim Henson Company, The       59.0
Dos Corazones                 47.0
Museum Canada Productions     46.0
Polsky Films                  46.0
                              ... 
Everest Entertainment          3.0
Berlanti Productions           3.0
XM2 Productions                2.0
Ixtlan Productions             2.0
Global Entertainment Group     2.0
Name: title_length, Length: 1771, dtype: float64

In [237]:
answers['24'] = mean_title_lengths.idxmax()

# 25. Описание фильмов какой студии в среднем самые длинные по количеству слов?

<font color="blue">Which studio film descriptions are the longest on average in terms of word count?</font>

In [238]:
data['overview_word_count'] = data.overview.str.split().str.len()
mean_overview_words_count = explode_df(data,'production_companies').groupby('production_companies').overview_word_count.mean().sort_values(ascending=False)
display(mean_overview_words_count)

production_companies
Midnight Picture Show                    175.0
Room 9 Entertainment                     161.0
98 MPH Productions                       159.0
Heineken Branded Entertainment           159.0
Brookwell-McNamara Entertainment         156.0
                                         ...  
London Boulevard                          13.0
Phantom Four                              13.0
Henceforth                                13.0
Empire Pictures                           11.0
Motion Picture Corporation of America     11.0
Name: overview_word_count, Length: 1771, dtype: float64

In [239]:
answers['25'] = mean_overview_words_count.idxmax()

# 26. Какие фильмы входят в 1 процент лучших по рейтингу? (по vote_average)

<font color="blue">Which films are at the top 1 percent by vote_average?</font>

In [240]:
data.sort_values(by='vote_average',ascending=False).head(int(len(data)/100))

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month,title_length,overview_word_count
599,tt0468569,185000000,1001921825,The Dark Knight,Christian Bale|Michael Caine|Heath Ledger|Aaro...,Christopher Nolan,Why So Serious?,Batman raises the stakes in his war on crime. ...,152,Drama|Action|Crime|Thriller,DC Comics|Legendary Pictures|Warner Bros.|Syncopy,7/16/2008,8.1,2008,816921825,7,15,67
118,tt0816692,165000000,621752480,Interstellar,Matthew McConaughey|Jessica Chastain|Anne Hath...,Christopher Nolan,Mankind was born on Earth. It was never meant ...,Interstellar chronicles the adventures of a gr...,169,Adventure|Drama|Science Fiction,Paramount Pictures|Legendary Pictures|Warner B...,11/5/2014,8.0,2014,456752480,11,12,35
125,tt2084970,14000000,233555708,The Imitation Game,Benedict Cumberbatch|Keira Knightley|Matthew G...,Morten Tyldum,The true enigma was the man who cracked the code.,Based on the real life story of legendary cryp...,113,History|Drama|Thriller|War,Black Bear Pictures|Bristol Automotive,11/14/2014,8.0,2014,219555708,11,18,46
9,tt2096673,175000000,853708609,Inside Out,Amy Poehler|Phyllis Smith|Richard Kind|Bill Ha...,Pete Docter,Meet the little voices inside your head.,"Growing up can be a bumpy road, and it's no ex...",94,Comedy|Animation|Family,Walt Disney Pictures|Pixar Animation Studios|W...,6/9/2015,8.0,2015,678708609,6,10,112
34,tt3170832,6000000,35401758,Room,Brie Larson|Jacob Tremblay|Joan Allen|Sean Bri...,Lenny Abrahamson,Love knows no boundaries,Jack is a young boy of 5 years old who has liv...,117,Drama|Thriller,Element Pictures|No Trace Camping|A24|Duperele...,10/16/2015,8.0,2015,29401758,10,4,49
1183,tt0993846,100000000,392000694,The Wolf of Wall Street,Leonardo DiCaprio|Jonah Hill|Margot Robbie|Kyl...,Martin Scorsese,EARN. SPEND. PARTY.,A New York stockbroker refuses to cooperate in...,180,Crime|Drama|Comedy,Paramount Pictures|Appian Way|EMJAG Production...,12/25/2013,7.9,2013,292000694,12,23,29
128,tt2267998,61000000,369330363,Gone Girl,Ben Affleck|Rosamund Pike|Carrie Coon|Neil Pat...,David Fincher,You don't know what you've got 'til it's...,With his wife's disappearance having become th...,145,Mystery|Thriller|Drama,Twentieth Century Fox Film Corporation|Regency...,10/1/2014,7.9,2014,308330363,10,9,30
1191,tt2024544,20000000,187000000,12 Years a Slave,Chiwetel Ejiofor|Michael Fassbender|Lupita Nyo...,Steve McQueen,The extraordinary true story of Solomon Northup,"In the pre-Civil War United States, Solomon No...",134,Drama|History,Plan B Entertainment|Regency Enterprises|River...,10/18/2013,7.9,2013,167000000,10,16,61
119,tt2015381,170000000,773312399,Guardians of the Galaxy,Chris Pratt|Zoe Saldana|Dave Bautista|Vin Dies...,James Gunn,All heroes start somewhere.,"Light years from Earth, 26 years after being a...",121,Action|Science Fiction|Adventure,Marvel Studios|Moving Picture Company (MPC)|Bu...,7/30/2014,7.9,2014,603312399,7,23,28
1081,tt0167260,94000000,1118888979,The Lord of the Rings: The Return of the King,Elijah Wood|Ian McKellen|Viggo Mortensen|Liv T...,Peter Jackson,The eye of the enemy is moving.,Aragorn is revealed as the heir to the ancient...,201,Adventure|Fantasy|Action,WingNut Films|New Line Cinema,12/1/2003,7.9,2003,1024888979,12,45,45


In [241]:
# Have to pick up manually this time...
answers['26'] = 'Inside Out, The Dark Knight, 12 Years a Slave'

# 27. Какие актеры чаще всего снимаются в одном фильме вместе?

<font color="blue">What actors are most often filmed together in the same movie?</font>

In [242]:
from itertools import combinations
# Create pairs of actors from cast
actors_pairs = [] 
for cast in data.cast.str.split('|'):
    actors_pairs.extend(combinations(cast, 2))
# Create series from pairs and count
count_actors_pairs = pd.Series(actors_pairs).value_counts()
display(count_actors_pairs)

(Daniel Radcliffe, Emma Watson)        8
(Daniel Radcliffe, Rupert Grint)       8
(Rupert Grint, Emma Watson)            7
(Johnny Depp, Helena Bonham Carter)    6
(Ben Stiller, Owen Wilson)             6
                                      ..
(Eugene Levy, Jonathan Bennett)        1
(Mark Strong, Simon McBurney)          1
(Billy Bob Thornton, Ken Medlock)      1
(Ethan Hawke, Vincent D'Onofrio)       1
(Colin Farrell, Jason Sudeikis)        1
Length: 18121, dtype: int64

In [243]:
# The topmost answer is NOT in the suggested answers... 
answers['27'] = ', '.join(count_actors_pairs.idxmax())
print(answers['27'])
# ...so, have to set it manually again!
answers['27'] = 'Daniel Radcliffe, Rupert Grint'
print(answers['27'])

Daniel Radcliffe, Emma Watson
Daniel Radcliffe, Rupert Grint


# Submission

In [244]:
# Print answers
for n, a in answers.items(): print("{}: {}".format(n, a))

1: Pirates of the Caribbean: On Stranger Tides (tt1298650)
2: Gods and Generals (tt0279111)
3: Winnie the Pooh (tt1449283)
4: 110
5: 107
6: Avatar (tt0499549)
7: The Lone Ranger (tt1210819)
8: 1478
9: The Dark Knight (tt0468569)
10: The Lone Ranger (tt1210819)
11: Drama
12: Drama
13: Peter Jackson
14: Robert Rodriguez
15: Chris Hemsworth
16: Matt Damon
17: Action
18: K-19: The Widowmaker (tt0267626)
19: 2015
20: 2014
21: Сентябрь
22: 450
23: Peter Jackson
24: Four By Two Productions
25: Midnight Picture Show
26: Inside Out, The Dark Knight, 12 Years a Slave
27: Daniel Radcliffe, Rupert Grint


In [245]:
# Count answers for self-control
len(answers)

27