In [2]:
import numpy as np
import pandas as pd
from collections import Counter
from itertools import permutations
from itertools import chain
import datetime as dt

In [3]:
# Read the movie dataset (the file is parsed as CSV even though its name ends
# in .xls) and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='movie_bd_v5.xls')
data.head(n=5)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year
0,tt0369610,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,The park is open.,Twenty-two years after the events of Jurassic ...,124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,6/9/2015,6.5,2015
1,tt1392190,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,George Miller,What a Lovely Day.,An apocalyptic story set in the furthest reach...,120,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,5/13/2015,7.1,2015
2,tt2908446,110000000,295238201,Insurgent,Shailene Woodley|Theo James|Kate Winslet|Ansel...,Robert Schwentke,One Choice Can Destroy You,Beatrice Prior must confront her inner demons ...,119,Adventure|Science Fiction|Thriller,Summit Entertainment|Mandeville Films|Red Wago...,3/18/2015,6.3,2015
3,tt2488496,200000000,2068178225,Star Wars: The Force Awakens,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,J.J. Abrams,Every generation has a story.,Thirty years after defeating the Galactic Empi...,136,Action|Adventure|Science Fiction|Fantasy,Lucasfilm|Truenorth Productions|Bad Robot,12/15/2015,7.5,2015
4,tt2820852,190000000,1506249360,Furious 7,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,James Wan,Vengeance Hits Home,Deckard Shaw seeks revenge against Dominic Tor...,137,Action|Crime|Thriller,Universal Pictures|Original Film|Media Rights ...,4/1/2015,7.3,2015


In [4]:
# Derived columns used by the questions below.

# profit: revenue minus budget
data['profit'] = data['revenue'] - data['budget']

# release_date: converted from text to datetime values
data['release_date'] = pd.to_datetime(data['release_date'])

# month: calendar month number taken from the parsed release date
data['month'] = data['release_date'].dt.month

# independent working copy for cells that reshape the frame
data2 = data.copy()

### 1. Which movie has the largest budget?

In [5]:
# Title(s) of the row(s) whose budget equals the dataset maximum.
data.loc[data['budget'] == data['budget'].max(), 'original_title']

723    Pirates of the Caribbean: On Stranger Tides
Name: original_title, dtype: object

### 2. Which movie is the longest? (in minutes)

In [6]:
# Title(s) of the row(s) whose runtime equals the dataset maximum.
is_longest = data['runtime'].eq(data['runtime'].max())
data.loc[is_longest, 'original_title']

1157    Gods and Generals
Name: original_title, dtype: object

### 3. Which movie is the shortest? (in minutes)

In [8]:
# Select every title whose runtime equals the minimum, the same way the
# longest movie is selected with runtime.max(). Unlike sort_values + head(1),
# all tied titles are reported and the answer does not depend on how the sort
# orders rows with equal runtimes.
data[data.runtime == data.runtime.min()].original_title

768    Winnie the Pooh
Name: original_title, dtype: object

### 4. What is the average movie length? 

In [9]:
# Mean runtime, rounded to the nearest whole number.
mean_runtime = data['runtime'].mean()
round(mean_runtime)

110

### 5. What is the median movie length?

In [10]:
# The 0.5 quantile of runtime, i.e. its median.
data['runtime'].median()

107.0

### 6. What is the most profitable movie?

In [12]:
# Title(s) of the row(s) whose profit equals the dataset maximum.
is_top_profit = data['profit'] == data['profit'].max()
data.loc[is_top_profit, 'original_title']

239    Avatar
Name: original_title, dtype: object

### 7. Which movie is the most unprofitable?

In [13]:
# Title(s) of the row(s) whose profit equals the dataset minimum.
data.loc[data['profit'].eq(data['profit'].min()), 'original_title']

1245    The Lone Ranger
Name: original_title, dtype: object

### 8. How many movies in the dataset earned more than their budget?

In [14]:
# Count the non-null imdb_id values among rows where revenue exceeds budget.
data.loc[data['revenue'] > data['budget'], 'imdb_id'].count()

1478

OR

In [15]:
# Same count expressed through DataFrame.query: rows where revenue exceeds
# budget, counted by their non-null imdb_id values.
data.query('revenue > budget')['imdb_id'].count()

1478

### 9. Which movie had the highest revenue in 2008?

In [16]:
# Work on the 2008 releases only. Comparing the 2008 maximum against the whole
# dataset would also return any film from another year that happens to have
# the same revenue.
movies_2008 = data[data['release_year'] == 2008]
max_release = movies_2008['revenue'].max()
movies_2008[movies_2008['revenue'] == max_release].original_title

599    The Dark Knight
Name: original_title, dtype: object

### 10. The most unprofitable movie from 2012 to 2014 (inclusive)?

In [20]:
# Keep the 2012-2014 releases, order them by ascending profit, and show the
# title of the first (lowest-profit) row.
released_2012_2014 = data[data['release_year'].isin([2012, 2013, 2014])]
(
    released_2012_2014
    .sort_values(by='profit', ascending=True)
    .original_title
    .head(1)
)

1245    The Lone Ranger
Name: original_title, dtype: object

### 11. The most frequent genre of movies in the dataset?

In [21]:
# Split the pipe-separated genre strings and count every genre occurrence.
# chain.from_iterable streams the per-movie lists into Counter; summing the
# lists with Series.sum() builds a new list for every row and yields 0 (which
# Counter cannot consume) when the Series is empty.
genre_lists = data2.genres.str.split('|')
Counter(chain.from_iterable(genre_lists)).most_common(1)

[('Drama', 782)]

OR


In [22]:
# Turn the pipe-separated genres into one row per (movie, genre) pair, then
# show the genre with the highest row count. value_counts() already orders
# counts from highest to lowest.
data2 = data2.assign(genres=data2.genres.str.split('|')).explode('genres')
data2.genres.value_counts().head(1)

Drama    782
Name: genres, dtype: int64

### 12. Movies of which genre are most often profitable?

In [23]:
# Count genre occurrences among movies with positive profit.
# Read from `data` rather than `data2`: the shape of `data2` depends on whether
# the optional explode cell above has been run, while `data` always holds one
# row per movie with pipe-separated genres. The resulting counts are the same.
profitable_genres = data.loc[data['profit'] > 0, 'genres'].str.split('|')
Counter(chain.from_iterable(profitable_genres)).most_common(1)

[('Drama', 560)]

### 13. Which director has the highest gross box office receipts?

In [25]:
# One row per (movie, director): a movie listing several pipe-separated
# directors contributes its full revenue to each of them.
data2 = (
    data
    .assign(director=data['director'].str.split('|'))
    .explode('director')
)
revenue_by_director = data2.groupby(by='director')['revenue'].sum()
revenue_by_director.sort_values(ascending=False).head(1)

director
Peter Jackson    6490593685
Name: revenue, dtype: int64

### 14. Which director filmed Action movies the most?

In [26]:
# Movies whose genre string contains 'Action', expanded to one row per
# director, then the director appearing most often.
data3 = (
    data[data['genres'].str.contains('Action')]
    .assign(director=lambda frame: frame['director'].str.split('|'))
    .explode('director')
)
data3['director'].value_counts().head(1)

Robert Rodriguez    9
Name: director, dtype: int64

### 15. Movies starring which actor had the highest total box office receipts in 2012?

In [27]:
# 2012 releases expanded to one row per cast member; each actor is credited
# with the full revenue of every movie they appear in.
data3 = (
    data[data['release_year'] == 2012]
    .assign(cast=lambda frame: frame['cast'].str.split('|'))
    .explode('cast')
)
revenue_by_actor = data3.groupby('cast')['revenue'].sum()
revenue_by_actor.sort_values(ascending=False).head(1)

cast
Chris Hemsworth    2027450773
Name: revenue, dtype: int64

### 16. Which actor has starred in the most big-budget movies (budget above the dataset mean)?

In [28]:
# "Big-budget" here means a budget strictly above the mean budget of the
# dataset. Expand those movies to one row per cast member and show the actor
# appearing most often.
above_mean_budget = data['budget'] > data['budget'].mean()
data3 = (
    data[above_mean_budget]
    .assign(cast=lambda frame: frame['cast'].str.split('|'))
    .explode('cast')
)
data3['cast'].value_counts().head(1)

Matt Damon    18
Name: cast, dtype: int64

### 17. What genre has Nicolas Cage starred in the most?

In [29]:
# Movies whose cast string contains 'Nicolas Cage', expanded to one row per
# genre, then the genre appearing most often.
data3 = (
    data[data['cast'].str.contains('Nicolas Cage')]
    .assign(genres=lambda frame: frame['genres'].str.split('|'))
    .explode('genres')
)
data3['genres'].value_counts().head(1)

Action    17
Name: genres, dtype: int64

### 18. The most unprofitable film from Paramount Pictures?

In [30]:
# Movies whose production-company string contains 'Paramount Pictures', then
# the title(s) with the lowest profit among them.
is_paramount = data['production_companies'].str.contains('Paramount Pictures')
data3 = data.loc[is_paramount].copy()
data3.loc[data3['profit'] == data3['profit'].min(), 'original_title']

925    K-19: The Widowmaker
Name: original_title, dtype: object

### 19. Which year was the most successful in terms of cumulative box office receipts?

In [31]:
# Total revenue per release year; show the year with the largest total.
revenue_by_year = data.groupby('release_year')['revenue'].sum()
revenue_by_year.sort_values(ascending=False).head(1)

release_year
2015    25449202382
Name: revenue, dtype: int64

### 20. Which year was the most profitable for Warner Bros?

In [32]:
# Movies whose production-company string contains 'Warner Bros'; sum their
# profit per release year and show the year with the largest total.
is_warner = data['production_companies'].str.contains('Warner Bros')
data3 = data.loc[is_warner].copy()
profit_by_year = data3.groupby('release_year')['profit'].sum()
profit_by_year.sort_values(ascending=False).head(1)

release_year
2014    2295464519
Name: profit, dtype: int64

### 21. In which month were the most films released, summed over all years?

In [33]:
# Number of releases per calendar month; the first entry is the busiest month.
releases_per_month = data['release_date'].dt.month.value_counts()
releases_per_month.iloc[:1]

9    227
Name: release_date, dtype: int64

### 22. How many movies were released in the summer in total? (for June, July, August)

In [34]:
# Recompute the month column so this cell does not rely on earlier state.
data['month'] = pd.DatetimeIndex(data['release_date']).month

# The month column holds integers, so test membership against integers.
# String literals such as '6' match only if pandas coerces them to numbers,
# which should not be relied on.
summer_months = [6, 7, 8]
data[data['month'].isin(summer_months)].month.count()

450

### 23. For which director is winter the most productive time of the year?

In [35]:
# Winter releases: December, January, February. The month column holds
# integers, so the membership test uses integers; string literals such as '12'
# match only if pandas coerces them to numbers, which should not be relied on.
winter_months = [1, 2, 12]
winter = data[data['month'].isin(winter_months)].copy()

# One row per (movie, director) for pipe-separated director lists.
winter.director = winter.director.str.split('|')
winter = winter.explode('director')

# value_counts() already orders counts from highest to lowest, so no further
# sorting is needed before taking the top entry.
winter.director.value_counts().head(1)

Peter Jackson    7
Name: director, dtype: int64

### 24. Which studio gives the longest titles to its films in terms of the number of characters?

In [37]:
# One row per (movie, production company), with the character length of the
# movie title attached; then the company with the highest mean title length.
data3 = (
    data
    .assign(production_companies=data['production_companies'].str.split('|'))
    .explode('production_companies')
    .assign(title_len=lambda frame: frame['original_title'].map(len))
)
mean_title_len = data3.groupby('production_companies')['title_len'].mean()
mean_title_len.sort_values(ascending=False).head(1)

production_companies
Four By Two Productions    83.0
Name: title_len, dtype: float64

### 25. Which studio's film descriptions are on average the longest in terms of word count?

In [38]:
# Uses data3 as left by the previous cell (one row per movie and production
# company). The word count is the number of pieces obtained by splitting the
# overview on single spaces.
overview_pieces = data3['overview'].str.split(' ')
data3 = data3.assign(nwords=overview_pieces.apply(len))
mean_words = data3.groupby('production_companies')['nwords'].mean()
mean_words.sort_values(ascending=False).head(1)

production_companies
Midnight Picture Show    175.0
Name: nwords, dtype: float64

### 26. Which films are in the top 1 percent of ratings?

In [39]:
# 99th percentile of the ratings; keep rows rated strictly above it.
one_percent = np.percentile(data['vote_average'], 99)
data.loc[data['vote_average'] > one_percent]

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,month
9,tt2096673,175000000,853708609,Inside Out,Amy Poehler|Phyllis Smith|Richard Kind|Bill Ha...,Pete Docter,Meet the little voices inside your head.,"Growing up can be a bumpy road, and it's no ex...",94,Comedy|Animation|Family,Walt Disney Pictures|Pixar Animation Studios|W...,2015-06-09,8.0,2015,678708609,6
34,tt3170832,6000000,35401758,Room,Brie Larson|Jacob Tremblay|Joan Allen|Sean Bri...,Lenny Abrahamson,Love knows no boundaries,Jack is a young boy of 5 years old who has liv...,117,Drama|Thriller,Element Pictures|No Trace Camping|A24|Duperele...,2015-10-16,8.0,2015,29401758,10
118,tt0816692,165000000,621752480,Interstellar,Matthew McConaughey|Jessica Chastain|Anne Hath...,Christopher Nolan,Mankind was born on Earth. It was never meant ...,Interstellar chronicles the adventures of a gr...,169,Adventure|Drama|Science Fiction,Paramount Pictures|Legendary Pictures|Warner B...,2014-11-05,8.0,2014,456752480,11
119,tt2015381,170000000,773312399,Guardians of the Galaxy,Chris Pratt|Zoe Saldana|Dave Bautista|Vin Dies...,James Gunn,All heroes start somewhere.,"Light years from Earth, 26 years after being a...",121,Action|Science Fiction|Adventure,Marvel Studios|Moving Picture Company (MPC)|Bu...,2014-07-30,7.9,2014,603312399,7
125,tt2084970,14000000,233555708,The Imitation Game,Benedict Cumberbatch|Keira Knightley|Matthew G...,Morten Tyldum,The true enigma was the man who cracked the code.,Based on the real life story of legendary cryp...,113,History|Drama|Thriller|War,Black Bear Pictures|Bristol Automotive,2014-11-14,8.0,2014,219555708,11
128,tt2267998,61000000,369330363,Gone Girl,Ben Affleck|Rosamund Pike|Carrie Coon|Neil Pat...,David Fincher,You don't know what you've got 'til it's...,With his wife's disappearance having become th...,145,Mystery|Thriller|Drama,Twentieth Century Fox Film Corporation|Regency...,2014-10-01,7.9,2014,308330363,10
138,tt2278388,30000000,174600318,The Grand Budapest Hotel,Ralph Fiennes|Tony Revolori|F. Murray Abraham|...,Wes Anderson,A perfect holiday without leaving home.,The Grand Budapest Hotel tells of a legendary ...,99,Comedy|Drama,Fox Searchlight Pictures|Scott Rudin Productio...,2014-02-26,7.9,2014,144600318,2
370,tt1375666,160000000,825500000,Inception,Leonardo DiCaprio|Joseph Gordon-Levitt|Ellen P...,Christopher Nolan,Your mind is the scene of the crime.,"Cobb, a skilled thief who commits corporate es...",148,Action|Thriller|Science Fiction|Mystery|Adventure,Legendary Pictures|Warner Bros.|Syncopy,2010-07-14,7.9,2010,665500000,7
599,tt0468569,185000000,1001921825,The Dark Knight,Christian Bale|Michael Caine|Heath Ledger|Aaro...,Christopher Nolan,Why So Serious?,Batman raises the stakes in his war on crime. ...,152,Drama|Action|Crime|Thriller,DC Comics|Legendary Pictures|Warner Bros.|Syncopy,2008-07-16,8.1,2008,816921825,7
872,tt0253474,35000000,120072577,The Pianist,Adrien Brody|Thomas Kretschmann|Frank Finlay|M...,Roman Polanski,Music was his passion. Survival was his master...,The Pianist is a film adapted from the biograp...,150,Drama|War,Bac Films|Canal+Polska|Heritage Films|Studio B...,2002-09-24,7.9,2002,85072577,9


### 27. Which pair of actors starred together more often than any other?

In [None]:
# For every movie build all ordered pairs of cast members. permutations()
# yields both (a, b) and (b, a), so a pair is counted the same way regardless
# of the order in which the two names are listed in a movie's cast.
data2 = data.copy()
cast_lists = data2['cast'].str.split('|')
data2['cast'] = cast_lists.apply(lambda names: list(permutations(names, 2)))

# Flatten the per-movie pair lists into one list and count each ordered pair.
actor_list = data2['cast'].tolist()
merged = list(chain.from_iterable(actor_list))
Counter(merged).most_common(1)

[(('Daniel Radcliffe', 'Rupert Grint'), 8)]