# Probability and Movies
### Looking at data from IMDB

#### Selena Flannery -- November 15, 2016

In [146]:
import pandas as pd
import numpy as np

filename = 'data/movie_metadata.csv'

m = pd.read_csv(filename)
m.movie_title = m.movie_title.str.strip()
m.duration = pd.to_numeric(m.duration)
m.budget = pd.to_numeric(m.budget)
m.gross = pd.to_numeric(m.gross)
m.imdb_score = pd.to_numeric(m.imdb_score)
m.set_index("movie_title", inplace=True)
m.drop(["color", "director_facebook_likes", "actor_3_facebook_likes", "actor_2_name", "actor_1_facebook_likes", "actor_1_name"], axis=1, inplace=True)
m.drop(["cast_total_facebook_likes", "movie_imdb_link", "language", "actor_2_facebook_likes", "aspect_ratio"], axis=1, inplace=True)
m.drop(["actor_3_name", "facenumber_in_poster", "plot_keywords", "country"], axis=1, inplace=True)

In [122]:
m.columns

Index(['director_name', 'num_critic_for_reviews', 'duration', 'gross',
       'genres', 'num_voted_users', 'num_user_for_reviews', 'content_rating',
       'budget', 'title_year', 'imdb_score', 'movie_facebook_likes'],
      dtype='object')

## What is the probability that...

### A movie was longer than an hour and a half?

In [123]:
ninety_min_num_movies = len(m.duration[m.duration > 90.0])
total_num_movies = len(m.index[m.duration != np.nan])
ninety_min_num_movies/total_num_movies

0.8080507634344636

### A movie was longer than two hours?

In [124]:
two_hour_movies = len(m.duration[m.duration > 120.0])
two_hour_movies/total_num_movies

0.2115804084870117

### A movie was directed by Steven Spielberg?

In [125]:
num_movies_directed = len(m.director_name[m.director_name != np.nan])
spiel = len(m.director_name[m.director_name == "Steven Spielberg"])
spiel/num_movies_directed

0.005155661312710688

### A movie directed by Clint Eastwood will gross under budget?

In [140]:
e_movies = m[m.director_name == "Clint Eastwood"]
e_gross_under_budget = len(e_movies[(e_movies.gross != np.nan) & (e_movies.budget != np.nan) & (e_movies.gross < e_movies.budget)])
e_gross_under_budget/len(e_movies.index)

0.35

### A movie generally grossed more than its budget?

In [176]:
movies_with_budget_and_gross = m[(~pd.isnull(m.gross)) & (~pd.isnull(m.budget))]
gross_over_budget = m[(m.gross > m.budget) & (~pd.isnull(m.gross)) & (~pd.isnull(m.budget))]

len(gross_over_budget)/len(movies_with_budget_and_gross)


0.5268568491390389

### A movie grossed over the average gross of this data set?

In [177]:
average_gross = m.gross.mean()
movie_grossed_over_average = len(m.index[(~pd.isnull(m.gross)) & (m.gross > average_gross)])
total_movie_with_gross = len(m.index[~pd.isnull(m.gross)])
movie_grossed_over_average/total_movie_with_gross

0.31882664101947583

## False Positives

###  A movie that was highly-rated but did poorly in the box office (gross < budget)

In [156]:
false_positives = m[(m.imdb_score != np.nan) & (m.gross!= np.nan) & (m.budget != np.nan) & (m.imdb_score >=6) & (m.gross < m.budget)]
false_positives[["imdb_score", "gross", "budget"]]

Unnamed: 0_level_0,imdb_score,gross,budget
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Spectre,6.8,200074175.0,245000000.0
John Carter,6.6,73058679.0,263700000.0
Tangled,7.8,200807262.0,260000000.0
Superman Returns,6.1,200069408.0,209000000.0
Quantum of Solace,6.7,168368427.0,200000000.0
The Lone Ranger,6.5,89289910.0,215000000.0
The Chronicles of Narnia: Prince Caspian,6.6,141614023.0,225000000.0
Pirates of the Caribbean: On Stranger Tides,6.7,241063875.0,250000000.0
Men in Black 3,6.8,179020854.0,225000000.0
Robin Hood,6.7,105219735.0,200000000.0
