### Exploratory Data Analysis for US Movies from 2000 to 2001

#### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os, time,json
# Relative directory that holds the notebook's data files.
FOLDER = "Data/"
# Create the directory if needed; exist_ok=True avoids an error when it already exists.
os.makedirs(FOLDER, exist_ok=True)
# Last expression of the cell: the directory listing is shown as the cell output.
os.listdir(FOLDER)

['.ipynb_checkpoints',
 'final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'title_akas1.csv.gz',
 'title_basics1.csv.gz',
 'title_ratings1.csv.gz',
 'tmdb_api_results_2000.json',
 'tmdb_api_results_2001.json']

#### Import dataframes

In [2]:
# Load the year-2000 TMDB extract. The path is built from FOLDER so the data
# directory is defined in one place instead of repeating the "Data/" literal.
# .iloc[1:] discards the first record of the file.
# NOTE(review): the first row is presumably a placeholder entry - confirm against the file.
df_2000 = pd.read_csv(os.path.join(FOLDER, 'final_tmdb_data_2000.csv.gz')).iloc[1:]

In [3]:
# Load the year-2001 TMDB extract. The path is built from FOLDER so the data
# directory is defined in one place instead of repeating the "Data/" literal.
# .iloc[1:] discards the first record of the file.
# NOTE(review): the first row is presumably a placeholder entry - confirm against the file.
df_2001 = pd.read_csv(os.path.join(FOLDER, 'final_tmdb_data_2001.csv.gz')).iloc[1:]

#### Combining dataframes

In [4]:
# Stack the two yearly frames into one; ignore_index=True renumbers the rows
# from 0 instead of keeping each frame's own index labels.
tmdb_results_combined = pd.concat([df_2000, df_2001], ignore_index=True)

#### Review dataframe

In [5]:
# Preview the first rows of the combined frame.
tmdb_results_combined.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.5,22.0,
1,tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,5.1,8.0,
2,tt0116391,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869.0,hi,Gang,...,0.0,152.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,0.0,4.0,1.0,
3,tt0118694,0.0,/n4GJFGzsc7NinI1VeGDXIcQjtU2.jpg,"{'id': 1131062, 'name': 'Wong Kar-Wai’s Love T...",150000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,843.0,cn,花樣年華,...,14204632.0,99.0,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,"Feel the heat, keep the feeling burning, let t...",In the Mood for Love,0.0,8.11,2235.0,PG
4,tt0118852,0.0,/vceiGZ3uavAEHlTA7v0GjQsGVKe.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}]",,49511.0,en,Chinese Coffee,...,0.0,99.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,There's a fine line between friendship and bet...,Chinese Coffee,0.0,6.8,50.0,R


#### Finding Null Values

In [6]:
# Count the missing (NaN) entries in each column.
tmdb_results_combined.isna().sum()

imdb_id                     0
adult                       0
backdrop_path            1172
belongs_to_collection    2383
budget                      0
genres                      0
homepage                 2420
id                          0
original_language           0
original_title              0
overview                   48
popularity                  0
poster_path               257
production_companies        0
production_countries        0
release_date               24
revenue                     0
runtime                     0
spoken_languages            0
status                      0
tagline                  1480
title                       0
video                       0
vote_average                0
vote_count                  0
certification            1776
dtype: int64

In [7]:
# Display the percentage of missing values in each column
# (NaN count per column divided by the total number of rows).
print(tmdb_results_combined.isna().sum()/len(tmdb_results_combined)*100)

imdb_id                   0.000000
adult                     0.000000
backdrop_path            45.233501
belongs_to_collection    91.972212
budget                    0.000000
genres                    0.000000
homepage                 93.400232
id                        0.000000
original_language         0.000000
original_title            0.000000
overview                  1.852567
popularity                0.000000
poster_path               9.918950
production_companies      0.000000
production_countries      0.000000
release_date              0.926283
revenue                   0.000000
runtime                   0.000000
spoken_languages          0.000000
status                    0.000000
tagline                  57.120803
title                     0.000000
video                     0.000000
vote_average              0.000000
vote_count                0.000000
certification            68.544963
dtype: float64


#### Dropping Columns with More Than 80% Missing Values

In [8]:
# Remove the two columns that are missing in more than 80% of the rows
# (belongs_to_collection ~92%, homepage ~93% in the percentages above).
# The result is rebound instead of using inplace=True, and errors='ignore'
# lets the cell be re-run without a KeyError once the columns are already gone.
tmdb_results_combined = tmdb_results_combined.drop(
    columns=['belongs_to_collection', 'homepage'],
    errors='ignore',
)

#### Replace remaining NaN values with a placeholder

In [9]:
# Replace every remaining NaN with the string 'Missing'; fillna returns a new
# frame, which is rebound to the same name.
# NOTE(review): release_date also receives 'Missing' (it had NaNs in the count
# above), so any later date parsing has to account for that value.
tmdb_results_combined = tmdb_results_combined.fillna('Missing')

In [10]:
# Re-check the per-column NaN counts to confirm that nothing is left unfilled.
tmdb_results_combined.isna().sum()

imdb_id                 0
adult                   0
backdrop_path           0
budget                  0
genres                  0
id                      0
original_language       0
original_title          0
overview                0
popularity              0
poster_path             0
production_companies    0
production_countries    0
release_date            0
revenue                 0
runtime                 0
spoken_languages        0
status                  0
tagline                 0
title                   0
video                   0
vote_average            0
vote_count              0
certification           0
dtype: int64

* Data cleaned

#### Saving dataframe into file

In [11]:
# Persist the cleaned frame as a gzip-compressed CSV inside the data folder.
# The path is built from FOLDER rather than a repeated "Data/" literal, and
# index=False keeps the row index out of the file.
tmdb_results_combined.to_csv(
    os.path.join(FOLDER, "tmdb_results_combined.csv.gz"),
    compression='gzip',
    index=False,
)

#### Films with a value greater than 0 for both budget and revenue

In [12]:
# Keep only the films that report both a positive budget and a positive revenue.
has_budget = tmdb_results_combined['budget'].gt(0)
has_revenue = tmdb_results_combined['revenue'].gt(0)
cash = tmdb_results_combined.loc[has_budget & has_revenue]



#### Number of Movies

In [13]:
# Number of non-null titles in the filtered frame, i.e. how many films were kept.
print(cash['title'].count())

355


#### Names of Movies

In [14]:
# Print every title with its row label; to_string() renders the whole Series
# rather than pandas' shortened default display.
print(cash['title'].to_string())

3                                    In the Mood for Love
10                                                 Vulgar
12                                            Chicken Run
15                               The Million Dollar Hotel
16                                 Mission: Impossible II
17                                                  X-Men
18                                             Titan A.E.
19                               The Emperor's New Groove
21                                           Return to Me
24                                        Waking the Dead
26                                               Dinosaur
27                   The Adventures of Rocky & Bullwinkle
31                                               Scream 3
33                                            Pitch Black
34                                              Supernova
38                                  The Road to El Dorado
44                                                  U-571
47            

#### How many movies are there in each certification category (G/PG/PG-13/R), and what are their average revenue and budget?

In [15]:
# Certification categories to summarise, in the order they are reported.
ratings = ['G', 'PG', 'PG-13', 'R']

for rating in ratings:
    # Films from the budget/revenue-filtered frame with exactly this certification.
    rated_films = cash.loc[cash['certification'] == rating]

    # Label/value pairs, printed one per line in this order.
    summary_lines = (
        ("Rating:", rating),
        ("Number of films:", len(rated_films)),
        ("Average revenue:", rated_films['revenue'].mean().round()),
        ("Average budget:", rated_films['budget'].mean().round()),
    )
    for label, value in summary_lines:
        print(label, value)

    # Blank line between categories.
    print()


Rating: G
Number of films: 13
Average revenue: 133216882.0
Average budget: 44000000.0

Rating: PG
Number of films: 30
Average revenue: 129125593.0
Average budget: 51974908.0

Rating: PG-13
Number of films: 117
Average revenue: 111101802.0
Average budget: 46627328.0

Rating: R
Number of films: 143
Average revenue: 52061243.0
Average budget: 26724574.0

