In [174]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import linregress

In [175]:
# Relative paths of the two input CSV files (box-office and IMDb movie data)
box_office_csv = "Data/boxoffice.csv"
imdb_movies_csv = "Data/imdb_movies.csv"

In [176]:
# Load each CSV into its own, separate DataFrame
box_office_df = pd.read_csv(filepath_or_buffer=box_office_csv)
imdb_df = pd.read_csv(filepath_or_buffer=imdb_movies_csv)

In [192]:
# Select the subset of columns to keep from each frame; every other column
# is left behind in the original imdb_df / box_office_df.
imdb_keep_cols = [
    'imdb_title_id', 'original_title', 'year', 'date_published',
    'genre', 'duration', 'country', 'language', 'director', 'writer',
    'production_company', 'actors', 'avg_vote', 'votes',
]
box_office_keep_cols = ['title', 'lifetime_gross', 'year']

col_imdb_df = imdb_df[imdb_keep_cols]
col_box_office_df = box_office_df[box_office_keep_cols]


In [193]:
# Give both frames display-style headers. After renaming, the two frames
# share the 'Title' and 'Year' column labels.
imdb_column_names = {
    'imdb_title_id': "IMDB ID",
    'original_title': 'Title',
    'year': 'Year',
    'date_published': 'Date Published',
    'genre': 'Genre',
    'duration': 'Duration',
    'country': 'Country',
    'language': 'Language',
    'director': 'Director',
    'writer': 'Writer',
    'production_company': 'Production Company',
    'actors': 'Actors',
    'avg_vote': 'Avg Rating',
    'votes': '# of Ratings',
}
box_office_column_names = {
    'title': 'Title',
    'lifetime_gross': 'Lifetime Gross ($)',
    'year': 'Year',
}

col_imdb_df = col_imdb_df.rename(columns=imdb_column_names)
col_box_office_df = col_box_office_df.rename(columns=box_office_column_names)

In [216]:
# Inner-join the two frames on the (Title, Year) pair; only rows that have a
# match in both frames survive the merge.
movie_data = col_imdb_df.merge(col_box_office_df, how="inner", on=['Title', 'Year'])

# Render the gross as a "$1,234.56"-style string.
# NOTE: after this step the column holds formatted text, not numbers.
gross_column = 'Lifetime Gross ($)'
gross_as_float = movie_data[gross_column].astype(float)
movie_data[gross_column] = gross_as_float.map("${:,.2f}".format)

In [202]:
# Order the rows by IMDB ID (mutates movie_data in place)
movie_data.sort_values(by="IMDB ID", inplace=True)

In [203]:
# Remove rows whose IMDB ID occurs more than once (mutates movie_data in place).
# NOTE(review): keep=False discards every occurrence of a repeated ID, not just
# the extras -- confirm that dropping such titles entirely is intended.
movie_data.drop_duplicates(subset=["IMDB ID"], keep=False, inplace=True)

In [219]:
# Write the cleaned frame to disk as a CSV file.
# NOTE(review): with the to_csv defaults the row index is written as an unnamed
# first column -- confirm that readers of this file expect it.
movie_data.to_csv(path_or_buf="Data/movie_data_final_2.csv")

In [217]:
# Display the frame ordered by title; sort_values returns a sorted copy here,
# so movie_data itself is left unchanged.
movie_data.sort_values(by='Title')

Unnamed: 0,IMDB ID,Title,Year,Date Published,Genre,Duration,Country,Language,Director,Writer,Production Company,Actors,Avg Rating,# of Ratings,Lifetime Gross ($)
5239,tt0790799,$9.99,2008,2009-09-17,"Animation, Drama, Fantasy",78,"Israel, Australia",English,Tatia Rosenthal,"Etgar Keret, Etgar Keret",Lama Films,"Geoffrey Rush, Anthony LaPaglia, Samuel Johnso...",6.7,3067,"$52,384.00"
960,tt0090557,'Round Midnight,1986,1987-01-09,"Drama, Music",133,"USA, France","English, French",Bertrand Tavernier,"David Rayfiel, Bertrand Tavernier",Little Bear,"Dexter Gordon, François Cluzet, Gabrielle Hake...",7.4,4351,"$3,272,593.00"
5535,tt1022603,(500) Days of Summer,2009,2009-09-02,"Comedy, Drama, Romance",95,USA,"English, French, Swedish",Marc Webb,"Scott Neustadter, Michael H. Weber",Fox Searchlight Pictures,"Joseph Gordon-Levitt, Zooey Deschanel, Geoffre...",7.7,448799,"$32,391,374.00"
5673,tt1132193,(Untitled),2009,2011-06-02,"Comedy, Drama, Music",96,USA,English,Jonathan Parker,"Catherine DiNapoli, Jonathan Parker",Parker Film Company,"Adam Goldberg, Marley Shelton, Eion Bailey, Lu...",6.3,2041,"$230,600.00"
246,tt0078721,10,1979,1980-02-08,"Comedy, Romance",122,USA,English,Blake Edwards,Blake Edwards,Geoffrey Productions,"Dudley Moore, Julie Andrews, Bo Derek, Robert ...",6.1,14286,"$74,865,517.00"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6213,tt1608290,Zoolander 2,2016,2016-02-12,"Action, Comedy",101,"USA, Italy","English, Italian, Spanish",Ben Stiller,"Justin Theroux, Ben Stiller",Panorama Films,"Justin Bieber, Jon Daly, Penélope Cruz, Ben St...",4.7,61163,"$28,848,693.00"
4540,tt0383060,Zoom,2006,2006-10-27,"Action, Adventure, Comedy",83,USA,English,Peter Hewitt,"Adam Rifkin, David Berenbaum",Revolution Studios,"Tim Allen, Courteney Cox, Chevy Chase, Spencer...",4.3,17598,"$11,989,328.00"
488,tt0083365,Zoot Suit,1981,1982-01-01,"Drama, Musical",103,USA,English,Luis Valdez,Luis Valdez,Universal Pictures,"Daniel Valdez, Edward James Olmos, Charles Aid...",6.8,973,"$3,256,082.00"
6963,tt2948356,Zootopia,2016,2016-03-18,"Animation, Adventure, Comedy",108,USA,English,"Byron Howard, Rich Moore","Byron Howard, Rich Moore",Walt Disney Pictures,"Ginnifer Goodwin, Jason Bateman, Idris Elba, J...",8.0,396148,"$341,268,248.00"
