In [3]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import linregress

In [4]:
# Relative paths to the two raw input CSV files read by the next cell
imdb_movies_csv, box_office_csv = "Data/imdb_movies.csv", "Data/boxoffice.csv"

In [5]:
# Load each raw CSV into its own separate DataFrame
imdb_df = pd.read_csv(filepath_or_buffer=imdb_movies_csv)
box_office_df = pd.read_csv(filepath_or_buffer=box_office_csv)

In [6]:
# Keep only the columns that are carried forward; every other column in the
# raw frames is left behind. A missing label raises KeyError.
col_imdb_df = imdb_df.loc[
    :,
    [
        'imdb_title_id',
        'original_title',
        'year',
        'date_published',
        'genre',
        'duration',
        'country',
        'language',
        'director',
        'writer',
        'production_company',
        'actors',
        'avg_vote',
        'votes',
    ],
]

col_box_office_df = box_office_df.loc[:, ['title', 'lifetime_gross']]


In [7]:
# Give both frames uniform, human-readable column headers.
# Both frames end up with a 'Title' column, which the merge cell joins on.
col_imdb_df = col_imdb_df.rename(
    {
        'imdb_title_id': "IMDB ID",
        'original_title': 'Title',
        'year': 'Year',
        'date_published': 'Date Published',
        'genre': 'Genre',
        'duration': 'Duration',
        'country': 'Country',
        'language': 'Language',
        'director': 'Director',
        'writer': 'Writer',
        'production_company': 'Production Company',
        'actors': 'Actors',
        'avg_vote': 'Avg Rating',
        'votes': '# of Ratings',
    },
    axis="columns",
)

col_box_office_df = col_box_office_df.rename(
    {
        'title': 'Title',
        'lifetime_gross': 'Lifetime Gross ($)',
    },
    axis="columns",
)

In [8]:
# Inner-join the IMDb rows with the box-office rows on their shared 'Title' column.
# NOTE(review): the join key is the title alone; if a title occurs more than once
# in either frame the matching rows are multiplied - confirm this is acceptable.
movie_data = col_imdb_df.merge(col_box_office_df, how='inner', on='Title')

# Render the gross as a currency string such as "$1,234.00".
# After this step the column holds strings, not numbers.
movie_data = movie_data.assign(
    **{
        'Lifetime Gross ($)': lambda merged: merged['Lifetime Gross ($)']
        .astype(float)
        .map("${:,.2f}".format)
    }
)

In [9]:
# Persist the cleaned, merged frame to disk.
# The row index is written as the first (unnamed) column because `index` is left at its default.
movie_data.to_csv(path_or_buf="Data/movie_data_final.csv")