In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import linregress
# `cpi` is a third-party package and must be installed in the kernel's
# environment; the saved output of this cell shows it was missing
# (ModuleNotFoundError) the last time the notebook was run.
import cpi
# NOTE(review): presumably refreshes the package's CPI dataset and may need
# network access — confirm against the cpi documentation.
cpi.update()

ModuleNotFoundError: No module named 'cpi'

In [None]:
# Relative locations of the two raw input files (IMDb metadata, box office)
imdb_movies_csv, box_office_csv = "Data/imdb_movies.csv", "Data/boxoffice.csv"

In [None]:
# Load each CSV into its own separate data frame
imdb_df, box_office_df = (
    pd.read_csv(csv_path) for csv_path in (imdb_movies_csv, box_office_csv)
)

In [None]:
# Keep only the columns that are needed from each raw frame
imdb_columns = [
    'imdb_title_id', 'original_title', 'year', 'date_published', 'genre',
    'duration', 'country', 'language', 'director', 'writer',
    'production_company', 'actors', 'avg_vote', 'votes',
]
box_office_columns = ['title', 'lifetime_gross', 'year']

col_imdb_df = imdb_df[imdb_columns]
col_box_office_df = box_office_df[box_office_columns]


In [None]:
# Give both frames uniform column names; 'Title' and 'Year' end up spelled
# identically in the two frames.
imdb_column_names = {
    'imdb_title_id': "IMDB ID",
    'original_title': 'Title',
    'year': 'Year',
    'date_published': 'Date Published',
    'genre': 'Genre',
    'duration': 'Duration',
    'country': 'Country',
    'language': 'Language',
    'director': 'Director',
    'writer': 'Writer',
    'production_company': 'Production Company',
    'actors': 'Actors',
    'avg_vote': 'Avg Rating',
    'votes': '# of Ratings',
}
box_office_column_names = {
    'title': 'Title',
    'lifetime_gross': 'Lifetime_Gross',
    'year': 'Year',
}

col_imdb_df = col_imdb_df.rename(columns=imdb_column_names)
col_box_office_df = col_box_office_df.rename(columns=box_office_column_names)

In [None]:
# Inner-join the two frames on title and release year
movie_data = pd.merge(col_imdb_df, col_box_office_df, how='inner', on=['Title', 'Year'])

# Discard the years whose data is not available through CPI
years_without_cpi = [1912, 2019, 2020]
rows_without_cpi = movie_data.index[movie_data['Year'].isin(years_without_cpi)]
movie_data = movie_data.drop(index=rows_without_cpi)

In [None]:
# Order by IMDB ID, then remove every row whose IMDB ID occurs more than once
# (keep=False discards all copies of a repeated ID, not just the extras)
movie_data = (
    movie_data
    .sort_values("IMDB ID")
    .drop_duplicates(subset="IMDB ID", keep=False)
)

In [None]:
# Calculating inflation through CPI library
# Source: https://medium.com/analytics-vidhya/adjusting-for-inflation-when-analysing-historical-data-with-python-9d69a8dcbc27
def inflate_column(data, column, year_column='Year'):
    """Return the values of `column` adjusted for inflation via `cpi.inflate`.

    Each value is passed to `cpi.inflate` together with the year found on the
    same row.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the monetary column and the year column.
    column : str
        Name of the column whose values are inflated.
    year_column : str, default 'Year'
        Name of the column holding the year each value is expressed in.

    Returns
    -------
    pandas.Series
        Adjusted values aligned on ``data.index``. A frame with no rows yields
        an empty Series.
    """
    # Pairing the two columns directly avoids building a Series per row and,
    # unlike DataFrame.apply(axis=1), always produces a Series: apply hands
    # back an empty DataFrame when `data` has no rows, which cannot be
    # assigned to a single column.
    adjusted = [
        cpi.inflate(value, year)
        for value, year in zip(data[column], data[year_column])
    ]
    return pd.Series(adjusted, index=data.index, dtype='float64')
    
# Store the inflation-adjusted lifetime gross as a new column
adjusted_gross = inflate_column(movie_data, 'Lifetime_Gross')
movie_data['Lifetime Gross - Adjusted ($)'] = adjusted_gross

In [None]:
# Show floats with thousands separators and two decimals instead of
# scientific notation whenever pandas displays them
pd.options.display.float_format = '{:,.2f}'.format

# Formatting columns as dollar strings (e.g. "$1,234.50").
# NOTE(review): this replaces the numeric values with text, so these columns
# can no longer be used in arithmetic without parsing them back.
as_dollars = "${:,.2f}".format
for money_column in ('Lifetime_Gross', 'Lifetime Gross - Adjusted ($)'):
    movie_data[money_column] = movie_data[money_column].astype(float).map(as_dollars)

In [None]:
# Preview the first five rows of the cleaned frame
movie_data.head(5)

In [None]:
# Write the cleaned DF to a CSV file; the frame's index is written as the
# first, unnamed column.
cleaned_movies_csv = "Data/data_cleaning.csv"
movie_data.to_csv(cleaned_movies_csv)
