In [1]:
# Necessary imports
import pandas as pd
import seaborn as sns
import numpy as np
# We want all columns displayed, setting output to 50 to be safe
pd.set_option('display.max_columns',100)
import os, time, json
import tmdbsimple as tmdb
from tqdm.notebook import tqdm_notebook

In [2]:
with open('/Users/Rovidicus/.secret/tmdb_api.json','r') as f:
    login=json.load(f)
login.keys()

dict_keys(['API Key', 'Access Token'])

In [3]:
import tmdbsimple as tmdb
tmdb.API_KEY = login['API Key']

In [4]:
basics=pd.read_csv('Data/basics.csv')
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


In [5]:
FOLDER="Data/Hypothesis_Testing/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['.ipynb_checkpoints',
 'final_tmdb_data_2020.csv.gz',
 'final_tmdb_data_2021.csv.gz',
 'final_tmdb_data_2022.csv.gz',
 'tmdb_api_results_2020.json',
 'tmdb_api_results_2021.json',
 'tmdb_api_results_2022.json',
 'tmdb_results_postcovid.csv.gz']

In [6]:
def get_movie_with_rating(movie_id):
    movie = tmdb.Movies(movie_id)
    
    movie_info = movie.info()
    releases = movie.releases()
    
    for c in releases['countries']:
        if c['iso_3166_1' ] =='US':
            movie_info['certification'] = c['certification']
    return movie_info


def write_json(new_data, filename): 
    """Appends a list of records (new_data) to a json file (filename). 
    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""  
    
    with open(filename,'r+') as file:
        file_data = json.load(file)
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        file.seek(0)
        json.dump(file_data, file)

In [7]:
# We will retreive movie data on years from the pandemic to present
YEARS_TO_GET=list(range(2020, 2023))

In [None]:
# Start of OUTER loop
 for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):
    JSON_FILE_YEAR = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    file_exists = os.path.isfile(JSON_FILE_YEAR)

    if file_exists == False:
        print(f'Creating{JSON_FILE_YEAR} for API results for year = {YEAR}.')
        with open (JSON_FILE_YEAR, 'w') as f:
            json.dump([{'imdb_id':0}],f)

    else:
        print(f'The file {JSON_FILE_YEAR} already exists.')

    df_year = basics.loc[basics['startYear'] == YEAR].copy()
    movie_ids = df_year['tconst']
        
        # Create an empty list to store movie information
    movie_info_list = []
    errors=[]    
        # Iterate over movie IDs for the current year
    for movie_id in tqdm_notebook(movie_ids, f'Movies from {YEAR}'):
        try:
            temp = get_movie_with_rating(movie_id)
            movie_info_list.append(temp)
            time.sleep(0.02)
        except Exception as e:
            errors.append([movie_id, e])
        
        # Write the movie information to the JSON file
    write_json(movie_info_list, JSON_FILE_YEAR)

    final_year_df2 = pd.read_json(JSON_FILE_YEAR)
    csv_fname = f'{FOLDER}final_tmdb_data_{YEAR}.csv.gz'
    final_year_df2.to_csv(csv_fname, compression='gzip', index=False)

YEARS:   0%|          | 0/3 [00:00<?, ?it/s]

The file Data/Hypothesis_Testing/tmdb_api_results_2020.json already exists.


Movies from 2020:   0%|          | 0/5010 [00:00<?, ?it/s]

In [None]:
import glob
# Use glob to get all filepaths that match the pattern (*=wildcard)
tmdb_files = sorted(glob.glob("Data/Hypothesis_Testing/final_tmdb_data*.csv.gz"))
tmdb_files

In [None]:
# Use read_csv in a list comprehension and combine with concat to load all files
df = pd.concat([pd.read_csv(f) for f in tmdb_files] )
df.head(10)

In [None]:
#Save final merged csv as 'tmdb_results_combined.csv.gz'
fname = f'{FOLDER}tmdb_results_postcovid.csv.gz'
df.to_csv(fname, compression='gzip',index=False)

1. Does the MPAA rating of a movie (G/PG/PG-13/R) affect how much revenue the movie generates?

In [None]:
# We are dealing with several variables and a numeric problem (revenue)
# An analysis of variance (ANOVA) will be useful here
import matplotlib.pyplot as plt
import scipy.stats as stats

In [None]:
# Let's visualize our revenues per genre
sns.barplot(df, x = 'certification', y = 'revenue');

In [None]:
df['certification'].value_counts()

In [None]:
nn = df.copy()

In [None]:
nn = nn.dropna(subset=['certification', 'revenue'])
nn.head(3)

In [None]:
## Each group needs to be separated to be analyzed.
groups = {}
## Loop through all unique categories
for i in nn['certification'].unique():
    ## Get series for group and rename
    data = nn.loc[nn['certification']==i,'revenue'].copy()
    
    # save into the dictionary
    groups[i] = data
groups.keys()

In [None]:
## Running normal test on each group and confirming there are >20 in each group
norm_results = {}
for i, data in groups.items():
    stat, p = stats.normaltest(data)
    ## save the p val, test statistic, and the size of the group
    norm_results[i] = {'n': len(data),
                             'p':p,
                             'test stat':stat,}
## convert to a dataframe
norm_results_df = pd.DataFrame(norm_results).T
norm_results_df

In [None]:
### checking sig with pandas 
norm_results_df['sig'] = norm_results_df['p'] < .05 
norm_results_df

In [None]:
# perform the correct hypothesis test
result = stats.f_oneway( *groups.values())
result

### With ANOVA giving us a significant result, we can use tukey to perform comparison tests for each group

In [None]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd
## save the values as kg_lost and the labels to the Diet
values = nn['revenue']
labels = nn['certification']

In [None]:
## perform tukey's multiple comparison test and display the summary
tukeys_results = pairwise_tukeyhsd(values,labels)
tukeys_results.summary()

##### We can see several significant results between MPAA ratings like PG-13 outperforming R, PG, NR and G. 
##### Not Rated trends poorly against PG and PG-13. While NC-17 doesn't have good revenue, the differences are not considered significant

2. Do movies that are over 2.5 hours long earn more revenue than movies that are 1.5 hours long (or less)?

In [None]:
# Visualizing revenue by runtime
sns.scatterplot(df, x='runtime', y='revenue');

In [None]:
# Filtering films into dfs of long and short movies
long_df = df.loc[df['runtime'] >= 150].copy()
short_df = df.loc[df['runtime']<= 90].copy()

In [None]:
# Now defining revenue dfs
long_r = long_df['revenue']
short_r = short_df['revenue']

In [None]:
# Check for outliers in long film revenue
zscores= stats.zscore(long_r)
outliers = abs(zscores)>3
np.sum(outliers)

In [None]:
# remove outliers from strength group
long_r = long_r[(np.abs(stats.zscore(long_r)) < 3)]

In [None]:
# Checking same for short films
zscores= stats.zscore(short_r)
np.sum(outliers)

In [None]:
# remove outliers
short_r = short_r[(np.abs(stats.zscore(short_r)) < 3)]

In [None]:
# Test for equal variance
result = stats.levene(long_r, short_r)
result

In [None]:
# Independent t-test with equal_var set to False
result = stats.ttest_ind(long_r, short_r, equal_var = False)
result

##### With pvalue far less than 0.05 we have a significant result and can reject null

3. Do different certifications trend to different popularity numbers?

In [None]:
# Making barplot to get a glimpse at correlation
sns.barplot(nn, x='certification', y='popularity');

In [None]:
# There does seem to be correlations though how significant will have to be determined
# It's another ANOVA problem with certifications so we can use our copied df to keep nulls out
groups = {}
## Loop through all unique categories
for i in nn['certification'].unique():
    ## Get series for group and rename
    data = nn.loc[nn['certification']==i,'popularity'].copy()
    
    # save into the dictionary
    groups[i] = data
groups.keys()

In [None]:
## Running normal test on each group and confirming there are >20 in each group
norm_results = {}
for i, data in groups.items():
    stat, p = stats.normaltest(data)
    ## save the p val, test statistic, and the size of the group
    norm_results[i] = {'n': len(data),
                             'p':p,
                             'test stat':stat,}
## convert to a dataframe
norm_results_df = pd.DataFrame(norm_results).T
norm_results_df

In [None]:
### checking sig with pandas 
norm_results_df['sig'] = norm_results_df['p'] < .05 
norm_results_df

In [None]:
# perform the correct hypothesis test
result = stats.f_oneway( *groups.values())
result

In [None]:
# Definitely significant.
values = nn['popularity']
# labels is same as in first problem

In [None]:
## perform tukey's multiple comparison test and display the summary
tukeys_results = pairwise_tukeyhsd(values,labels)
tukeys_results.summary()

##### That's a lot to unpack. PG has a significant popularity boost over G, NC-17, and NR. PG-13 enjoys popularity over NC-17 and NR.
##### G as a rating is in the middle and not significantly different except to PG. NC-17 and NR trend poorly, perhaps unsurprisingly.