# Project 2 Part 4
**Apply Hypothesis Testing**


*Christina Brockway*

## Business Problem

- Need a MySQL database on Movies from a subset of IMDB's publicly available dataset.
- Use this database to analyze what makes a movie successful
- Provide recommendations to the stakeholder on how to make a movie successful
- Create 3 scenarios with the dataset
      -  Perform statistical testing to get mathematically-supported answers
      -  Report if there is a significant difference between features
          -  If yes, what was the p-value?
          -  Which feature earns the most revenue?
      -  Prepare a visualization that supports findings

## Import/Load Data

In [1]:
import os, time, json
import tmdbsimple as tmdb
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import missingno as msno
from tqdm.notebook import tqdm_notebook

import scipy.stats as stats

In [2]:
## Load API Key
# The credentials are read from a JSON file kept outside the project folder
with open('/Users/csbro/.secret/tmdb_api.json', 'r') as key_file:
    login = json.loads(key_file.read())
# Display only the key names so the secret value is never shown in the output
login.keys()

dict_keys(['api_key'])

In [3]:
# Hand the loaded key to the tmdbsimple module for the API calls made below
tmdb.API_KEY = login['api_key']

In [12]:
# Directory that receives the per-year API results (JSON and gzipped CSV)
FOLDER = 'MovieData/'
# Create it up front: opening a file for writing inside a missing directory
# raises FileNotFoundError
os.makedirs(FOLDER, exist_ok=True)


In [4]:
# Load the filtered IMDB title basics; its 'tconst' ids are the ones sent to
# the TMDB API further down
basics = pd.read_csv("data/basics-filtered.csv")
basics.head(2)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama


In [14]:
## Years to request from TMDB: 2017 through 2020 (range() excludes the end value)
GET_YEARS = list(range(2017, 2021))

# Collects [movie_id, exception] pairs for API calls that fail
errors = []

In [15]:
#Define API function


def get_movie_with_rating(movie_id):
    """Fetch a movie's TMDB info and attach its US certification.

    Args:
        movie_id: Identifier passed straight to ``tmdb.Movies``.

    Returns:
        The dictionary returned by ``movie.info()``. A 'certification' key
        is added only when the releases data contains an entry whose
        'iso_3166_1' is 'US'; if several US entries exist, the last one wins.
    """
    movie = tmdb.Movies(movie_id)
    movie_info = movie.info()
    release_data = movie.releases()

    # Gather every US certification, in the order the API lists them
    us_certifications = [
        country['certification']
        for country in release_data['countries']
        if country['iso_3166_1'] == 'US'
    ]
    if us_certifications:
        movie_info['certification'] = us_certifications[-1]
    return movie_info



def write_json(new_data, filename):
    """Append new record(s) to the JSON list stored in an existing file.

    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/

    Args:
        new_data: A single record or a list of records. A list is merged
            into the stored list with ``extend``; anything else is added
            as one element with ``append``.
        filename: Path to an existing JSON file whose top-level value is
            a list.

    Raises:
        FileNotFoundError: If ``filename`` does not exist ('r+' does not
            create the file).
        json.JSONDecodeError: If the file does not hold valid JSON.
    """
    with open(filename, 'r+') as file:
        # Load the existing records
        file_data = json.load(file)
        # Merge lists element-wise; add anything else as a single record
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        # Rewrite the file from the beginning
        file.seek(0)
        json.dump(file_data, file)
        # Drop any leftover bytes in case the new content is shorter than the old
        file.truncate()

In [16]:
## Confirm the API works on two known IMDB ids
test = ["tt0848228", "tt0332280"]
results = [get_movie_with_rating(movie_id) for movie_id in test]
pd.DataFrame(results)

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,"{'id': 86311, 'name': 'The Avengers Collection...",220000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",https://www.marvel.com/movies/the-avengers,24428,tt0848228,en,The Avengers,...,1518815515,143,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Some assembly required.,The Avengers,False,7.711,29299,PG-13
1,False,/qom1SZSENdmHFNZBXbtJAU0WTlC.jpg,,29000000,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...",http://www.newline.com/properties/notebookthe....,11036,tt0332280,en,The Notebook,...,115603229,123,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Behind every great love is a great story.,The Notebook,False,7.881,10701,PG-13


In [None]:
## OUTER LOOP: one pass per year in GET_YEARS
for YEAR in tqdm_notebook(GET_YEARS, desc='YEARS', position=0):

    # Per-year JSON file that accumulates the raw API results
    JSON_MOVIE = f'{FOLDER}tmdb_api_results {YEAR}.json'
    file_exists = os.path.isfile(JSON_MOVIE)

    if file_exists:
        print(f'{JSON_MOVIE} already exists.')
    else:
        # Seed the file with a placeholder record so it loads as a list
        # that already has an 'imdb_id' column
        print(f'Creating json file for API results for {YEAR}')
        with open(JSON_MOVIE, 'w') as f:
            json.dump([{'imdb_id': 0}], f)

    # IMDB titles whose startYear matches this pass
    df = basics.loc[basics['startYear'] == YEAR].copy()
    movie_ids = df['tconst'].copy()

    # Skip ids that are already stored in the JSON file
    previous_df = pd.read_json(JSON_MOVIE)
    already_saved = movie_ids.isin(previous_df['imdb_id'])
    needed_mids = movie_ids[~already_saved]

    # INNER LOOP: one API call per missing id
    year_progress = tqdm_notebook(needed_mids,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True)
    for movie_id in year_progress:
        try:
            temp = get_movie_with_rating(movie_id)
            # Persist each result immediately
            write_json(temp, JSON_MOVIE)
            time.sleep(0.02)
        except Exception as e:
            # Record the failure and continue with the next id
            errors.append([movie_id, e])

    # `errors` is never reset, so this count is cumulative across years
    print(f' - Total Errors: {len(errors)}')

    # Export this year's accumulated results as a compressed CSV
    final_year_df = pd.read_json(JSON_MOVIE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz",
                         compression='gzip',
                         index=False)

YEARS:   0%|          | 0/4 [00:00<?, ?it/s]

Creating json file for API results for 2017


Movies from 2017:   0%|          | 0/5643 [00:00<?, ?it/s]

 - Total Errors: 1178
Creating json file for API results for 2018


Movies from 2018:   0%|          | 0/5785 [00:00<?, ?it/s]

 - Total Errors: 2266
Creating json file for API results for 2019


Movies from 2019:   0%|          | 0/5877 [00:00<?, ?it/s]

In [None]:
# Collect the per-year result files, in sorted order
import glob

q = "MovieData/final_tmdb_data*.csv.gz"
tmdb_glob = glob.glob(q)
tmdb_glob.sort()
tmdb_glob

In [None]:
# Load every per-year file into its own DataFrame.
# The files were written with index=False, so no column is used as the index;
# index_col=0 would turn the first data column into the index.
df_glob = [pd.read_csv(file) for file in tmdb_glob]

# Stack the yearly frames (the list is df_glob) and give the result a fresh
# 0..n-1 index so row labels are unique
df_tmdb = pd.concat(df_glob, ignore_index=True)
df_tmdb.head(2)

In [None]:
## Inspect the data: print the summary produced by DataFrame.info()
df_tmdb.info()

In [None]:
# Count rows that are exact duplicates of an earlier row
df_tmdb.duplicated().sum()

In [None]:
# Keep the first occurrence of each duplicated row
df_tmdb = df_tmdb.drop_duplicates()

In [None]:
# Slice only the features needed and rename columns.
# These are IMDB title-basics columns, so they are taken from `basics`;
# df_tmdb holds the TMDB API results, which are keyed by 'imdb_id' and
# appear not to carry these columns.
df_basics = basics[['tconst', 'titleType', 'primaryTitle', 'startYear', 'runtimeMinutes']]

# 'tconst' already has its final name, so only the camelCase columns are mapped
rename_basics = {'primaryTitle': 'primary_title', 'startYear': 'start_year', 'runtimeMinutes': 'runtime_mins'}
df_basics = df_basics.rename(rename_basics, axis=1)

df_basics.head(2)

In [None]:
# Count missing values per column
df_basics.isna().sum()


In [None]:
# Visualize where values are missing (the function is `matrix`, not `matirx`)
msno.matrix(df_basics)

In [None]:
# Drop incomplete rows; axis=1 would remove every column that has any gap
df_basics = df_basics.dropna()

In [None]:
# Genres come from the IMDB basics table, where they are a comma-separated
# string per title keyed by 'tconst'. Copy so later edits do not target a
# slice of `basics`.
df_genres = basics[['genres', 'tconst']].copy()
df_genres.head(2)

In [None]:
# Count missing values per column of the genres slice (not df_basics)
df_genres.isna().sum()

In [None]:
# Visualize where values are missing (the function is `matrix`, not `matirx`)
msno.matrix(df_genres)

In [None]:
# Drop incomplete rows; axis=1 would remove every column that has any gap.
# Reassigning also avoids an in-place edit on a slice of another frame.
df_genres = df_genres.dropna()

In [None]:
# Slice the rating columns and rename them to snake_case.
# NOTE(review): df_tmdb is built from the TMDB API files, and the sample API
# output earlier in the notebook shows vote_average / vote_count rather than
# averageRating / numVotes. These look like IMDB ratings columns — confirm
# which frame should be sliced here before running.
df_ratings = df_tmdb[['tconst', 'averageRating', 'numVotes']]

rename_ratings = {'averageRating':'avg_rating', 'numVotes':'num_votes'}
df_ratings = df_ratings.rename(rename_ratings, axis = 1)

df_ratings.head(2)

In [None]:
# Count missing values per column
df_ratings.isna().sum()

In [None]:
# Visualize where values are missing (the function is `matrix`, not `matirx`)
msno.matrix(df_ratings)

In [None]:
# Drop incomplete rows; axis=1 would remove every column that has any gap
df_ratings = df_ratings.dropna()

In [None]:
# Use 'tconst' as the key name so it matches the other tables.
# rename() takes the mapping as columns=...; combining columns= with axis= is
# rejected by pandas, so axis is omitted.
df_tmdb = df_tmdb.rename(columns={'imdb_id': 'tconst'})

# Keep only the TMDB features needed; copy so later edits do not target a
# slice of df_tmdb
df_tmdbdata = df_tmdb[['tconst', 'revenue', 'budget', 'certification']].copy()

df_tmdbdata.head(2)

In [None]:
# Count missing values per column
df_tmdbdata.isna().sum()

In [None]:
# Visualize where values are missing (the function is `matrix`, not `matirx`)
msno.matrix(df_tmdbdata)

In [None]:
# Drop incomplete rows. axis=1 would instead delete any column that has a
# gap, which would remove 'certification' as soon as one movie lacks it.
df_tmdbdata = df_tmdbdata.dropna()

#### Normalize Data

# Genres

In [None]:
# Split the comma-separated genres string into a list per title.
# The source is df_genres itself: `df` is the leftover per-year frame from the
# download loop and covers only the last year processed.
# assign() returns a new frame, so nothing is written into a slice.
df_genres = df_genres.assign(genres_split=df_genres['genres'].str.split(','))
df_genres.head(2)

In [None]:
# Explode the list column so each (title, genre) pair gets its own row
exploded_genres = df_genres.explode('genres_split')
exploded_genres.head(2)

In [None]:
#### Get unique genres from the split
# Missing entries are dropped first: sorted() cannot compare NaN (a float)
# with genre strings
unique_genres = sorted(exploded_genres['genres_split'].dropna().unique())

# Copy so later column assignments do not target a slice of exploded_genres
title_genres = exploded_genres[['tconst', 'genres_split']].copy()

title_genres.head(2)

In [None]:
# Build the genre-name -> integer-id lookup; ids follow the sorted genre order
genre_init = range(len(unique_genres))
genre_map = {genre_name: genre_id
             for genre_name, genre_id in zip(unique_genres, genre_init)}
genre_map

In [None]:
# Swap the genre name for its integer id.
# assign()/drop() build a new frame, so nothing is written into a slice of
# exploded_genres (which triggers SettingWithCopyWarning).
title_genres = (
    title_genres
    .assign(genres_id=title_genres['genres_split'].map(genre_map))
    .drop(columns='genres_split')
)
title_genres.head(2)

#### Load data into MySQL database

In [None]:
# Upload the basics table; 'append' adds rows to the table if it already exists.
# NOTE(review): `engine` is not created anywhere in the visible notebook —
# confirm the database connection is set up before this cell runs.
df_basics.to_sql('basics', engine, 
               if_exists='append', index=False)

In [None]:
# Upload the title-to-genre id pairs; the DataFrame index is not written
title_genres.to_sql('title_genres', engine, 
               if_exists="append", index=False)

In [None]:
# The genres table is the lookup from genre name to the integer id stored in
# title_genres. df_genres still holds the raw genre strings plus a list
# column, and a Python list cannot be stored as a SQL value.
# NOTE(review): the column names below are a proposal — align them with the
# target schema.
genre_lookup = pd.DataFrame({
    'genre_name': list(genre_map.keys()),
    'genres_id': list(genre_map.values()),
})
genre_lookup.to_sql('genres', engine,
               if_exists='append', index=False)

In [None]:
# Upload the ratings table. The keyword is `if_exists`, and `index=False`
# keeps the DataFrame index out of the table.
df_ratings.to_sql('ratings', engine, if_exists='append', index=False)

In [None]:
# Upload the trimmed TMDB slice (key, revenue, budget, certification) rather
# than the full API dump in df_tmdb
df_tmdbdata.to_sql('tmdb_data', engine, if_exists='append', index=False)

#### Verify data has been uploaded 

In [None]:
# List the tables in the connected database
q="""
SHOW TABLES;
"""
pd.read_sql(q, engine)

In [None]:
# Display the column definitions of the basics table
q = """
DESCRIBE basics;
"""

pd.read_sql(q, engine)

In [None]:
# Confirm data has been added by reading the first five rows of basics
q= """
SELECT * FROM basics
LIMIT 5;
"""
pd.read_sql(q, engine)

### First Scenario:

Does the MPAA rating of a movie affect how much revenue the movie generates?