# Project 2 Part 4
**Apply Hypothesis Testing**
-  API calls

*Christina Brockway*

## Business Problem

- Need a MySQL database on Movies from a subset of IMDB's publicly available dataset.
- Use this database to analyze what makes a movie successful
- Provide recommendations to the stakeholder on how to make a movie successful
- Create 3 scenarios with the dataset
      -  Perform statistical testing to get mathematically-supported answers
      -  Report if there is a significant difference between features
          -  If yes, what was the p-value?
          -  which feature earns the most revenue?
      -  Prepare a visualization that supports findings

## Import/Load Data

In [1]:
import os, time, json
import tmdbsimple as tmdb
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import missingno as msno
from tqdm.notebook import tqdm_notebook
import plotly.express as px
from sqlalchemy.engine import create_engine
from sqlalchemy_utils import database_exists, create_database
from sklearn.preprocessing import StandardScaler
import pymysql
pymysql.install_as_MySQLdb()
from urllib.parse import quote_plus
from sqlalchemy.types import *
import scipy.stats as stats

pd.set_option('display.max_columns', None)

In [2]:
## Read the TMDB credentials from the local secrets file
with open('/Users/csbro/.secret/tmdb_api.json', 'r') as key_file:
    login = json.load(key_file)
## Show only the field names, never the secret values
login.keys()

dict_keys(['api_key'])

In [3]:
# Register the key loaded above with the tmdbsimple module
tmdb.API_KEY = login['api_key']

In [4]:
# Folder prefix (relative, with trailing slash) for the API result files written below
FOLDER = 'MovieData/'


In [5]:
# Read the MySQL credentials; this rebinds `login`, replacing the TMDB key dict
with open("/Users/csbro/.secret/mysql.json", "r") as credentials_file:
    login = json.load(credentials_file)
# Show only the field names, never the secret values
login.keys()

dict_keys(['username', 'password'])

In [6]:
# Create connection with MySQL

# Define database
dbase = 'movies'

# Define your login credentials
username = login["username"]
password = login["password"]

# Create the connection string.
# The credentials are URL-escaped so that characters such as '@', ':' or '/'
# inside a password are not mistaken for URL delimiters.
connection = (
    f'mysql+pymysql://{quote_plus(username)}:{quote_plus(password)}'
    f'@localhost/{dbase}'
)

# Create the database engine
engine = create_engine(connection)

# Connect to the database.
# NOTE: `engine` is rebound to the object returned by connect(); the name is
# kept unchanged because other cells may refer to it.
engine = engine.connect()

In [7]:
# Load in data from IMDB to compare to TMDB info
# NOTE(review): the output shows an "Unnamed: 0" column, so the CSV appears to
# have been saved with its index -- consider index_col=0; confirm before changing.
basics = pd.read_csv("data/basics-filtered.csv")
basics.head(2)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama


In [8]:
## Years to request from the API: 2010 through 2014 (the range end, 2015, is exclusive)
GET_YEARS = list(range(2010, 2015))

# Collects [movie_id, exception] pairs for the API calls that fail in the download loop
errors = []

In [9]:
#Define API function


def get_movie_with_rating(movie_id):
    """Return the TMDB info dict for a movie, with its US certification added.

    Args:
        movie_id: Identifier passed straight to ``tmdb.Movies``.

    Returns:
        The dict from ``Movies.info()``. A ``'certification'`` key is added
        only when the release list contains an entry for the US; if several
        US entries exist, the last one wins.
    """
    movie = tmdb.Movies(movie_id)
    # Request the general info first, then the per-country release data
    info = movie.info()
    release_data = movie.releases()
    for country in release_data['countries']:
        # Ignore every country except the US
        if country['iso_3166_1'] != 'US':
            continue
        info['certification'] = country['certification']
    return info



def write_json(new_data, filename):
    """Append record(s) to the JSON list stored in an existing file.

    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/

    Args:
        new_data: A single record, or a list of records. When both
            ``new_data`` and the file content are lists the records are
            merged one by one; otherwise ``new_data`` is appended as a
            single item.
        filename: Path of an existing file containing a JSON list.

    Raises:
        FileNotFoundError: If ``filename`` does not exist (mode 'r+' never
            creates a file).
        json.JSONDecodeError: If the file does not hold valid JSON.
        AttributeError: If the file holds something without ``append``
            (for example a JSON object).
    """
    with open(filename, 'r+') as file:
        # Load the existing content
        file_data = json.load(file)
        # Extend when both sides are lists, otherwise append a single item
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        # Rewrite the file from the start ...
        file.seek(0)
        json.dump(file_data, file)
        # ... and cut off any leftover bytes so the file is always valid JSON
        file.truncate()

In [10]:
## Confirm the API works by fetching two known movies
test = ["tt0848228", "tt0332280"]
results = [get_movie_with_rating(imdb_id) for imdb_id in test]
pd.DataFrame(results)

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,"{'id': 86311, 'name': 'The Avengers Collection...",220000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",https://www.marvel.com/movies/the-avengers,24428,tt0848228,en,The Avengers,When an unexpected enemy emerges and threatens...,134.874,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,"[{'id': 420, 'logo_path': '/hUzeosd33nzE5MCNsZ...","[{'iso_3166_1': 'US', 'name': 'United States o...",2012-04-25,1518815515,143,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Some assembly required.,The Avengers,False,7.711,29307,PG-13
1,False,/qom1SZSENdmHFNZBXbtJAU0WTlC.jpg,,29000000,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...",http://www.newline.com/properties/notebookthe....,11036,tt0332280,en,The Notebook,An epic love story centered around an older ma...,65.413,/rNzQyW4f8B8cQeg7Dgj3n6eT5k9.jpg,"[{'id': 12, 'logo_path': '/mevhneWSqbjU22D1MXN...","[{'iso_3166_1': 'US', 'name': 'United States o...",2004-06-25,115603229,123,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Behind every great love is a great story.,The Notebook,False,7.881,10709,PG-13


In [None]:
##OUTER LOOP
# Create the output folder up front; open(..., 'w') below fails if it is missing
if FOLDER:
    os.makedirs(FOLDER, exist_ok=True)

for YEAR in tqdm_notebook(GET_YEARS, desc='YEARS', position=0):

    # JSON file that accumulates this year's raw API results
    JSON_MOVIE = f'{FOLDER}tmdb_api_results {YEAR}.json'
    # Check if file exists
    file_exists = os.path.isfile(JSON_MOVIE)

    if not file_exists:
        # Seed the file with a placeholder record so that it holds a JSON list
        # and has an 'imdb_id' column for the "already downloaded" filter below.
        # NOTE: the placeholder (imdb_id == 0) is never removed, so it is also
        # written to the CSV at the end of this iteration.
        print(f'Creating json file for API results for {YEAR}')
        with open(JSON_MOVIE, 'w') as f:
            json.dump([{'imdb_id': 0}], f)
    else:
        print(f'{JSON_MOVIE} already exists.')

    # IMDB titles whose startYear is the current year
    df = basics.loc[basics['startYear'] == YEAR].copy()
    # Saving movie ids to a separate variable
    movie_ids = df['tconst'].copy()

    # Load existing data from json into DF called previous_df
    previous_df = pd.read_json(JSON_MOVIE)

    # Filter out any ids that are already in the file, so a re-run only
    # requests the movies that are still missing
    needed_mids = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    #INNER LOOP
    for movie_id in tqdm_notebook(needed_mids,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        try:
            temp = get_movie_with_rating(movie_id)
            # Append/Extend results to json file
            write_json(temp, JSON_MOVIE)
            time.sleep(0.02)
        except Exception as e:
            # Best effort: record the failure and carry on with the next movie
            errors.append([movie_id, e])

    # `errors` is shared by all years, so this is a running total
    print(f' - Total Errors: {len(errors)}')

    # Save this year's accumulated results as a compressed CSV
    final_year_df = pd.read_json(JSON_MOVIE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression='gzip', index=False)

YEARS:   0%|          | 0/5 [00:00<?, ?it/s]

MovieData/tmdb_api_results 2010.json already exists.


Movies from 2010:   0%|          | 0/1360 [00:00<?, ?it/s]

 - Total Errors: 1124
Creating json file for API results for 2011


Movies from 2011:   0%|          | 0/4229 [00:00<?, ?it/s]

 - Total Errors: 2319
Creating json file for API results for 2012


Movies from 2012:   0%|          | 0/4522 [00:00<?, ?it/s]

### First Scenario:

##### Does the MPAA rating of a movie affect how much revenue the movie generates?

In [None]:
# Label missing certifications as 'Nan' on a copy of the full table, so that
# 'revenue' and the other columns stay available to the cells that follow
# (filling only the selected column would leave df_fill as a single Series).
df_fill = df_tmdb.copy()
df_fill['certification'] = df_fill['certification'].fillna(value='Nan')
df_fill['certification'].value_counts()

In [None]:
# Keep only rows that have both a certification and a revenue value
df_drop=df_fill.dropna(subset=['certification', 'revenue'])

In [None]:
# Revenue per certification category (trailing ';' suppresses the text output)
sns.barplot(data=df_drop, x='certification', y='revenue');

- The following features are needed to test this hypothesis:  certification and revenue
- It is numeric data
- there are multiple groups
- Use an ANOVA, which assumes:
  - normality
  - equal variance
  - no significant outliers

In [None]:
# Number of rows per certification
df_drop['certification'].value_counts()

In [None]:
# Distinct certification labels present in the data
df_drop['certification'].unique()

In [None]:
# Remove certifications that appear in fewer than 15 rows
value_counts = df_drop['certification'].value_counts()
DROP = value_counts[value_counts < 15].index
df1 = df_drop[~df_drop['certification'].isin(DROP)]
# Only require the two columns this test uses; a blanket dropna() would also
# discard every movie that is missing a value in any unrelated column
df1 = df1.dropna(subset=['certification', 'revenue'])

##adapted from: https://www.geeksforgeeks.org/drop-rows-from-the-dataframe-based-on-certain-condition-applied-on-a-column/

In [None]:
# Rows per certification after the rare categories were removed
df1['certification'].value_counts()

In [None]:
#Create groups dictionary
groups ={}

#Loop through all unique categories
for certification in df1['certification'].unique():
    data = df1.loc[df1['certification']==certification,'revenue'].copy()

#save into dictionary
    groups[certification]=data
groups.keys()

In [None]:
#Loop through the groups to get rid of outliers
groups_clean={}

for group, data in groups.items():
    outliers=np.abs(stats.zscore(data))>3
    n_outliers=np.sum(outliers)

    print(f" - For {group}, there were {n_outliers} outliers removed.")
    clean_data = data[~outliers]

    #Save into clean dictionary
    groups_clean[group] = clean_data
groups_clean.keys()

In [None]:
#Test for Normality

# Run normaltest on every group that has at least 8 values;
# smaller groups are reported and skipped
norm_results = []

for cert_name, revenues in groups_clean.items():
    sample_size = len(revenues)
    if sample_size < 8:
        print(f'{cert_name} does not have enough samples')
        continue
    stat, p = stats.normaltest(revenues)
    norm_results.append({'group': cert_name, "n": sample_size,
                         'p': p, "test stat": stat, 'significance?': p < 0.05})
# Collect the per-group results in one table
results_df = pd.DataFrame(norm_results)
results_df

-  None of the groups are normally distributed, BUT groups are greater than n=15, so the assumption of normality can be safely disregarded.

In [None]:
## Test for Equal Variance

# Levene's test across all outlier-filtered certification groups
result= stats.levene(*groups_clean.values())
result

In [None]:
## Turn the Levene p-value into a plain-language verdict (threshold 0.05)
verdict = "do NOT" if result.pvalue < .05 else "DO"
print(f"The groups {verdict} have equal variance.")

-  The null hypothesis of the Levene's test is that the samples DO have equal variance.
-  The p-value indicates that there is NOT equal variance
    -  Will need to rerun with Kruskal-Wallis test instead

In [None]:
# Run the Kruskal-Wallis test on the outlier-filtered certification groups
resK = stats.kruskal(*groups_clean.values())
resK

In [None]:
# True when the Kruskal-Wallis p-value is below the 0.05 threshold
resK.pvalue<0.05

***The p-value is less than 0.05, so we reject the null hypothesis:***
    --  ***MPAA rating has a significant effect on revenue***

    - Post-hoc multiple comparisons test will be run to determine which groups have a significant impact on revenue

In [None]:
## Post Hoc
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [None]:
## Look at the revenue values of one group (key 'R') as a quick check
temp = groups['R']
temp

In [None]:
## Build one long-format table (revenue + certification label) for the post-hoc test
tukeys_dfs = []

## Use the outlier-filtered groups so that the post-hoc comparison runs on the
## same data as the Levene and Kruskal-Wallis tests
for MPAA, temp in groups_clean.items():

    ## One frame per certification: its revenues plus a constant label column
    temp_df = pd.DataFrame({'revenue': temp, 'certification': MPAA})

    ## Collect the per-certification frames
    tukeys_dfs.append(temp_df)

## Concatenate them into 1 dataframe
tukeys_data = pd.concat(tukeys_dfs)
tukeys_data

In [None]:
## Separate the measurements (revenue) from the group labels (certification)
values = tukeys_data['revenue']
labels = tukeys_data['certification']

## Run Tukey's pairwise comparison test and display the summary table
tukeys_results = pairwise_tukeyhsd(endog=values, groups=labels)
tukeys_results.summary()

In [None]:
## Optional: turn the Tukey summary table into a DataFrame
## (its first row holds the column names, the rest are the data rows)
summary = tukeys_results.summary()
header, *rows = summary.data
tukeys_df = pd.DataFrame(rows, columns=header)
tukeys_df

In [None]:
## make a barplot of final data to go with results
ax = sns.barplot(data=tukeys_data, x='certification', y='revenue')
# Rotate the category labels by 45 degrees, right-aligned
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right');

In [None]:
## Pairs for which the null hypothesis was not rejected (not significantly different)
tukeys_df[tukeys_df['reject'] == False]

In [None]:
## The built-in plot tukeys_results.plot_simultaneous can also be used
tukeys_results.plot_simultaneous();

### Second Scenario:

##### Do movies with higher budgets have more revenue?

**Null Hypothesis:** If a movie has a higher budget, there is no difference in the revenue.

**Alternative Hypothesis:**  If a movie has a higher budget, it has a significant impact on revenue.

In [None]:

## Budgets come from the TMDB results (`df` only holds the IMDB basics columns);
## missing values are dropped so that the median is not NaN
data = df_tmdb['budget'].dropna()
mean = np.mean(data)
med = np.median(data)
## Make figure and plot histogram
fig, ax = plt.subplots(figsize=(15,6))
sns.histplot(x=data, kde=True, ax=ax, stat='density')
ax.set_title('Distribution of Movie Budgets')
## Annotate mean and median
ax.axvline(mean,ls=':', color='black', lw=3, label =f"Mean: {mean:.2f}")
ax.axvline(med,ls='--', color='green', lw=3, label =f"Median: {med:.2f}")
ax.legend()



- The following features are used to test this hypothesis:  budget and revenue
- this is numeric data
-  There is  one group
-  Will use a one-sample t-test

In [None]:
# Drop rows that are missing a budget or a revenue value
df2=df_tmdb.dropna(subset=['budget','revenue'])

# Show column dtypes and non-null counts of the filtered table
df2.info()

##### Visualize and separate data

In [None]:
# Summary statistics of the budget column
df2['budget'].describe()

In [None]:
# Summary statistics of the revenue column
df2['revenue'].describe()

##### Check for Outliers


In [None]:
# Standardize the budget column; the double brackets pass it as a one-column DataFrame
scaler = StandardScaler()
z_budget = scaler.fit_transform(df2[['budget']])
# Preview the first three scaled values
z_budget[:3]

In [None]:
## Plot a histogram with z-scores
# The scaler was given a one-column table, so flatten its output to 1-D
# before handing it to seaborn as the x variable
ax = sns.histplot(x=np.ravel(z_budget), stat='probability', kde=True)
ax.set_xlabel("z-Scores")
ax.set_title("Budget z-Scores");