# Project 2 Part 4
**Apply Hypothesis Testing**


*Christina Brockway*way

## Business Problem

- Need a MySQL database on Movies from a subset of IMDB's publicly available dataset.
- Use this database to analyze what makes a movie successul
- Provide recommendations to the staakeholder on how to make a movie successful
- Create 3 senarios with the dataset
      -  Perform statistical testing to get mathematically-supported answers
      -  Report if there is a significance difference between features
          -  If yes, what was the p-value?
          -  which feature earns the most revenue?
      -  Prepare a visualization that supports findings

### Import/Load Data

In [1]:
import os, time, json
import tmdbsimple as tmdb
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import missingno as msno
from tqdm.notebook import tqdm_notebook
import plotly.express as px
from sqlalchemy.engine import create_engine
from sqlalchemy_utils import database_exists, create_database
from sklearn.preprocessing import StandardScaler
import pymysql
pymysql.install_as_MySQLdb()
from urllib.parse import quote_plus
from sqlalchemy.types import *
import scipy.stats as stats

pd.set_option('display.max_columns', None)

In [2]:
#Mysql login
with open("/Users/csbro/.secret/mysql.json", "r") as f:
          login = json.load(f)
login.keys()

dict_keys(['username', 'password'])

In [3]:
#create connection with MySQL

#Define database
dbase = 'movies'

# Define your login credentials
username = login["username"]
password = login["password"]

# Create the connection string
connection = f'mysql+pymysql://{username}:{password}@localhost/{dbase}'

# Create the database engine
engine = create_engine(connection)

# Connect to the database
engine = engine.connect()

In [4]:
q="""
SHOW TABLES;
"""
pd.read_sql(q, engine)

Unnamed: 0,Tables_in_movies
0,basics
1,genres
2,ratings
3,title_genres
4,tmdb_data


In [5]:
#Combine files with glob

import glob
q= "MovieData/*tmdb*.json"
tmdb_glob = sorted(glob.glob(q, recursive=True))
tmdb_glob

['MovieData\\tmdb_api_results 2001.json',
 'MovieData\\tmdb_api_results 2002.json',
 'MovieData\\tmdb_api_results 2010.json',
 'MovieData\\tmdb_api_results 2013.json',
 'MovieData\\tmdb_api_results 2014.json',
 'MovieData\\tmdb_api_results 2015.json',
 'MovieData\\tmdb_api_results 2016.json',
 'MovieData\\tmdb_api_results 2017.json',
 'MovieData\\tmdb_api_results 2018.json',
 'MovieData\\tmdb_api_results 2019.json',
 'MovieData\\tmdb_api_results 2020.json',
 'MovieData\\tmdb_api_results_2000.json',
 'MovieData\\tmdb_api_results_2003.json',
 'MovieData\\tmdb_api_results_2004.json',
 'MovieData\\tmdb_api_results_2005.json',
 'MovieData\\tmdb_api_results_2006.json',
 'MovieData\\tmdb_api_results_2007.json',
 'MovieData\\tmdb_api_results_2008.json',
 'MovieData\\tmdb_api_results_2009.json',
 'MovieData\\tmdb_api_results_2011.json',
 'MovieData\\tmdb_api_results_2012.json']

In [6]:
#Loading all files into dataframe
df_glob = []
for file in tmdb_glob:
    temp_df = pd.read_json(file)
    df_glob.append(temp_df)
#concat files
df_tmdb = pd.concat(df_glob)
df_tmdb.head(2)

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,,,,,,,,,,,,,,,,
1,tt0035423,0.0,/tJLV3BAlHOgscVOrA99Wnb2gAef.jpg,,48000000.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 14, ...",,11232.0,en,Kate & Leopold,When her scientist ex-boyfriend discovers a po...,13.382,/mUvikzKJJSg9khrVdxK8kg3TMHA.jpg,"[{'id': 85, 'logo_path': None, 'name': 'Konrad...","[{'iso_3166_1': 'US', 'name': 'United States o...",2001-12-25,76019048.0,118.0,"[{'english_name': 'French', 'iso_639_1': 'fr',...",Released,"If they lived in the same century, they'd be p...",Kate & Leopold,0.0,6.319,1238.0,PG-13


In [7]:
## Inspect the data
df_tmdb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58621 entries, 0 to 2501
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                58621 non-null  object 
 1   adult                  58600 non-null  float64
 2   backdrop_path          37983 non-null  object 
 3   belongs_to_collection  4056 non-null   object 
 4   budget                 58600 non-null  float64
 5   genres                 58600 non-null  object 
 6   homepage               58600 non-null  object 
 7   id                     58600 non-null  float64
 8   original_language      58600 non-null  object 
 9   original_title         58600 non-null  object 
 10  overview               58600 non-null  object 
 11  popularity             58600 non-null  float64
 12  poster_path            54080 non-null  object 
 13  production_companies   58600 non-null  object 
 14  production_countries   58600 non-null  object 
 15  rel

In [8]:
df_tmdb.head(2)

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,,,,,,,,,,,,,,,,
1,tt0035423,0.0,/tJLV3BAlHOgscVOrA99Wnb2gAef.jpg,,48000000.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 14, ...",,11232.0,en,Kate & Leopold,When her scientist ex-boyfriend discovers a po...,13.382,/mUvikzKJJSg9khrVdxK8kg3TMHA.jpg,"[{'id': 85, 'logo_path': None, 'name': 'Konrad...","[{'iso_3166_1': 'US', 'name': 'United States o...",2001-12-25,76019048.0,118.0,"[{'english_name': 'French', 'iso_639_1': 'fr',...",Released,"If they lived in the same century, they'd be p...",Kate & Leopold,0.0,6.319,1238.0,PG-13


## Normalize and clean data into tables

### Basics

In [9]:
dfbasics = df_tmdb[['imdb_id', 'title', 'release_date', 'runtime']]
dfbasics.head(2)

Unnamed: 0,imdb_id,title,release_date,runtime
0,0,,,
1,tt0035423,Kate & Leopold,2001-12-25,118.0


In [10]:
#split release_date into day month and year
dfbasics['release_date'] = pd.to_datetime(dfbasics['release_date'], errors='coerce')  # Convert to datetime
dfbasics['year'] = dfbasics['release_date'].dt.year
dfbasics['month'] = dfbasics['release_date'].dt.month
dfbasics['day'] = dfbasics['release_date'].dt.day
dfbasics.head(2)

#From: https://stackoverflow.com/questions/55776571/how-to-split-a-date-column-into-separate-day-month-year-column-in-pandas

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfbasics['release_date'] = pd.to_datetime(dfbasics['release_date'], errors='coerce')  # Convert to datetime
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfbasics['year'] = dfbasics['release_date'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfbasics['month'] = dfbasics['release_date'].

Unnamed: 0,imdb_id,title,release_date,runtime,year,month,day
0,0,,NaT,,,,
1,tt0035423,Kate & Leopold,2001-12-25,118.0,2001.0,12.0,25.0


In [11]:
dfbasics= dfbasics.drop(columns=['month', 'day', 'release_date'], axis=1)
dfbasics.head(2)

Unnamed: 0,imdb_id,title,runtime,year
0,0,,,
1,tt0035423,Kate & Leopold,118.0,2001.0


In [12]:
dfbasics = dfbasics.rename(columns= {'imdb_id': "tconst", 'title':"primary_title", 'runtime': 'runtime_mins', 'year': 'start_year'})
dfbasics.head(2)

Unnamed: 0,tconst,primary_title,runtime_mins,start_year
0,0,,,
1,tt0035423,Kate & Leopold,118.0,2001.0


In [13]:
try: 
    dfbasics. to_sql('basics', engine, index=False, if_exists='append')
    
except Exception as e:
    print('cannot append', e)


## From:  https://www.datacamp.com/tutorial/exception-handling-python

cannot append (pymysql.err.IntegrityError) (1062, "Duplicate entry 'tt0035423' for key 'basics.PRIMARY'")
[SQL: INSERT INTO basics (tconst, primary_title, runtime_mins, start_year) VALUES (%(tconst)s, %(primary_title)s, %(runtime_mins)s, %(start_year)s)]
[parameters: ({'tconst': 0, 'primary_title': None, 'runtime_mins': None, 'start_year': None}, {'tconst': 'tt0035423', 'primary_title': 'Kate & Leopold', 'runtime_mins': 118.0, 'start_year': 2001.0}, {'tconst': 'tt0114447', 'primary_title': 'The Silent Force', 'runtime_mins': 90.0, 'start_year': 2001.0}, {'tconst': 'tt0116916', 'primary_title': 'The Dark Mist', 'runtime_mins': 101.0, 'start_year': 1996.0}, {'tconst': 'tt0118589', 'primary_title': 'Glitter', 'runtime_mins': 104.0, 'start_year': 2001.0}, {'tconst': 'tt0118652', 'primary_title': 'The Attic Expeditions', 'runtime_mins': 100.0, 'start_year': 2001.0}, {'tconst': 'tt0119004', 'primary_title': "Don's Plum", 'runtime_mins': 108.0, 'start_year': 2001.0}, {'tconst': 'tt0120166', '

In [14]:
#display data in sql
q="""
SELECT * FROM basics
ORDER BY tconst DESC
LIMIT 5;
"""
pd.read_sql(q, engine)

Unnamed: 0,tconst,primary_title,start_year,runtime_mins,created_date,updated_date
0,tt9916362,Coven,2020.0,92,2023-11-05 20:16:48,2023-11-05 20:16:48
1,tt9916190,Safeguard,2020.0,95,2023-11-05 20:16:48,2023-11-05 20:16:48
2,tt9916170,The Rehearsal,2019.0,51,2023-11-05 20:16:48,2023-11-05 20:16:48
3,tt9915872,The Last White Witch,2019.0,97,2023-11-05 20:16:48,2023-11-05 20:16:48
4,tt9914942,Life Without Sara Amat,2019.0,74,2023-11-05 20:16:48,2023-11-05 20:16:48


### Ratings

In [15]:
dfratings = df_tmdb[['imdb_id', 'vote_average', 'vote_count']]
dfratings.head(2)

Unnamed: 0,imdb_id,vote_average,vote_count
0,0,,
1,tt0035423,6.319,1238.0


In [18]:
dfratings = dfratings.rename(columns={'imdb_id': 'tconst',
                                     'vote_average': 'avg_rating', 'vote_count': 'num_votes'})
dfratings.head(2)

Unnamed: 0,tconst,avg_rating,num_votes
0,0,,
1,tt0035423,6.319,1238.0


In [19]:
try:
    dfratings.to_sql('ratings', engine, index=False, if_exists='append')
except Exception as e:
    print('cannot append', e)

In [20]:
#display data in sql
q="""
SELECT * FROM  ratings
ORDER BY tconst DESC
LIMIT 5;
"""
pd.read_sql(q, engine)

Unnamed: 0,tconst,avg_rating,num_votes
0,tt9916362,6.715,267
1,tt9916362,6.4,5422
2,tt9916362,6.4,5422
3,tt9916190,3.7,243
4,tt9916190,3.7,243


In [21]:
dftmdb= df_tmdb[['imdb_id', 'revenue', 'budget', 'certification']]
dftmdb.head(2)

Unnamed: 0,imdb_id,revenue,budget,certification
0,0,,,
1,tt0035423,76019048.0,48000000.0,PG-13


In [22]:
dftmdb=dftmdb.rename(columns= {'imdb_id': "tconst"})
dftmdb.head(2)

Unnamed: 0,tconst,revenue,budget,certification
0,0,,,
1,tt0035423,76019048.0,48000000.0,PG-13


In [23]:
try:
    dftmdb.to_sql('tmdb_data', engine, index=False, if_exists='append')
except Exception as e:
    print('cannot append', e)
    

cannot append (pymysql.err.OperationalError) (1054, "Unknown column 'tconst' in 'field list'")
[SQL: INSERT INTO tmdb_data (tconst, revenue, budget, certification) VALUES (%(tconst)s, %(revenue)s, %(budget)s, %(certification)s)]
[parameters: ({'tconst': 0, 'revenue': None, 'budget': None, 'certification': None}, {'tconst': 'tt0035423', 'revenue': 76019048.0, 'budget': 48000000.0, 'certification': 'PG-13'}, {'tconst': 'tt0114447', 'revenue': 0.0, 'budget': 0.0, 'certification': None}, {'tconst': 'tt0116916', 'revenue': 0.0, 'budget': 0.0, 'certification': 'PG'}, {'tconst': 'tt0118589', 'revenue': 5271666.0, 'budget': 22000000.0, 'certification': 'PG-13'}, {'tconst': 'tt0118652', 'revenue': 0.0, 'budget': 1000000.0, 'certification': 'R'}, {'tconst': 'tt0119004', 'revenue': 6297.0, 'budget': 0.0, 'certification': None}, {'tconst': 'tt0120166', 'revenue': 0.0, 'budget': 0.0, 'certification': 'NR'}  ... displaying 10 of 58621 total bound parameter sets ...  {'tconst': 'tt2336104', 'revenue'

In [24]:
#Display data in SQL
q="""
SELECT * FROM tmdb_data
ORDER BY tconst DESC
LIMIT 5;
"""
pd.read_sql(q, engine)

OperationalError: (pymysql.err.OperationalError) (1054, "Unknown column 'tconst' in 'order clause'")
[SQL: 
SELECT * FROM tmdb_data
ORDER BY tconst DESC
LIMIT 5;
]
(Background on this error at: https://sqlalche.me/e/14/e3q8)

In [25]:
q="""
DESCRIBE tmdb_data
"""
pd.read_sql(q, engine)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,imdb_id,char(15),YES,,,
1,revenue,float,YES,,,
2,budget,float,YES,,,
3,certification,varchar(10),YES,,,
