# Project 2 Part 4
**Apply Hypothesis Testing**

Clean data
*Christina Brockway*

## Business Problem

- Need a MySQL database on Movies from a subset of IMDB's publicly available dataset.
- Use this database to analyze what makes a movie successful
- Provide recommendations to the stakeholder on how to make a movie successful
- Create 3 scenarios with the dataset
      -  Perform statistical testing to get mathematically-supported answers
      -  Report if there is a significant difference between features
          -  If yes, what was the p-value?
          -  which feature earns the most revenue?
      -  Prepare a visualization that supports findings

### Import/Load Data

In [1]:
import glob
import os, time, json
from urllib.parse import quote_plus

import tmdbsimple as tmdb
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import missingno as msno
from tqdm.notebook import tqdm_notebook
import plotly.express as px
from sqlalchemy.engine import create_engine
from sqlalchemy_utils import database_exists, create_database
from sklearn.preprocessing import StandardScaler
import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy.types import *
import scipy.stats as stats

# Show every column when a DataFrame is rendered (None removes the column limit)
pd.set_option('display.max_columns', None)

In [2]:
# MySQL login: read the credentials from ~/.secret/mysql.json.
# The path is built from the current user's home directory rather than being
# hard-coded to one account, so the notebook also runs on other machines.
secret_path = os.path.join(os.path.expanduser("~"), ".secret", "mysql.json")
with open(secret_path, "r") as f:
    login = json.load(f)
# Display only the key names so the credential values never appear in the output
login.keys()

dict_keys(['username', 'password'])

In [3]:
# Create the connection to the local MySQL server

# Name of the database to connect to
dbase = 'movies'

# Login credentials come from the secrets file loaded into `login`
username = login["username"]
password = login["password"]

# Build the connection string.
# The credentials are URL-encoded first: a raw '@', ':' or '/' inside the
# username or password would otherwise be read as a URL delimiter and
# produce a wrong host / failed login.
connection = (
    f'mysql+pymysql://{quote_plus(username)}:{quote_plus(password)}'
    f'@localhost/{dbase}'
)

# Create the database engine
engine = create_engine(connection)

# Open a connection to the database.
# NOTE: `engine` is rebound to the open Connection here; the cells below
# pass this object to pd.read_sql and DataFrame.to_sql.
engine = engine.connect()

In [4]:
# List the tables that currently exist in the connected database
q = """
SHOW TABLES;
"""
pd.read_sql(sql=q, con=engine)

Unnamed: 0,Tables_in_movies
0,basics
1,genres
2,ratings
3,title_genres
4,tmdb_data


In [5]:
# Collect the TMDB API result files that will be combined below.
# The pattern contains no "**" wildcard, so glob's recursive mode is not needed.
tmdb_json_pattern = "MovieData/*tmdb*.json"

# sorted() makes the load order stable (ordered by file name)
tmdb_glob = sorted(glob.glob(tmdb_json_pattern))
tmdb_glob

['MovieData\\tmdb_api_results 2001.json',
 'MovieData\\tmdb_api_results 2002.json',
 'MovieData\\tmdb_api_results 2003.json',
 'MovieData\\tmdb_api_results 2004.json',
 'MovieData\\tmdb_api_results 2005.json',
 'MovieData\\tmdb_api_results 2006.json',
 'MovieData\\tmdb_api_results 2007.json',
 'MovieData\\tmdb_api_results 2008.json',
 'MovieData\\tmdb_api_results 2009.json',
 'MovieData\\tmdb_api_results 2010.json',
 'MovieData\\tmdb_api_results 2011.json',
 'MovieData\\tmdb_api_results 2012.json',
 'MovieData\\tmdb_api_results 2013.json',
 'MovieData\\tmdb_api_results 2014.json',
 'MovieData\\tmdb_api_results 2015.json',
 'MovieData\\tmdb_api_results 2016.json',
 'MovieData\\tmdb_api_results 2017.json',
 'MovieData\\tmdb_api_results 2018.json',
 'MovieData\\tmdb_api_results 2019.json',
 'MovieData\\tmdb_api_results 2020.json']

In [6]:
# Read each JSON file into its own DataFrame, then stack them into one frame.
# Each file's original row index is kept (no ignore_index), so index labels
# repeat across files.
# NOTE(review): the name `tmdb` also rebinds the `tmdbsimple` alias from the
# import cell; the cells below rely on `tmdb` being this DataFrame.
df_glob = [pd.read_json(json_path) for json_path in tmdb_glob]
tmdb = pd.concat(df_glob)
tmdb.head(2)

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,,,,,,,,,,,,,,,,
1,tt0035423,0.0,/tJLV3BAlHOgscVOrA99Wnb2gAef.jpg,,48000000.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 14, ...",,11232.0,en,Kate & Leopold,When her scientist ex-boyfriend discovers a po...,13.382,/mUvikzKJJSg9khrVdxK8kg3TMHA.jpg,"[{'id': 85, 'logo_path': None, 'name': 'Konrad...","[{'iso_3166_1': 'US', 'name': 'United States o...",2001-12-25,76019048.0,118.0,"[{'english_name': 'French', 'iso_639_1': 'fr',...",Released,"If they lived in the same century, they'd be p...",Kate & Leopold,0.0,6.319,1238.0,PG-13


In [7]:
## Inspect the data: column dtypes and non-null counts of the combined frame
tmdb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58212 entries, 0 to 3970
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                58212 non-null  object 
 1   adult                  58192 non-null  float64
 2   backdrop_path          37672 non-null  object 
 3   belongs_to_collection  3991 non-null   object 
 4   budget                 58192 non-null  float64
 5   genres                 58192 non-null  object 
 6   homepage               58192 non-null  object 
 7   id                     58192 non-null  float64
 8   original_language      58192 non-null  object 
 9   original_title         58192 non-null  object 
 10  overview               58192 non-null  object 
 11  popularity             58192 non-null  float64
 12  poster_path            53630 non-null  object 
 13  production_companies   58192 non-null  object 
 14  production_countries   58192 non-null  object 
 15  rel

In [8]:
# Preview the first two rows; the first row shown has imdb_id 0 and no other values
tmdb.head(2)

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,,,,,,,,,,,,,,,,
1,tt0035423,0.0,/tJLV3BAlHOgscVOrA99Wnb2gAef.jpg,,48000000.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 14, ...",,11232.0,en,Kate & Leopold,When her scientist ex-boyfriend discovers a po...,13.382,/mUvikzKJJSg9khrVdxK8kg3TMHA.jpg,"[{'id': 85, 'logo_path': None, 'name': 'Konrad...","[{'iso_3166_1': 'US', 'name': 'United States o...",2001-12-25,76019048.0,118.0,"[{'english_name': 'French', 'iso_639_1': 'fr',...",Released,"If they lived in the same century, they'd be p...",Kate & Leopold,0.0,6.319,1238.0,PG-13


In [9]:
# Count the missing (NaN) values in each column
tmdb.isna().sum()

imdb_id                      0
adult                       20
backdrop_path            20540
belongs_to_collection    54221
budget                      20
genres                      20
homepage                    20
id                          20
original_language           20
original_title              20
overview                    20
popularity                  20
poster_path               4582
production_companies        20
production_countries        20
release_date                20
revenue                     20
runtime                     20
spoken_languages            20
status                      20
tagline                     20
title                       20
video                       20
vote_average                20
vote_count                  20
certification            23672
dtype: int64

In [10]:
# Keep only the rows where both certification and revenue are present (not NaN).
# NOTE(review): rows displayed further down still show a blank certification,
# which suggests empty strings are present; dropna does not remove those.
# Confirm whether they should be treated as missing.
required_cols = ['certification', 'revenue']
tmdb = tmdb.dropna(subset=required_cols)
tmdb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34540 entries, 1 to 3970
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                34540 non-null  object 
 1   adult                  34540 non-null  float64
 2   backdrop_path          21984 non-null  object 
 3   belongs_to_collection  2486 non-null   object 
 4   budget                 34540 non-null  float64
 5   genres                 34540 non-null  object 
 6   homepage               34540 non-null  object 
 7   id                     34540 non-null  float64
 8   original_language      34540 non-null  object 
 9   original_title         34540 non-null  object 
 10  overview               34540 non-null  object 
 11  popularity             34540 non-null  float64
 12  poster_path            31617 non-null  object 
 13  production_companies   34540 non-null  object 
 14  production_countries   34540 non-null  object 
 15  rel

In [11]:
# Keep only the movies that report a revenue of at least 1
has_revenue = tmdb['revenue'] >= 1
df_tmdb = tmdb[has_revenue]
df_tmdb.tail()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
3913,tt9779516,0.0,/21Q8bzu10YF9i4O5amBkJBombYo.jpg,,12000000.0,"[{'id': 10402, 'name': 'Music'}, {'id': 18, 'n...",https://www.lionsgate.com/movies/i-still-believe,585244.0,en,I Still Believe,The true-life story of Christian music star Je...,21.05,/dqA2FCzz4OMmXLitKopzf476RVB.jpg,"[{'id': 100855, 'logo_path': None, 'name': 'Ke...","[{'iso_3166_1': 'US', 'name': 'United States o...",2020-03-12,16069730.0,115.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,One love can change your life.,I Still Believe,0.0,7.607,1094.0,PG
3925,tt9806370,0.0,/e8JvFt4wMxwkI62OCLCeytfeJrT.jpg,,0.0,"[{'id': 27, 'name': 'Horror'}, {'id': 9648, 'n...",,659986.0,en,The Owners,A group of friends think they found the perfec...,18.977,/zMUpzTRF4f4MkScDkUZM030H2t0.jpg,"[{'id': 92840, 'logo_path': '/4LyDs7gsAMq5sg3f...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",2020-08-27,225374.0,92.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Respect your elders... or else.,The Owners,0.0,5.675,335.0,
3932,tt9820556,0.0,/nz8xWrTKZzA5A7FgxaM4kfAoO1W.jpg,,0.0,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",,651571.0,en,Breach,A hardened mechanic must stay awake and mainta...,23.675,/13B6onhL6FzSN2KaNeQeMML05pS.jpg,"[{'id': 68628, 'logo_path': '/ez4m0Hw5puaHjNvG...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",2020-12-17,39328.0,92.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Deep in space they are not alone.,Breach,0.0,4.2,593.0,R
3952,tt9877170,0.0,/3e46k8KPCs1N7luAqNDFZFzrbbE.jpg,,7907534.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 28, ...",,661043.0,hi,मलंग,"Advait visits Goa where he meets Sara, a free-...",6.154,/noi7E47L0bpu60DO1jcYAvCqfSS.jpg,"[{'id': 85334, 'logo_path': '/l5cbo6KR6aRBoVfh...","[{'iso_3166_1': 'IN', 'name': 'India'}]",2020-02-06,11136444.0,135.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,Unleash The Madness,Malang,0.0,7.13,46.0,
3954,tt9883996,0.0,/vpXeDFxCKmRtVCeWwt7ODOwYDmB.jpg,,20000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",https://bleeckerstreetmedia.com/dream-horse,603206.0,en,Dream Horse,"The inspiring true story of Dream Alliance, an...",11.105,/uF1mnSdf9EqDIm5XfODAHU6AcWC.jpg,"[{'id': 6705, 'logo_path': '/e8EXNSfwr5E9d3TR8...","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",2021-05-21,6435260.0,113.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Hearts will race.,Dream Horse,0.0,7.029,103.0,


## Normalize and clean data into tables

### Basics

In [12]:
# Load the saved basics table; its tconst values are used further down to
# restrict the TMDB rows to known titles.
basics_csv = 'MovieData/basics_data.csv.gz'
titles_filtered = pd.read_csv(basics_csv)
titles_filtered.head(2)

Unnamed: 0,tconst,primary_title,start_year,runtime_mins
0,tt0035423,Kate & Leopold,2001.0,118
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,70


In [13]:
# Pull the columns needed for the basics table out of the TMDB results
basics_source_cols = ['imdb_id', 'title', 'release_date', 'runtime']
dfbasics = df_tmdb[basics_source_cols]
dfbasics.head(2)

Unnamed: 0,imdb_id,title,release_date,runtime
1,tt0035423,Kate & Leopold,2001-12-25,118.0
4,tt0118589,Glitter,2001-09-21,104.0


In [14]:
# Split release_date into year, month and day columns.
# Approach from: https://stackoverflow.com/questions/55776571/how-to-split-a-date-column-into-separate-day-month-year-column-in-pandas

# Convert to datetime; values that cannot be parsed become NaT (errors='coerce')
release_dt = pd.to_datetime(dfbasics['release_date'], errors='coerce')

# assign() returns a new frame, so the slice taken from df_tmdb is not modified
dfbasics = dfbasics.assign(
    release_date=release_dt,
    year=release_dt.dt.year,
    month=release_dt.dt.month,
    day=release_dt.dt.day,
)
dfbasics.head(2)

Unnamed: 0,imdb_id,title,release_date,runtime,year,month,day
1,tt0035423,Kate & Leopold,2001-12-25,118.0,2001.0,12.0,25.0
4,tt0118589,Glitter,2001-09-21,104.0,2001.0,9.0,21.0


In [15]:
# Only the year is kept: discard the other date parts and the full date
unused_date_cols = ['month', 'day', 'release_date']
dfbasics = dfbasics.drop(columns=unused_date_cols)
dfbasics.head(2)

Unnamed: 0,imdb_id,title,runtime,year
1,tt0035423,Kate & Leopold,118.0,2001.0
4,tt0118589,Glitter,104.0,2001.0


In [16]:
# Rename the TMDB columns to the column names used by the basics table
basics_column_names = {
    'imdb_id': "tconst",
    'title': "primary_title",
    'runtime': 'runtime_mins',
    'year': 'start_year',
}
dfbasics = dfbasics.rename(columns=basics_column_names)
dfbasics.head(2)

Unnamed: 0,tconst,primary_title,runtime_mins,start_year
1,tt0035423,Kate & Leopold,118.0,2001.0
4,tt0118589,Glitter,104.0,2001.0


In [17]:
# Count the missing (NaN) values per column after the rename
dfbasics.isna().sum()

tconst           0
primary_title    0
runtime_mins     0
start_year       1
dtype: int64

In [18]:
# Keep only the titles whose tconst also appears in titles_filtered
known_tconsts = titles_filtered['tconst']
filtered = dfbasics['tconst'].isin(known_tconsts)
dfbasics = dfbasics[filtered]
dfbasics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6299 entries, 1 to 3954
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         6299 non-null   object 
 1   primary_title  6299 non-null   object 
 2   runtime_mins   6299 non-null   float64
 3   start_year     6298 non-null   float64
dtypes: float64(2), object(2)
memory usage: 246.1+ KB


In [19]:
# Append the cleaned rows to the `basics` table.
# A failure is reported in the cell output instead of stopping the notebook.
# NOTE(review): if_exists='append' adds the same rows again each time this
# cell is run.
# Pattern from:  https://www.datacamp.com/tutorial/exception-handling-python
try:
    dfbasics.to_sql(name='basics', con=engine, index=False, if_exists='append')
except Exception as err:
    print('cannot append', err)

In [20]:
# Display five rows of `basics` from SQL, highest tconst first
q = """
SELECT * FROM basics
ORDER BY tconst DESC
LIMIT 5;
"""
pd.read_sql(sql=q, con=engine)

Unnamed: 0,tconst,primary_title,start_year,runtime_mins
0,tt9916362,Coven,2020.0,92
1,tt9916190,Safeguard,2020.0,95
2,tt9916170,The Rehearsal,2019.0,51
3,tt9915872,The Last White Witch,2019.0,97
4,tt9914942,Life Without Sara Amat,2019.0,74


### Ratings

In [21]:
# Pull the id and vote columns needed for the ratings table
ratings_source_cols = ['imdb_id', 'vote_average', 'vote_count']
dfratings = df_tmdb[ratings_source_cols]
dfratings.head(2)

Unnamed: 0,imdb_id,vote_average,vote_count
1,tt0035423,6.319,1238.0
4,tt0118589,4.405,132.0


In [22]:
# Rename the TMDB columns to the column names used by the ratings table
ratings_column_names = {
    'imdb_id': 'tconst',
    'vote_average': 'avg_rating',
    'vote_count': 'num_votes',
}
dfratings = dfratings.rename(columns=ratings_column_names)
dfratings.head(2)

Unnamed: 0,tconst,avg_rating,num_votes
1,tt0035423,6.319,1238.0
4,tt0118589,4.405,132.0


In [23]:
# Keep only the ratings whose tconst is present in dfbasics
basics_tconsts = dfbasics['tconst']
filtered = dfratings['tconst'].isin(basics_tconsts)
dfratings = dfratings[filtered]
dfratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6299 entries, 1 to 3954
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   tconst      6299 non-null   object 
 1   avg_rating  6299 non-null   float64
 2   num_votes   6299 non-null   float64
dtypes: float64(2), object(1)
memory usage: 196.8+ KB


In [24]:
# Append the cleaned rows to the `ratings` table.
# A failure is reported in the cell output instead of stopping the notebook.
# NOTE(review): if_exists='append' adds the same rows again each time this
# cell is run.
try:
    dfratings.to_sql(name='ratings', con=engine, index=False, if_exists='append')
except Exception as err:
    print('cannot append', err)

In [25]:
# Display five rows of `ratings` from SQL, highest tconst first
q = """
SELECT * FROM  ratings
ORDER BY tconst DESC
LIMIT 5;
"""
pd.read_sql(sql=q, con=engine)

Unnamed: 0,tconst,avg_rating,num_votes
0,tt9916362,6.4,5422
1,tt9916362,6.4,5422
2,tt9916190,3.7,243
3,tt9916190,3.7,243
4,tt9916170,7.0,7


### TMDB_data

In [26]:
# Pull the id, financial and certification columns for the tmdb_data table
tmdb_data_cols = ['imdb_id', 'revenue', 'budget', 'certification']
dftmdb = df_tmdb[tmdb_data_cols]
dftmdb.head(2)

Unnamed: 0,imdb_id,revenue,budget,certification
1,tt0035423,76019048.0,48000000.0,PG-13
4,tt0118589,5271666.0,22000000.0,PG-13


In [27]:
# Use the same key column name (tconst) as the other tables
tmdb_column_names = {'imdb_id': "tconst"}
dftmdb = dftmdb.rename(columns=tmdb_column_names)
dftmdb.head(2)

Unnamed: 0,tconst,revenue,budget,certification
1,tt0035423,76019048.0,48000000.0,PG-13
4,tt0118589,5271666.0,22000000.0,PG-13


In [28]:
# Keep only the rows whose tconst is present in dfbasics
basics_tconsts = dfbasics['tconst']
filtered = dftmdb['tconst'].isin(basics_tconsts)
dftmdb = dftmdb[filtered]
dftmdb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6299 entries, 1 to 3954
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         6299 non-null   object 
 1   revenue        6299 non-null   float64
 2   budget         6299 non-null   float64
 3   certification  6299 non-null   object 
dtypes: float64(2), object(2)
memory usage: 246.1+ KB


In [29]:
# Append the cleaned rows to the `tmdb_data` table.
# The call is the cell's last expression, so its return value is displayed,
# and any error is raised rather than caught.
# NOTE(review): if_exists='append' adds the same rows again each time this
# cell is run.
dftmdb.to_sql(name='tmdb_data', con=engine, index=False, if_exists='append')
    
    

6299

In [30]:
# Frequency of each distinct revenue value, most common first
dftmdb['revenue'].value_counts()


1000000.0      12
2000000.0      11
1100000.0       9
12000000.0      8
5000000.0       7
               ..
27640028.0      1
12235843.0      1
195702811.0     1
47301471.0      1
6435260.0       1
Name: revenue, Length: 6083, dtype: int64

In [31]:
# Display 100 rows of `tmdb_data` from SQL, highest tconst first
q = """
SELECT * FROM tmdb_data
ORDER BY tconst DESC
LIMIT 100;
"""
pd.read_sql(sql=q, con=engine)

Unnamed: 0,tconst,revenue,budget,certification
0,tt9894470,23101.0,1000000.0,NR
1,tt9883996,6435260.0,20000000.0,
2,tt9877170,11136444.0,7907534.0,
3,tt9845398,103047.0,0.0,
4,tt9845110,208723.0,0.0,NR
...,...,...,...,...
95,tt8772262,47969371.0,9000000.0,R
96,tt8761814,3393975.0,0.0,PG
97,tt8758086,200046.0,0.0,PG-13
98,tt8753438,2996763.0,7300000.0,


In [32]:
# Show the column definitions (field, type, nullability, key) of `tmdb_data`
q = """
DESCRIBE tmdb_data
"""
pd.read_sql(sql=q, con=engine)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,tconst,text,YES,,,
1,revenue,double,YES,,,
2,budget,double,YES,,,
3,certification,text,YES,,,


In [33]:
# Save the cleaned tmdb_data rows as a gzip-compressed CSV (row index omitted)
dftmdb.to_csv("MovieData/clean_tmdb.csv.gz", index=False, compression='gzip')

In [34]:
# Save the cleaned ratings rows as a gzip-compressed CSV (row index omitted)
dfratings.to_csv("MovieData/clean_ratings.csv.gz", index=False, compression='gzip')

In [35]:
# Save the cleaned basics rows as a gzip-compressed CSV (row index omitted)
dfbasics.to_csv("MovieData/clean_basics.csv.gz", index=False, compression='gzip')