# Project 2 Part 4
**Apply Hypothesis Testing**

Clean data
*Christina Brockway*

## Business Problem

- Need a MySQL database on Movies from a subset of IMDB's publicly available dataset.
- Use this database to analyze what makes a movie successul
- Provide recommendations to the staakeholder on how to make a movie successful
- Create 3 senarios with the dataset
      -  Perform statistical testing to get mathematically-supported answers
      -  Report if there is a significance difference between features
          -  If yes, what was the p-value?
          -  which feature earns the most revenue?
      -  Prepare a visualization that supports findings

### Import/Load Data

In [1]:
import os, time, json
import tmdbsimple as tmdb
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import missingno as msno
from tqdm.notebook import tqdm_notebook
import plotly.express as px
from sqlalchemy.engine import create_engine
from sqlalchemy_utils import database_exists, create_database
from sklearn.preprocessing import StandardScaler
import pymysql
pymysql.install_as_MySQLdb()
from urllib.parse import quote_plus
from sqlalchemy.types import *
import scipy.stats as stats

pd.set_option('display.max_columns', None)

In [2]:
#Mysql login
with open("/Users/csbro/.secret/mysql.json", "r") as f:
          login = json.load(f)
login.keys()

dict_keys(['username', 'password'])

In [3]:
#create connection with MySQL

#Define database
dbase = 'movies'

# Define your login credentials
username = login["username"]
password = login["password"]

# Create the connection string
connection = f'mysql+pymysql://{username}:{password}@localhost/{dbase}'

# Create the database engine
engine = create_engine(connection)

# Connect to the database
engine = engine.connect()

In [4]:
q="""
SHOW TABLES;
"""
pd.read_sql(q, engine)

Unnamed: 0,Tables_in_movies
0,basics
1,genres
2,ratings
3,title_genres
4,tmdb_data


In [5]:
#Combine files with glob

import glob
q= "MovieData/*tmdb*.json"
tmdb_glob = sorted(glob.glob(q, recursive=True))
tmdb_glob

['MovieData\\tmdb_api_results 2001.json',
 'MovieData\\tmdb_api_results 2002.json',
 'MovieData\\tmdb_api_results 2010.json',
 'MovieData\\tmdb_api_results 2013.json',
 'MovieData\\tmdb_api_results 2014.json',
 'MovieData\\tmdb_api_results 2015.json',
 'MovieData\\tmdb_api_results 2016.json',
 'MovieData\\tmdb_api_results 2017.json',
 'MovieData\\tmdb_api_results 2018.json',
 'MovieData\\tmdb_api_results 2019.json',
 'MovieData\\tmdb_api_results 2020.json',
 'MovieData\\tmdb_api_results_2000.json',
 'MovieData\\tmdb_api_results_2003.json',
 'MovieData\\tmdb_api_results_2004.json',
 'MovieData\\tmdb_api_results_2005.json',
 'MovieData\\tmdb_api_results_2006.json',
 'MovieData\\tmdb_api_results_2007.json',
 'MovieData\\tmdb_api_results_2008.json',
 'MovieData\\tmdb_api_results_2009.json',
 'MovieData\\tmdb_api_results_2011.json',
 'MovieData\\tmdb_api_results_2012.json']

In [6]:
#Loading all files into dataframe
df_glob = []
for file in tmdb_glob:
    temp_df = pd.read_json(file)
    df_glob.append(temp_df)
#concat files
tmdb = pd.concat(df_glob)
tmdb.head(2)

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,,,,,,,,,,,,,,,,
1,tt0035423,0.0,/tJLV3BAlHOgscVOrA99Wnb2gAef.jpg,,48000000.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 14, ...",,11232.0,en,Kate & Leopold,When her scientist ex-boyfriend discovers a po...,13.382,/mUvikzKJJSg9khrVdxK8kg3TMHA.jpg,"[{'id': 85, 'logo_path': None, 'name': 'Konrad...","[{'iso_3166_1': 'US', 'name': 'United States o...",2001-12-25,76019048.0,118.0,"[{'english_name': 'French', 'iso_639_1': 'fr',...",Released,"If they lived in the same century, they'd be p...",Kate & Leopold,0.0,6.319,1238.0,PG-13


In [7]:
## Inspect the data
tmdb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58621 entries, 0 to 2501
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                58621 non-null  object 
 1   adult                  58600 non-null  float64
 2   backdrop_path          37983 non-null  object 
 3   belongs_to_collection  4056 non-null   object 
 4   budget                 58600 non-null  float64
 5   genres                 58600 non-null  object 
 6   homepage               58600 non-null  object 
 7   id                     58600 non-null  float64
 8   original_language      58600 non-null  object 
 9   original_title         58600 non-null  object 
 10  overview               58600 non-null  object 
 11  popularity             58600 non-null  float64
 12  poster_path            54080 non-null  object 
 13  production_companies   58600 non-null  object 
 14  production_countries   58600 non-null  object 
 15  rel

In [8]:
tmdb.head(2)

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,,,,,,,,,,,,,,,,
1,tt0035423,0.0,/tJLV3BAlHOgscVOrA99Wnb2gAef.jpg,,48000000.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 14, ...",,11232.0,en,Kate & Leopold,When her scientist ex-boyfriend discovers a po...,13.382,/mUvikzKJJSg9khrVdxK8kg3TMHA.jpg,"[{'id': 85, 'logo_path': None, 'name': 'Konrad...","[{'iso_3166_1': 'US', 'name': 'United States o...",2001-12-25,76019048.0,118.0,"[{'english_name': 'French', 'iso_639_1': 'fr',...",Released,"If they lived in the same century, they'd be p...",Kate & Leopold,0.0,6.319,1238.0,PG-13


In [9]:
tmdb.isna().sum()

imdb_id                      0
adult                       21
backdrop_path            20638
belongs_to_collection    54565
budget                      21
genres                      21
homepage                    21
id                          21
original_language           21
original_title              21
overview                    21
popularity                  21
poster_path               4541
production_companies        21
production_countries        21
release_date                21
revenue                     21
runtime                     21
spoken_languages            21
status                      21
tagline                     21
title                       21
video                       21
vote_average                21
vote_count                  21
certification            23594
dtype: int64

In [10]:
tmdb = tmdb.dropna(subset =['certification', 'revenue'])
tmdb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35027 entries, 1 to 2500
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                35027 non-null  object 
 1   adult                  35027 non-null  float64
 2   backdrop_path          22348 non-null  object 
 3   belongs_to_collection  2551 non-null   object 
 4   budget                 35027 non-null  float64
 5   genres                 35027 non-null  object 
 6   homepage               35027 non-null  object 
 7   id                     35027 non-null  float64
 8   original_language      35027 non-null  object 
 9   original_title         35027 non-null  object 
 10  overview               35027 non-null  object 
 11  popularity             35027 non-null  float64
 12  poster_path            32116 non-null  object 
 13  production_companies   35027 non-null  object 
 14  production_countries   35027 non-null  object 
 15  rel

In [11]:
df_tmdb=tmdb[tmdb['revenue']>=1]
df_tmdb.tail()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
2403,tt2318440,0.0,/rzYp2GNUEJiM6AUYmQOxxEV9oop.jpg,,0.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 16, '...",,136368.0,th,เอคโค่ จิ๋วก้องโลก,Story of the adventures of three young men fro...,4.849,/pt2SbE1UHMtrX69jAqHMIfPYdXQ.jpg,"[{'id': 79046, 'logo_path': None, 'name': 'Gol...","[{'iso_3166_1': 'HK', 'name': 'Hong Kong'}, {'...",2012-08-01,1553168.0,81.0,"[{'english_name': 'German', 'iso_639_1': 'de',...",Released,On a mission to save the world!,Adventure Planet,0.0,5.0,12.0,PG
2407,tt2318625,0.0,/aBwbgdmsCZBZ7trw8fDznPWsCne.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,128154.0,he,Out in the Dark,Two young men — a Palestinian grad student and...,10.099,/djqEoDUTQf9kUP24A5lEdBxzoQp.jpg,"[{'id': 23972, 'logo_path': None, 'name': 'M72...","[{'iso_3166_1': 'IL', 'name': 'Israel'}]",2012-09-09,26966.0,96.0,"[{'english_name': 'Arabic', 'iso_639_1': 'ar',...",Released,Love knows no borders.,Out in the Dark,0.0,6.846,120.0,NR
2425,tt2321517,0.0,/9aaFPP5idQRbsCM3mKeWjmclniX.jpg,,6000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,102197.0,ru,Шпион,Spring 1941. Center of Moscow. Duel of the two...,3.904,/cpKkXycuMisfEXiMT2tSkcD84sN.jpg,"[{'id': 19126, 'logo_path': '/g9aqRl0i7BrVduxJ...","[{'iso_3166_1': 'RU', 'name': 'Russia'}]",2012-04-05,4588176.0,99.0,"[{'english_name': 'Russian', 'iso_639_1': 'ru'...",Released,,The Spy,0.0,5.161,31.0,
2430,tt2322457,0.0,/cDRXnHcVzRFUtsBk7f1RDXJOLtg.jpg,,0.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",,132313.0,da,Gummi T,Poor Ivan Olsen is plagued by problems bullied...,3.194,/mKdtc8e1BNO3k9kGqTUoOqkJPzo.jpg,"[{'id': 9327, 'logo_path': None, 'name': 'Cron...","[{'iso_3166_1': 'DK', 'name': 'Denmark'}]",2012-04-16,6900000.0,80.0,"[{'english_name': 'Danish', 'iso_639_1': 'da',...",Released,From Ordinary Zero to Extarordinary Hero,Ivan the Incredible,0.0,5.646,24.0,PG
2473,tt2330866,0.0,/h1vNqJzPCTPg9Hhfxtv2XdE6Pna.jpg,,0.0,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",,124157.0,ko,도둑들,A gang of South Korean thieves team up with a ...,16.526,/eYJihoMqME80tp9uJIpIlRF9RI0.jpg,"[{'id': 118358, 'logo_path': '/57u7ny0hdDRaDdw...","[{'iso_3166_1': 'HK', 'name': 'Hong Kong'}, {'...",2012-07-25,83519699.0,135.0,"[{'english_name': 'Korean', 'iso_639_1': 'ko',...",Released,All For The Money. One For The Revenge. Every ...,The Thieves,0.0,6.711,265.0,NR


## Normalize and clean data into tables

### Basics

In [12]:
titles_filtered = pd.read_csv('MovieData/basics_data.csv.gz')
titles_filtered.head(2)

Unnamed: 0,tconst,primary_title,start_year,runtime_mins
0,tt0035423,Kate & Leopold,2001.0,118
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,70


In [13]:
dfbasics = df_tmdb[['imdb_id', 'title', 'release_date', 'runtime']]
dfbasics.head(2)

Unnamed: 0,imdb_id,title,release_date,runtime
1,tt0035423,Kate & Leopold,2001-12-25,118.0
4,tt0118589,Glitter,2001-09-21,104.0


In [14]:
#split release_date into day month and year
dfbasics = dfbasics.copy()
dfbasics['release_date'] = pd.to_datetime(dfbasics['release_date'], errors='coerce')  # Convert to datetime
dfbasics['year'] = dfbasics['release_date'].dt.year
dfbasics['month'] = dfbasics['release_date'].dt.month
dfbasics['day'] = dfbasics['release_date'].dt.day
dfbasics.head(2)

#From: https://stackoverflow.com/questions/55776571/how-to-split-a-date-column-into-separate-day-month-year-column-in-pandas

Unnamed: 0,imdb_id,title,release_date,runtime,year,month,day
1,tt0035423,Kate & Leopold,2001-12-25,118.0,2001.0,12.0,25.0
4,tt0118589,Glitter,2001-09-21,104.0,2001.0,9.0,21.0


In [15]:
dfbasics= dfbasics.drop(columns=['month', 'day', 'release_date'], axis=1)
dfbasics.head(2)

Unnamed: 0,imdb_id,title,runtime,year
1,tt0035423,Kate & Leopold,118.0,2001.0
4,tt0118589,Glitter,104.0,2001.0


In [16]:
dfbasics = dfbasics.rename(columns= {'imdb_id': "tconst", 'title':"primary_title", 'runtime': 'runtime_mins', 'year': 'start_year'})
dfbasics.head(2)

Unnamed: 0,tconst,primary_title,runtime_mins,start_year
1,tt0035423,Kate & Leopold,118.0,2001.0
4,tt0118589,Glitter,104.0,2001.0


In [17]:
dfbasics.isna().sum()

tconst           0
primary_title    0
runtime_mins     0
start_year       1
dtype: int64

In [18]:
filtered = dfbasics['tconst'].isin(titles_filtered['tconst'])
dfbasics = dfbasics[filtered]
dfbasics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6479 entries, 1 to 2473
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         6479 non-null   object 
 1   primary_title  6479 non-null   object 
 2   runtime_mins   6479 non-null   float64
 3   start_year     6478 non-null   float64
dtypes: float64(2), object(2)
memory usage: 253.1+ KB


In [19]:
try: 
    dfbasics. to_sql('basics', engine, index=False, if_exists='append')
    
except Exception as e:
    print('cannot append', e)


## From:  https://www.datacamp.com/tutorial/exception-handling-python

In [20]:
#display data in sql
q="""
SELECT * FROM basics
ORDER BY tconst DESC
LIMIT 5;
"""
pd.read_sql(q, engine)

Unnamed: 0,tconst,primary_title,start_year,runtime_mins,created_date,updated_date
0,tt9894470,VFW,2019.0,92,2023-11-11 18:30:50,2023-11-11 18:30:50
1,tt9883996,Dream Horse,2021.0,113,2023-11-11 18:30:50,2023-11-11 18:30:50
2,tt9877170,Malang,2020.0,135,2023-11-11 18:30:50,2023-11-11 18:30:50
3,tt9845398,End of the Century,2019.0,84,2023-11-11 18:30:50,2023-11-11 18:30:50
4,tt9845110,Two of Us,2020.0,92,2023-11-11 18:30:50,2023-11-11 18:30:50


### Ratings

In [21]:
dfratings = df_tmdb[['imdb_id', 'vote_average', 'vote_count']]
dfratings.head(2)

Unnamed: 0,imdb_id,vote_average,vote_count
1,tt0035423,6.319,1238.0
4,tt0118589,4.405,132.0


In [22]:
dfratings = dfratings.rename(columns={'imdb_id': 'tconst',
                                     'vote_average': 'avg_rating', 'vote_count': 'num_votes'})
dfratings.head(2)

Unnamed: 0,tconst,avg_rating,num_votes
1,tt0035423,6.319,1238.0
4,tt0118589,4.405,132.0


In [23]:
filtered = dfratings['tconst'].isin(dfbasics['tconst'])
dfratings = dfratings[filtered]
dfratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6479 entries, 1 to 2473
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   tconst      6479 non-null   object 
 1   avg_rating  6479 non-null   float64
 2   num_votes   6479 non-null   float64
dtypes: float64(2), object(1)
memory usage: 202.5+ KB


In [24]:
try:
    dfratings.to_sql('ratings', engine, index=False, if_exists='append')
except Exception as e:
    print('cannot append', e)

In [25]:
#display data in sql
q="""
SELECT * FROM  ratings
ORDER BY tconst DESC
LIMIT 5;
"""
pd.read_sql(q, engine)

Unnamed: 0,avg_rating,num_votes,date_created,date_update,tconst
0,5.9,206,2023-11-11 18:30:50,2023-11-11 18:30:50,tt9894470
1,7.029,103,2023-11-11 18:30:50,2023-11-11 18:30:50,tt9883996
2,7.13,46,2023-11-11 18:30:50,2023-11-11 18:30:50,tt9877170
3,7.263,112,2023-11-11 18:30:50,2023-11-11 18:30:50,tt9845398
4,7.09,144,2023-11-11 18:30:50,2023-11-11 18:30:50,tt9845110


### TMDB_data

In [26]:
dftmdb= df_tmdb[['imdb_id', 'revenue', 'budget', 'certification']]
dftmdb.head(2)

Unnamed: 0,imdb_id,revenue,budget,certification
1,tt0035423,76019048.0,48000000.0,PG-13
4,tt0118589,5271666.0,22000000.0,PG-13


In [27]:
dftmdb=dftmdb.rename(columns= {'imdb_id': "tconst"})
dftmdb.head(2)

Unnamed: 0,tconst,revenue,budget,certification
1,tt0035423,76019048.0,48000000.0,PG-13
4,tt0118589,5271666.0,22000000.0,PG-13


In [28]:
filtered = dftmdb['tconst'].isin(dfbasics['tconst'])
dftmdb = dftmdb[filtered]
dftmdb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6479 entries, 1 to 2473
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         6479 non-null   object 
 1   revenue        6479 non-null   float64
 2   budget         6479 non-null   float64
 3   certification  6479 non-null   object 
dtypes: float64(2), object(2)
memory usage: 253.1+ KB


In [30]:
#try:
dftmdb.to_sql('tmdb_data', engine, index=False, if_exists='append')
#except Exception as e:
   #print('cannot append', e)
    
    

6479

In [31]:
dftmdb['revenue'].value_counts()


1000000.0     12
2000000.0     11
1100000.0      9
12000000.0     8
5000000.0      7
              ..
165.0          1
1788.0         1
1063893.0      1
81705746.0     1
83519699.0     1
Name: revenue, Length: 6255, dtype: int64

In [32]:
#Display data in SQL
q="""
SELECT * FROM tmdb_data
ORDER BY tconst DESC
LIMIT 100;
"""
pd.read_sql(q, engine)

Unnamed: 0,tconst,revenue,budget,certification
0,tt9894470,23101.0,1000000.0,NR
1,tt9883996,6435260.0,20000000.0,
2,tt9877170,11136400.0,7907530.0,
3,tt9845398,103047.0,0.0,
4,tt9845110,208723.0,0.0,NR
...,...,...,...,...
95,tt8758086,200046.0,0.0,PG-13
96,tt8753438,2996760.0,7300000.0,
97,tt8744094,7141570.0,0.0,R
98,tt8737608,4155490.0,0.0,


In [33]:
q="""
DESCRIBE tmdb_data
"""
pd.read_sql(q, engine)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,tconst,char(15),NO,PRI,,
1,revenue,float,YES,,,
2,budget,float,YES,,,
3,certification,char(10),YES,,,


In [34]:
dftmdb.to_csv(f"MovieData/clean_tmdb.csv.gz", compression = 'gzip', index=False)

In [35]:
dfratings.to_csv(f"MovieData/clean_ratings.csv.gz", compression = 'gzip', index=False)

In [36]:
dfbasics.to_csv(f"MovieData/clean_basics.csv.gz", compression = 'gzip', index=False)