# Movies Database part 2

In [1]:
import pandas as pd

import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists,create_database
from urllib.parse import quote_plus as urlquote


In [2]:
import json
import os

# Resolve the credentials file under the current user's home directory rather
# than a machine-specific absolute path, so the notebook runs for any user.
login_path = os.path.join(os.path.expanduser('~'), '.secret', 'mysql.json')
with open(login_path) as f:
    login = json.load(f)
# Display only the key names so the credential values never reach the output.
login.keys()

dict_keys(['username', 'password'])

## Calculate max string lengths for object columns

### Basics

In [3]:
# Load the filtered title basics extract, then inspect its schema and first rows.
# The folder is spelled 'MovieData' to match every other path in this notebook;
# the previous 'Moviedata' spelling only resolves on case-insensitive file systems.
basics_df = pd.read_csv('MovieData/basics-filter.csv')
basics_df.info()
basics_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86972 entries, 0 to 86971
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      86972 non-null  int64  
 1   tconst          86972 non-null  object 
 2   titleType       86972 non-null  object 
 3   primaryTitle    86972 non-null  object 
 4   originalTitle   86972 non-null  object 
 5   isAdult         86972 non-null  int64  
 6   startYear       86972 non-null  float64
 7   endYear         0 non-null      float64
 8   runtimeMinutes  86972 non-null  int64  
 9   genres          86972 non-null  object 
dtypes: float64(2), int64(3), object(5)
memory usage: 6.6+ MB


Unnamed: 0.1,Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,34800,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,61111,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,67485,tt0068865,movie,Lives of Performers,Lives of Performers,0,2016.0,,90,Drama
3,67663,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
4,86790,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"


In [4]:
# Longest titleType value in characters; missing values count as length 0
ttype_len = basics_df['titleType'].fillna("").apply(len).max()
ttype_len

5

In [5]:
# Longest primaryTitle value in characters; missing values count as length 0
ptype_len = basics_df['primaryTitle'].fillna("").apply(len).max()
ptype_len

242

In [6]:
# Longest originalTitle value in characters; missing values count as length 0
otitle_len = basics_df['originalTitle'].fillna("").apply(len).max()
otitle_len

242

In [7]:
# Longest genres string in characters; missing values count as length 0
genre_len = basics_df['genres'].fillna("").apply(len).max()
genre_len

29

### Ratings

In [8]:
# Load the filtered ratings extract, then inspect its schema and first rows
ratings_df = pd.read_csv('MovieData/ratings-filter.csv')
ratings_df.info()
ratings_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71904 entries, 0 to 71903
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     71904 non-null  int64  
 1   tconst         71904 non-null  object 
 2   averageRating  71904 non-null  float64
 3   numVotes       71904 non-null  int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 2.2+ MB


Unnamed: 0.1,Unnamed: 0,tconst,averageRating,numVotes
0,17961,tt0035423,6.4,87153
1,40764,tt0062336,6.4,175
2,46487,tt0068865,5.4,74
3,46645,tt0069049,6.7,7754
4,63640,tt0088751,5.2,336


In [9]:
# Longest tconst identifier in the ratings data; missing values count as length 0
tconstr_len = ratings_df['tconst'].fillna("").apply(len).max()
tconstr_len

10

## Movies ERD


![png](data/moviesERD.png)

In [10]:
# Name of the MySQL database this notebook works against
db_name = 'movies'
# Connection URL for the pymysql driver; the password is URL-quoted so special
# characters in it do not break the URL
conn = f"mysql+pymysql://{login['username']}:{urlquote(login['password'])}@localhost/{db_name}"
# pool_pre_ping=True is passed so pooled connections are tested before use
engine = create_engine(conn, pool_pre_ping=True)

In [11]:
# List the tables that currently exist in the database
q = """
SHOW TABLES;
"""
pd.read_sql(sql=q, con=engine)

Unnamed: 0,Tables_in_movies
0,basics
1,genres
2,ratings
3,title_genres
4,tmdb_data


In [12]:
# Create the database only when it is missing; otherwise just report it
if not database_exists(conn):
    create_database(conn)
    print('Database created')
else:
    print("It exists!")

It exists!


## Preparing data and Importing

#### Basics table

In [13]:

# Fetch the column definitions of the existing `basics` table
q = """
DESCRIBE basics;
"""
describe = pd.read_sql(sql=q, con=engine)
describe

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,tconst,char(15),NO,PRI,,
1,primary_title,varchar(250),YES,,,
2,start_year,float,YES,,,
3,runtime_mins,int,YES,,,
4,created_date,datetime,YES,,CURRENT_TIMESTAMP,DEFAULT_GENERATED
5,updated_date,datetime,YES,,CURRENT_TIMESTAMP,DEFAULT_GENERATED on update CURRENT_TIMESTAMP


#### Checking columns and names

In [14]:
# Column names defined on the SQL table, as a NumPy array
describe['Field'].to_numpy()

array(['tconst', 'primary_title', 'start_year', 'runtime_mins',
       'created_date', 'updated_date'], dtype=object)

In [15]:
# Column names of the dataframe, to compare against the SQL table's fields
basics_df.columns

Index(['Unnamed: 0', 'tconst', 'titleType', 'primaryTitle', 'originalTitle',
       'isAdult', 'startYear', 'endYear', 'runtimeMinutes', 'genres'],
      dtype='object')

In [16]:
#Drop columns that have no counterpart in the SQL `basics` table
unused_basics_cols = ['Unnamed: 0', 'titleType', 'originalTitle', 'isAdult', 'endYear', 'genres']
# Reassign instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise a KeyError.
basics_df = basics_df.drop(columns=unused_basics_cols, errors='ignore')
basics_df.columns

Index(['tconst', 'primaryTitle', 'startYear', 'runtimeMinutes'], dtype='object')

In [17]:
# Map the dataframe's camelCase headers onto the SQL table's snake_case fields
rename_basics = {
    'primaryTitle': 'primary_title',
    'startYear': 'start_year',
    'runtimeMinutes': 'runtime_mins',
}
basics_df = basics_df.rename(columns=rename_basics)
basics_df.head(1)

Unnamed: 0,tconst,primary_title,start_year,runtime_mins
0,tt0035423,Kate & Leopold,2001.0,118


#### Review Data Types

In [18]:
# Field names and SQL types as reported by DESCRIBE
describe.loc[:, ['Field', 'Type']]

Unnamed: 0,Field,Type
0,tconst,char(15)
1,primary_title,varchar(250)
2,start_year,float
3,runtime_mins,int
4,created_date,datetime
5,updated_date,datetime


In [19]:
# Dataframe dtypes, to compare against the SQL column types shown by DESCRIBE
basics_df.dtypes

tconst            object
primary_title     object
start_year       float64
runtime_mins       int64
dtype: object

In [21]:
#Load Basics table
# `tconst` is the table's primary key, so appending rows that are already
# stored would fail on re-run; send only the titles the table does not hold yet.
loaded_tconst = pd.read_sql("SELECT tconst FROM basics;", engine)['tconst']
new_basics = basics_df[~basics_df['tconst'].isin(loaded_tconst)]
new_basics.to_sql("basics", engine, index=False, if_exists='append')



86972

In [22]:
# Read back a few rows to confirm the basics load succeeded
q = """
SELECT * FROM basics
LIMIT 5;
"""
pd.read_sql(sql=q, con=engine)

Unnamed: 0,tconst,primary_title,start_year,runtime_mins
0,tt0035423,Kate & Leopold,2001.0,118
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,70
2,tt0068865,Lives of Performers,2016.0,90
3,tt0069049,The Other Side of the Wind,2018.0,122
4,tt0088751,The Naked Monster,2005.0,100


In [24]:
# Save the cleaned basics data as a gzip-compressed CSV (plain string: the
# path has no placeholders, so no f-string prefix is needed)
basics_df.to_csv("MovieData/basics_data.csv.gz", compression='gzip', index=False)

#### Ratings table

In [26]:

# Fetch the column definitions of the existing `ratings` table
q = """
DESCRIBE ratings;
"""
describe = pd.read_sql(sql=q, con=engine)
describe

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,tconst,text,YES,,,
1,primary_title,text,YES,,,
2,start_year,double,YES,,,
3,runtime_mins,bigint,YES,,,


#### Checking columns and names

In [27]:
# Column names defined on the SQL table, as a NumPy array
describe['Field'].to_numpy()

array(['tconst', 'primary_title', 'start_year', 'runtime_mins'],
      dtype=object)

In [28]:
# Column names of the ratings dataframe, to compare against the SQL table's fields
ratings_df.columns

Index(['Unnamed: 0', 'tconst', 'averageRating', 'numVotes'], dtype='object')

In [29]:
#Drop the leftover CSV index column, which is not part of the ratings data
# Reassign instead of mutating in place, and ignore the column if it is already
# gone, so re-running this cell does not raise a KeyError.
ratings_df = ratings_df.drop(columns=['Unnamed: 0'], errors='ignore')
ratings_df.columns

Index(['tconst', 'averageRating', 'numVotes'], dtype='object')

In [30]:
# Map the dataframe's camelCase headers onto snake_case SQL field names
rename_ratings = {
    'averageRating': 'avg_rating',
    'numVotes': 'num_votes',
}
ratings_df = ratings_df.rename(columns=rename_ratings)
ratings_df.head(1)

Unnamed: 0,tconst,avg_rating,num_votes
0,tt0035423,6.4,87153


#### Review datatypes

In [31]:
# Field names and SQL types as reported by DESCRIBE
describe.loc[:, ['Field', 'Type']]

Unnamed: 0,Field,Type
0,tconst,text
1,primary_title,text
2,start_year,double
3,runtime_mins,bigint


In [32]:
# Dataframe dtypes, to compare against the SQL column types shown by DESCRIBE
ratings_df.dtypes

tconst         object
avg_rating    float64
num_votes       int64
dtype: object

In [34]:
#Load ratings table
from sqlalchemy.types import CHAR, Float, Integer

# if_exists='replace' drops and recreates the table, so spell out the column
# types instead of letting pandas infer generic ones (the DESCRIBE output for
# this table shows text/double/bigint columns). The tconst width comes from
# the longest identifier measured earlier, plus one character of headroom.
ratings_schema = {
    'tconst': CHAR(int(tconstr_len) + 1),
    'avg_rating': Float(),
    'num_votes': Integer(),
}
ratings_df.to_sql("ratings", engine, dtype=ratings_schema, index=False, if_exists='replace')

71904

In [35]:
# Save the cleaned ratings data as a gzip-compressed CSV (plain string: the
# path has no placeholders, so no f-string prefix is needed)
ratings_df.to_csv("MovieData/ratings_data.csv.gz", compression='gzip', index=False)

In [36]:
# Read back a few rows to confirm the ratings load succeeded
q = """
SELECT * FROM ratings
LIMIT 5;
"""
pd.read_sql(sql=q, con=engine)

Unnamed: 0,tconst,avg_rating,num_votes
0,tt0035423,6.4,87153
1,tt0062336,6.4,175
2,tt0068865,5.4,74
3,tt0069049,6.7,7754
4,tt0088751,5.2,336


### Genres

In [37]:

# Fetch the column definitions of the existing `genres` table
q = """
DESCRIBE genres;
"""
describe = pd.read_sql(sql=q, con=engine)
describe

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,imdb_id,text,YES,,,
1,adult,double,YES,,,
2,backdrop_path,text,YES,,,
3,belongs_to_collection,text,YES,,,
4,budget,double,YES,,,
5,genres,text,YES,,,
6,homepage,text,YES,,,
7,id,double,YES,,,
8,original_language,text,YES,,,
9,original_title,text,YES,,,


In [38]:
# Confirm data has been added
q = """
SELECT * FROM genres
LIMIT 5;
"""
pd.read_sql(q,engine)

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,tt0035423,0.0,/tJLV3BAlHOgscVOrA99Wnb2gAef.jpg,,48000000.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 14, ...",,11232.0,en,Kate & Leopold,...,76019048.0,118.0,"[{'english_name': 'French', 'iso_639_1': 'fr',...",Released,"If they lived in the same century, they'd be p...",Kate & Leopold,0.0,6.32,1237.0,PG-13
1,tt0118589,0.0,/9NZAirJahVilTiDNCHLFcdkwkiy.jpg,,22000000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...",,10696.0,en,Glitter,...,5271666.0,104.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,"In music she found her dream, her love, herself.",Glitter,0.0,4.405,132.0,PG-13
2,tt0118652,0.0,/mWxJEFRMvkG4UItYJkRDMgWQ08Y.jpg,,1000000.0,"[{'id': 27, 'name': 'Horror'}, {'id': 9648, 'n...",,17140.0,en,The Attic Expeditions,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,His search for peace of mind... will leave his...,The Attic Expeditions,0.0,5.156,32.0,R
3,tt0119004,0.0,/7xrlSPGDO4CDT6IHTctDlkYxTzw.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}]",,37857.0,en,Don's Plum,...,6297.0,108.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Tonight's Special - Group Therapy,Don's Plum,0.0,5.257,74.0,
4,tt0120681,0.0,/xo2S7gRwCvWdVqM0Swv37yA2rzw.jpg,,35000000.0,"[{'id': 27, 'name': 'Horror'}, {'id': 9648, 'n...",,768.0,en,From Hell,...,74558115.0,122.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Only the legend will survive.,From Hell,0.0,6.684,2595.0,R


### Title_genres

In [39]:

# Fetch the column definitions of the existing `title_genres` table
q = """
DESCRIBE title_genres;
"""
describe = pd.read_sql(sql=q, con=engine)
describe

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,imdb_id,text,YES,,,
1,adult,double,YES,,,
2,backdrop_path,text,YES,,,
3,belongs_to_collection,text,YES,,,
4,budget,double,YES,,,
5,genres,text,YES,,,
6,homepage,text,YES,,,
7,id,double,YES,,,
8,original_language,text,YES,,,
9,original_title,text,YES,,,


In [40]:
# Preview the first rows currently stored in `title_genres`
q = """
SELECT * FROM title_genres
LIMIT 5;
"""
pd.read_sql(sql=q, con=engine)

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,tt0035423,0.0,/tJLV3BAlHOgscVOrA99Wnb2gAef.jpg,,48000000.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 14, ...",,11232.0,en,Kate & Leopold,...,76019048.0,118.0,"[{'english_name': 'French', 'iso_639_1': 'fr',...",Released,"If they lived in the same century, they'd be p...",Kate & Leopold,0.0,6.32,1237.0,PG-13
1,tt0118589,0.0,/9NZAirJahVilTiDNCHLFcdkwkiy.jpg,,22000000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...",,10696.0,en,Glitter,...,5271666.0,104.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,"In music she found her dream, her love, herself.",Glitter,0.0,4.405,132.0,PG-13
2,tt0118652,0.0,/mWxJEFRMvkG4UItYJkRDMgWQ08Y.jpg,,1000000.0,"[{'id': 27, 'name': 'Horror'}, {'id': 9648, 'n...",,17140.0,en,The Attic Expeditions,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,His search for peace of mind... will leave his...,The Attic Expeditions,0.0,5.156,32.0,R
3,tt0119004,0.0,/7xrlSPGDO4CDT6IHTctDlkYxTzw.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}]",,37857.0,en,Don's Plum,...,6297.0,108.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Tonight's Special - Group Therapy,Don's Plum,0.0,5.257,74.0,
4,tt0120681,0.0,/xo2S7gRwCvWdVqM0Swv37yA2rzw.jpg,,35000000.0,"[{'id': 27, 'name': 'Horror'}, {'id': 9648, 'n...",,768.0,en,From Hell,...,74558115.0,122.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Only the legend will survive.,From Hell,0.0,6.684,2595.0,R
