# Movies Database part 2

In [1]:
import pandas as pd

import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists,create_database
from urllib.parse import quote_plus as urlquote

In [2]:
import json
with open('/Users/csbro/.secret/mysql.json') as f:
    login = json.load(f)
login.keys()

dict_keys(['username', 'password'])

## Calculate max string lengths for object columns

### Basics

In [34]:
basics=pd.read_csv('data/basic-filtered.csv')
basics.info()
basics.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86972 entries, 0 to 86971
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      86972 non-null  int64  
 1   tconst          86972 non-null  object 
 2   titleType       86972 non-null  object 
 3   primaryTitle    86972 non-null  object 
 4   originalTitle   86972 non-null  object 
 5   isAdult         86972 non-null  int64  
 6   startYear       86972 non-null  float64
 7   endYear         0 non-null      float64
 8   runtimeMinutes  86972 non-null  int64  
 9   genres          86972 non-null  object 
dtypes: float64(2), int64(3), object(5)
memory usage: 6.6+ MB


Unnamed: 0.1,Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,34800,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,61111,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,67485,tt0068865,movie,Lives of Performers,Lives of Performers,0,2016.0,,90,Drama
3,67663,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
4,86790,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"


In [4]:
from sqlalchemy.types import *

tconst_len = basics['tconst'].fillna("").map(len).max()
tconst_len

10

In [5]:
ttype_len = basics['titleType'].fillna("").map(len).max()
ttype_len

5

In [6]:
ptype_len = basics['primaryTitle'].fillna("").map(len).max()
ptype_len

242

In [7]:
otitle_len = basics['originalTitle'].fillna("").map(len).max()
otitle_len

242

In [8]:
genre_len = basics['genres'].fillna("").map(len).max()
genre_len

29

### Ratings

In [33]:
ratings=pd.read_csv('data/ratings-filtered.csv')
ratings.info()
ratings.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71904 entries, 0 to 71903
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     71904 non-null  int64  
 1   tconst         71904 non-null  object 
 2   averageRating  71904 non-null  float64
 3   numVotes       71904 non-null  int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 2.2+ MB


Unnamed: 0.1,Unnamed: 0,tconst,averageRating,numVotes
0,17961,tt0035423,6.4,87153
1,40764,tt0062336,6.4,175
2,46487,tt0068865,5.4,74
3,46645,tt0069049,6.7,7754
4,63640,tt0088751,5.2,336


In [10]:
tconstr_len = Ratings['tconst'].fillna("").map(len).max()
tconstr_len

10

## Movies ERD


![png](data/movieERD.png)

In [11]:
db_name = 'movies'
conn = f"mysql+pymysql://{login['username']}:{urlquote(login['password'])}@localhost/{db_name}"
engine = create_engine(conn)

In [12]:
#Check if db exists
if database_exists(conn):
    print("It exists!")
else:
    create_database(conn)
    print('Database created')

It exists!


In [13]:
#Preview names of tables
q ="""
SHOW TABLES;
"""
pd.read_sql(q,engine)

Unnamed: 0,Tables_in_movies
0,basics
1,genres
2,ratings
3,title_genres


## Preparing data and Importing

#### Basics tables

In [30]:

q = """
DESCRIBE basics;
"""
describe = pd.read_sql(q, engine)
describe

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,tconst,int,NO,PRI,,
1,primary_title,varchar(250),YES,,,
2,start_year,float,YES,,,
3,runtime_mins,int,YES,,,
4,created_date,datetime,YES,,CURRENT_TIMESTAMP,DEFAULT_GENERATED
5,updated_date,datetime,YES,,CURRENT_TIMESTAMP,DEFAULT_GENERATED on update CURRENT_TIMESTAMP


#### Checking columns and names

In [15]:
#checking field names
describe['Field'].values

array(['tconst', 'primary_title', 'start_year', 'runtime_mins',
       'created_date', 'updated_date'], dtype=object)

In [16]:
#checking dataframe's columns
basics.columns

Index(['Unnamed: 0', 'tconst', 'titleType', 'primaryTitle', 'originalTitle',
       'isAdult', 'startYear', 'endYear', 'runtimeMinutes', 'genres'],
      dtype='object')

In [28]:
#Drop unused columns
basics.drop(columns= ['Unnamed: 0', 'titleType', 'originalTitle', 'isAdult', 'endYear', 'genres'], 
            inplace = True, axis = 1)
basics.columns

Index(['tconst', 'primaryTitle', 'startYear', 'runtimeMinutes'], dtype='object')

In [29]:
#Rename column so names match
rename_basics = {'primaryTitle':'primary_title', 'startYear':'start_year', 'runtimeminutes':'runtime_mins'}
basics = basics.rename(rename_basics, axis = 1)
basics.head(1)

Unnamed: 0,tconst,primary_title,start_year,runtimeMinutes
0,tt0035423,Kate & Leopold,2001.0,118


#### Review Data Types

In [31]:
#Review SQL table data types
describe [['Field', 'Type']]

Unnamed: 0,Field,Type
0,tconst,int
1,primary_title,varchar(250)
2,start_year,float
3,runtime_mins,int
4,created_date,datetime
5,updated_date,datetime


In [32]:
#Review dataframe data types
basics.dtypes

tconst             object
primary_title      object
start_year        float64
runtimeMinutes      int64
dtype: object

In [37]:
#Change tconst from object to int
basics['tconst'] = pd.CHAR(15)(basics['tconst'])

AttributeError: module 'pandas' has no attribute 'CHAR'

In [17]:
#### Genres

In [18]:

q = """
DESCRIBE genres;
"""
describe = pd.read_sql(q, engine)
describe

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,genre_id,int,NO,PRI,,
1,genre_name,varchar(20),YES,,,


In [19]:
#### Ratings

In [20]:

q = """
DESCRIBE ratings;
"""
describe = pd.read_sql(q, engine)
describe

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,tconst,int,NO,PRI,,
1,avg_rating,float,YES,,,
2,num_votes,int,YES,,,
3,date_created,datetime,YES,,CURRENT_TIMESTAMP,DEFAULT_GENERATED
4,date_update,datetime,YES,,CURRENT_TIMESTAMP,DEFAULT_GENERATED on update CURRENT_TIMESTAMP


In [21]:
#### title_genres

In [22]:

q = """
DESCRIBE title_genres;
"""
describe = pd.read_sql(q, engine)
describe

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,tconst,int,NO,PRI,,
1,genre_id,int,YES,,,
