![png](Data/Movies-ERD.png)


In [1]:
# pandas is used throughout to load and preview the data
import pandas as pd

# Raise the column display cap to 50 so that wide frames are shown in full
pd.options.display.max_columns = 50

In [2]:
# Read the title basics extract into a DataFrame and preview the first rows.
# low_memory=False has pandas infer each column's dtype from the whole file.
b_path = "Data/Basics.csv"
basics = pd.read_csv(filepath_or_buffer=b_path, low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


In [3]:
# Preview the first five rows of the title basics frame
basics.head(n=5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


In [4]:
# Length of the longest primaryTitle (missing titles count as empty strings);
# this is the size a VARCHAR column for the title has to accommodate.
title_lengths = basics['primaryTitle'].fillna('').map(len)
primary_length = title_lengths.max()
primary_length

242

In [5]:
# Length of the longest genres string (missing values count as empty strings)
genre_string_lengths = basics['genres'].fillna('').map(len)
genre_length = genre_string_lengths.max()
genre_length

29

In [6]:
# Read the ratings extract into a DataFrame and preview the first rows.
# low_memory=False has pandas infer each column's dtype from the whole file.
r_path = "Data/Ratings.csv"
ratings = pd.read_csv(filepath_or_buffer=r_path, low_memory=False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0035423,6.4,87153
1,tt0062336,6.4,175
2,tt0069049,6.7,7754
3,tt0088751,5.2,336
4,tt0096056,5.6,846


In [7]:
# Map the raw column headers onto the snake_case names used in the diagram
ratings = ratings.rename(
    columns={
        'averageRating': 'average_rating',
        'numVotes': 'number_of_votes',
    }
)
ratings.head()

Unnamed: 0,tconst,average_rating,number_of_votes
0,tt0035423,6.4,87153
1,tt0062336,6.4,175
2,tt0069049,6.7,7754
3,tt0088751,5.2,336
4,tt0096056,5.6,846


In [8]:
import os
from getpass import getpass

import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists
from urllib.parse import quote_plus as urlquote

# Build the connection string following this format:
# connection = "dialect+driver://username:password@host:port/database"
# Credentials are read from the environment (MYSQL_USER / MYSQL_PASSWORD) so they
# are not committed with the notebook; the password is prompted for when unset.
db_username = os.environ.get("MYSQL_USER", "root")
db_password = os.environ.get("MYSQL_PASSWORD") or getpass("MySQL password: ")
# quote_plus escapes characters such as '@' or '/' that would otherwise break the URL
connection = f"mysql+pymysql://{urlquote(db_username)}:{urlquote(db_password)}@localhost/movies"

In [9]:
import json
from pathlib import Path

# Resolve the secrets file under the current user's home directory instead of a
# hardcoded absolute /Users/... path, so the notebook also runs on other machines.
# NOTE(review): this file holds Yelp API credentials and `login` is not used by
# any other visible cell — confirm whether this cell is still needed.
secrets_path = Path.home() / '.secret' / 'yelp_api.json'
with open(secrets_path) as f:
    login = json.load(f)
# Show only the key names; the secret values themselves are never displayed
login.keys()

dict_keys(['client-id', 'api-key'])

In [10]:
# Build the SQLAlchemy engine from the connection string
engine = create_engine(connection)

In [11]:
# Check that the database named in the connection string exists before connecting
database_exists(connection)

True

In [12]:
# Open a connection from the engine; `conn` is the handle used by every
# read_sql query and to_sql upload in the cells that follow
conn = engine.connect()

In [13]:
# List the tables that currently exist in the SQL database
q = "SHOW TABLES;"
pd.read_sql(sql=q, con=conn)

Unnamed: 0,Tables_in_movies
0,genres
1,ratings
2,title_basics
3,title_genres


In [14]:
# Inspect each table's columns with DESCRIBE, starting with genres
q = "DESCRIBE genres"
pd.read_sql(sql=q, con=conn)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,genre_id,int,NO,PRI,,auto_increment
1,genre_name,varchar(45),YES,,,


In [15]:
# Column layout of the ratings table
q = "DESCRIBE ratings"
pd.read_sql(sql=q, con=conn)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,tconst,varchar(10),NO,PRI,,
1,average_rating,"decimal(10,0)",YES,,,
2,number_of_votes,int,YES,,,


In [16]:
# Column layout of the title_basics table
q = "DESCRIBE title_basics"
pd.read_sql(sql=q, con=conn)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,tconst,varchar(10),NO,PRI,,
1,primary_title,varchar(256),YES,,,
2,start_year,float,YES,,,
3,runtime,int,YES,,,
4,ratings_tconst,varchar(10),YES,,,


In [17]:
# Column layout of the title_genres join table
q = "DESCRIBE title_genres"
pd.read_sql(sql=q, con=conn)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,title_basics_tconst,varchar(10),NO,PRI,,
1,genres_genre_id,int,NO,PRI,,


In [18]:
# Look at the raw basics columns again before shaping them for the table
basics.head(n=5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


In [19]:
# To write a DataFrame into a table, to_sql is given a dictionary that maps
# each column name to a SQLAlchemy datatype object.
# This wildcard import is what brings VARCHAR, FLOAT and INT into scope for the
# schema dictionaries built in later cells.
from sqlalchemy.types import *

In [20]:
# Take an independent copy of the frame. Plain assignment would only bind a
# second name to the same DataFrame, so in-place changes made through
# `basics_new` would also alter `basics`.
basics_new = basics.copy()
basics_new.head(1)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"


In [21]:
# Shape the frame for the title_basics table in one non-mutating chain:
# rename the columns to the table's names, then drop the columns it does not store.
# Starting from `basics` and avoiding inplace=True leaves the source frame
# untouched and makes this cell safe to re-run.
basics_new = (
    basics
    .rename(columns={
        'primaryTitle': 'primary_title',
        'startYear': 'start_year',
        'runtimeMinutes': 'runtime',
    })
    .drop(columns=['titleType', 'originalTitle', 'isAdult', 'endYear', 'genres'])
)

In [22]:
# Confirm the renamed and trimmed columns on a single row
basics_new.head(n=1)

Unnamed: 0,tconst,primary_title,start_year,runtime
0,tt0035423,Kate & Leopold,2001.0,118


In [23]:
# Schema for the title_basics upload: column name -> SQLAlchemy type instance
title_basics_dtypes_dict = dict(
    tconst=VARCHAR(10),
    primary_title=VARCHAR(256),
    start_year=FLOAT(),
    runtime=INT(),
)

In [24]:
# Write the frame into the 'title_basics' table using the schema dictionary.
# if_exists='append' adds the rows to the existing table instead of replacing it,
# and index=False keeps the DataFrame index out of the upload.
basics_new.to_sql(
    name='title_basics',
    con=conn,
    if_exists='append',
    index=False,
    dtype=title_basics_dtypes_dict,
)

86979

In [25]:
# Schema for the ratings upload: column name -> SQLAlchemy type instance.
# The keys have to be the DataFrame's column names; the rating column is
# 'average_rating' (singular), so the earlier 'average_ratings' key matched nothing.
# 'tconst' is listed as well, mirroring title_basics_dtypes_dict.
# NOTE(review): DESCRIBE reports the table column as decimal(10,0), which keeps
# no fractional digits — confirm the table definition if values like 6.4 matter.
ratings_dtypes_dict = {'tconst': VARCHAR(10),
                       'average_rating': FLOAT(),
                       'number_of_votes': INT()}

In [26]:
# Append the ratings rows to the existing 'ratings' table, leaving the index out
ratings.to_sql(
    name='ratings',
    con=conn,
    if_exists='append',
    index=False,
    dtype=ratings_dtypes_dict,
)

71900

In [29]:
# Both uploads are done, so release the database connection.
# NOTE(review): depending on the SQLAlchemy version, writes made through a
# Connection may need an explicit commit before close — confirm the rows persist.
conn.close()