# Imports

In [1]:
# Import packages
import json, os
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import pymysql
pymysql.install_as_MySQLdb()
from urllib.parse import quote_plus as urlquote
pd.set_option('display.max_columns',50)

In [2]:
with open('/Users/17175/.secret/mysql.json')as f:
    login=json.load(f)
login.keys()

dict_keys(['username', 'password'])

In [3]:
connection = f"mysql+pymysql://{login['username']}:{urlquote(login['password'])}@localhost/Movies"
engine = create_engine(connection)

In [7]:
basics = pd.read_csv('Data/Filtered_Title_Basics')
basics.head()

Unnamed: 0.1,Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,61114,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,67666,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,86793,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
4,93930,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


# Normalizing Genres

## Getting a list of unique genres

In [9]:
## create a col with a list of genres
basics['genres_split'] = basics['genres'].str.split(',')
basics.head(1)

Unnamed: 0.1,Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,genres_split
0,34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance","[Comedy, Fantasy, Romance]"


In [11]:
#explode genres to split each into a new row
## create a col with a list of genres
exploded_genres = basics.explode('genres_split')
exploded_genres.head()

Unnamed: 0.1,Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,genres_split
0,34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance",Comedy
0,34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance",Fantasy
0,34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance",Romance
1,61114,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama,Drama
2,67666,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama,Drama


In [12]:
#use .unique to get the unique genres and make a list
unique_genres = sorted(exploded_genres['genres_split'].unique())

## Create a genre mapper dictionary to replace string genres with integers

In [14]:
## Making the genre mapper dictionary
genre_ints = range(len(unique_genres))
genre_map = dict(zip(unique_genres, genre_ints))
genre_map

{'Action': 0,
 'Adult': 1,
 'Adventure': 2,
 'Animation': 3,
 'Biography': 4,
 'Comedy': 5,
 'Crime': 6,
 'Drama': 7,
 'Family': 8,
 'Fantasy': 9,
 'Game-Show': 10,
 'History': 11,
 'Horror': 12,
 'Music': 13,
 'Musical': 14,
 'Mystery': 15,
 'News': 16,
 'Reality-TV': 17,
 'Romance': 18,
 'Sci-Fi': 19,
 'Sport': 20,
 'Talk-Show': 21,
 'Thriller': 22,
 'War': 23,
 'Western': 24}

## Replace the string genres in title_genres with the new integer ids

In [15]:
## make new integer genre_id and drop string genres
exploded_genres['genre_id'] = exploded_genres['genres_split'].map(genre_map)
exploded_genres = exploded_genres.drop(columns='genres_split')

In [16]:
exploded_genres.head()

Unnamed: 0.1,Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,genre_id
0,34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance",5
0,34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance",9
0,34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance",18
1,61114,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama,7
2,67666,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama,7


## Create a new title_genres table

In [20]:
#save just tconst and genres_split as new df
title_genres = exploded_genres[['tconst','genre_id']].copy()
title_genres.head()

Unnamed: 0,tconst,genre_id
0,tt0035423,5
0,tt0035423,9
0,tt0035423,18
1,tt0062336,7
2,tt0069049,7


## add the new table to mysql

In [21]:
title_genres.to_sql("title_genres",engine,index=False, if_exists='append')

162600

In [22]:
q='''DESCRIBE title_genres'''
pd.read_sql(q,engine)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,tconst,varchar(55),NO,PRI,,
1,genre_id,int,NO,PRI,,


In [23]:
q='''SELECT *
    FROM title_genres
    LIMIT 5'''
pd.read_sql(q,engine)

Unnamed: 0,tconst,genre_id
0,tt0035423,5
1,tt0035423,9
2,tt0035423,18
3,tt0062336,7
4,tt0069049,7


## Convert the genre map dictionary into a dataframe

In [19]:
# manually ,ake dataframe with named cols from the .keys and .values
genre_lookup = pd.DataFrame({'genre_id':genre_map.values(),
                             'genre_name':genre_map.keys()})
genre_lookup.head()

Unnamed: 0,genre_id,genre_name
0,0,Action
1,1,Adult
2,2,Adventure
3,3,Animation
4,4,Biography


## add the new table to mysql

In [24]:
genre_lookup.to_sql("genres",engine,index=False,if_exists='append')

25

In [25]:
q='''DESCRIBE genres'''
pd.read_sql(q,engine)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,genre_id,int,NO,PRI,,
1,genre_name,varchar(55),YES,,,


In [26]:
q='''SELECT *
    FROM genres
    LIMIT 5'''
pd.read_sql(q,engine)

Unnamed: 0,genre_id,genre_name
0,0,Action
1,1,Adult
2,2,Adventure
3,3,Animation
4,4,Biography


# Creating MySQL tables with a primary key using python

## Creating a data type schema for to_sql

In [30]:
tmdb = pd.read_csv('Data/tmdb_results_combined.csv.gz')

In [40]:
from sqlalchemy.types import *
## Calculate max string lengths for object columns
key_len = tmdb['imdb_id'].fillna('').map(len).max()
cert_len = tmdb['certification\r'].fillna('').map(len).max()
## Create a schema dictonary using Sqlalchemy datatype objects
df_schema = {
    "imdb_id": CHAR(key_len+1), 
    "budget": FLOAT(),
    "revenue": FLOAT(),
    'certification\r':CHAR(key_len+1)}

## run df.to_sql with the dtype argument

In [41]:
# Save to sql with dtype and index=False
tmdb.to_sql('tmdb_data',engine, dtype=df_schema, if_exists='replace',index=False)

ProgrammingError: (pymysql.err.ProgrammingError) (1166, "Incorrect column name 'certification\r'")
[SQL: 
CREATE TABLE tmdb_data (
	imdb_id CHAR(11), 
	adult FLOAT(53), 
	backdrop_path TEXT, 
	belongs_to_collection TEXT, 
	budget FLOAT, 
	genres TEXT, 
	homepage TEXT, 
	id FLOAT(53), 
	original_language TEXT, 
	original_title TEXT, 
	overview TEXT, 
	popularity FLOAT(53), 
	poster_path TEXT, 
	production_companies TEXT, 
	production_countries TEXT, 
	release_date TEXT, 
	revenue FLOAT, 
	runtime FLOAT(53), 
	spoken_languages TEXT, 
	status TEXT, 
	tagline TEXT, 
	title TEXT, 
	video FLOAT(53), 
	vote_average FLOAT(53), 
	vote_count FLOAT(53), 
	`certification` CHAR(11)
)

]
(Background on this error at: https://sqlalche.me/e/14/f405)

## Run the query to add primary key

In [42]:
engine.execute('ALTER TABLE tmdb_data ADD PRIMARY KEY (`imdb_id`);')

ProgrammingError: (pymysql.err.ProgrammingError) (1146, "Table 'movies.tmdb_data' doesn't exist")
[SQL: ALTER TABLE tmdb_data ADD PRIMARY KEY (`imdb_id`);]
(Background on this error at: https://sqlalche.me/e/14/f405)