Import libraries for data cleaning 

In [18]:
import csv
from tokenize import String

from notebooks.db_connection import get_db_engine
from notebooks.db_connection import execute_query
import numpy as np
import pandas as pd
import seaborn as sns
import os
import re

Reading CSV file

In [19]:
# Location of the raw crew export. Set the CREW_CSV_PATH environment variable to
# run the notebook on another machine; when it is unset, the original local
# path is used so existing behaviour is unchanged.
crew_csv_path = os.environ.get(
    "CREW_CSV_PATH", r"C:\Users\ricca\Desktop\CSV\crew.csv\crew.csv"
)
df_crew = pd.read_csv(crew_csv_path)

Preview of the records loaded from crew.csv (for a large DataFrame pandas displays only the first and last rows)

In [20]:
# Display the loaded frame; the rendered output is truncated to the first and last rows.
df_crew

Unnamed: 0,id,role,name
0,1000001,Director,Greta Gerwig
1,1000001,Producer,Tom Ackerley
2,1000001,Producer,Margot Robbie
3,1000001,Producer,Robbie Brenner
4,1000001,Producer,David Heyman
...,...,...,...
4720178,1941596,Casting,线雨轩
4720179,1941596,Editor,Eric Kwong Chi-Leung
4720180,1941596,Cinematography,Kenny Tse
4720181,1941596,Composer,胡小欧


Check the overall dimensions (rows, columns) of the DataFrame

In [21]:
# (number of rows, number of columns) of the frame.
df_crew.shape

(4720183, 3)

Rename the 'id' column to 'id_movie'. "inplace = True" modifies the DataFrame directly instead of returning a renamed copy

In [22]:
# Relabel the movie identifier column in place: "id" -> "id_movie".
df_crew.rename({"id": "id_movie"}, axis="columns", inplace=True)


In [23]:
# Show the current column labels of the frame.
df_crew.columns

Index(['id_movie', 'role', 'name'], dtype='object')

Check info about column type

In [24]:
# Current data type of each column.
df_crew.dtypes

id_movie     int64
role        object
name        object
dtype: object

Convert each column to an appropriate nullable data type

In [25]:
# Cast every column to a nullable extension dtype in a single call.
# astype("string") already represents missing entries as <NA>, so a separate
# fillna(pd.NA) step is not needed.
df_crew = df_crew.astype(
    {"id_movie": "Int64", "role": "string", "name": "string"}
)

df_crew.dtypes

id_movie             Int64
role        string[python]
name        string[python]
dtype: object

Check if there are duplicates

In [26]:
# Boolean mask of rows that repeat an earlier row; its sum is the duplicate count.
duplicate_rows = df_crew.duplicated()
duplicate_rows.sum()


np.int64(1282)

In [27]:
# Keep the first occurrence of each row. The result is reassigned instead of
# mutating in place, matching how the dropna step in this notebook is written.
df_crew = df_crew.drop_duplicates()
# Sanity check: no duplicated rows should remain.
df_crew.duplicated().sum()

np.int64(0)

Check the missing values

In [28]:
# Number of missing values in each column.
df_crew.isna().sum()

id_movie    0
role        0
name        1
dtype: int64

Drop the records that contain missing values

In [29]:
# Discard every row with a missing value in any column, then re-count the
# missing values per column to confirm none are left.
df_crew = df_crew.dropna(axis="index", how="any")
df_crew.isna().sum()

id_movie    0
role        0
name        0
dtype: int64

Remove crew records whose id_movie does not appear in the list of valid movie ids (movies_valid_ids.csv)

In [30]:
# Keep only the crew rows whose id_movie is listed in movies_valid_ids.csv.
valid_movies = pd.read_csv("movies_valid_ids.csv")
has_valid_movie = df_crew["id_movie"].isin(valid_movies["id_movie"])
df_crew = df_crew.loc[has_valid_movie]
# Shape after filtering, to see how many rows were removed.
df_crew.shape

(4718897, 3)

Connecting to PostgreSQL database and creating the table

In [31]:
# DDL for the target table. IF NOT EXISTS means the statement does not fail
# when the table is already present.
sql = """CREATE TABLE IF NOT EXISTS crew (
            id BIGSERIAL PRIMARY KEY,
            id_movie INTEGER CHECK (id_movie>0),
            role TEXT CHECK (role<>''),
            name TEXT CHECK (name<>'')
            );
    """

# The engine is kept in a variable because the load step below passes it to to_sql.
engine = get_db_engine()
execute_query(sql)

Query Executed Successfully!


Populate the table using the DataFrame.to_sql method

In [32]:
# Bulk-load the cleaned rows into the crew table.
# - chunksize writes the frame in batches instead of preparing every row at
#   once, which keeps memory bounded for a frame of several million rows.
# - if_exists="append" adds to whatever is already stored; because the table's
#   only key is the auto-generated id, re-running this cell inserts the same
#   rows a second time.
# - index=False: the DataFrame index is not written as a column.
df_crew.to_sql(
    name="crew",
    con=engine,
    if_exists="append",
    index=False,
    chunksize=100_000,
)

897