Import libraries for data cleaning 

In [49]:
import csv
from tokenize import String

from db_connection import get_db_engine
from db_connection import execute_query
import numpy as np
import pandas as pd
import os
import re
import pandas.io.sql as psql
import psycopg2 as ps

Reading CSV file

In [50]:
# Location of the raw actors CSV. Set the ACTORS_CSV environment variable to
# point at the file on another machine; when it is unset, the original local
# path is used so existing runs behave exactly as before.
ACTORS_CSV_PATH = os.environ.get(
    "ACTORS_CSV",
    r"C:\Users\aless\Desktop\Uni\Dataset\actors.csv\actors.csv",
)
df_actors = pd.read_csv(ACTORS_CSV_PATH)

Preview of the CSV file (first and last records): actors.csv

In [51]:
# Bare last expression: Jupyter renders the DataFrame; for a frame this large
# the recorded output is truncated to the first and last rows.
df_actors

Unnamed: 0,id,name,role
0,1000001,Margot Robbie,Barbie
1,1000001,Ryan Gosling,Ken
2,1000001,America Ferrera,Gloria
3,1000001,Ariana Greenblatt,Sasha
4,1000001,Issa Rae,Barbie
...,...,...,...
5798445,1941596,Marc Ma,Ba Cai/巴莱
5798446,1941596,线雨轩,Tata/塔塔
5798447,1941596,Jiang Yixuan,Zuo Yila（Zoila）/佐伊拉
5798448,1941597,Hiroshi Mikami,


Check the dimensions (rows, columns) of the loaded data

In [52]:
# (rows, columns) of the frame as loaded, before any cleaning step.
df_actors.shape

(5798450, 3)

Rename the 'id' column to 'id_movie'. "inplace = True" modifies the DataFrame directly without creating a copy

In [53]:
# Relabel the "id" column as "id_movie"; inplace=True mutates df_actors itself
# instead of returning a renamed copy.
df_actors.rename(columns=dict(id="id_movie"), inplace=True)


In [54]:
# List the current column labels to confirm the rename took effect.
df_actors.columns

Index(['id_movie', 'name', 'role'], dtype='object')

Check the data type of each column

In [55]:
# Per-column dtypes as inferred by read_csv, before any explicit conversion.
df_actors.dtypes

id_movie     int64
name        object
role        object
dtype: object

Correct the data types of the columns

In [56]:
# Convert every column to a pandas nullable extension dtype in a single call.
# Casting to "string" already represents missing values as pd.NA, so a separate
# fillna(pd.NA) step beforehand is unnecessary.
df_actors = df_actors.astype(
    {"id_movie": "Int64", "name": "string", "role": "string"}
)

Check if there are duplicates

In [57]:
# Count rows that repeat an earlier row across all columns.
df_actors.duplicated().sum()


np.int64(946)

In [58]:
# Remove the repeated rows in place (the first occurrence of each is kept by
# default) ...
df_actors.drop_duplicates(inplace = True)
# ... then re-count to verify that no duplicates remain.
df_actors.duplicated().sum()

np.int64(0)

Check the missing values

In [59]:
# Number of missing entries in each column.
df_actors.isna().sum()

id_movie          0
name              4
role        1361123
dtype: int64

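Handle missing values: fill missing roles with "Unknown", treat empty strings as missing, and drop records that still have a missing value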

In [60]:
# Normalise empty strings to NA *first*, so an empty role is handled the same
# way as a missing one. (Previously the replacement ran after the fill, which
# turned empty roles into NA too late and caused those rows to be dropped
# instead of being labelled "Unknown".)
df_actors.replace("", pd.NA, inplace=True)

# A missing role is acceptable: keep the row and label the role explicitly.
df_actors.fillna({"role": "Unknown"}, inplace=True)

# A row without an actor name carries no usable information: drop it.
df_actors.dropna(subset=["name"], inplace=True)

# Safety net: df_actor (singular) is the fully NA-free frame used by the
# following filtering step; df_actors itself is left as cleaned above.
df_actor = df_actors.dropna()

# Verify that no missing values remain in the cleaned frame.
df_actor.isnull().sum()

id_movie    0
name        0
role        0
dtype: int64

In [61]:
# Shape after duplicate removal and missing-value handling.
# NOTE(review): this reports df_actors, while the NA-free frame carried into
# the next step is df_actor — confirm which one was meant to be inspected.
df_actors.shape

(5797500, 3)

Remove actors whose id_movie is not in the list of valid movie ids

In [62]:
# Load the reference table of movie ids to keep.
valid_movies = pd.read_csv("movies_valid_ids.csv")

# Retain only the rows of df_actor whose id_movie occurs in that table; the
# filtered result is bound back to the name df_actors.
df_actors = df_actor.loc[df_actor["id_movie"].isin(valid_movies["id_movie"])]

# Shape after filtering.
df_actors.shape

(5797499, 3)

Connect to the PostgreSQL database and create the 'actors' table if it does not exist

In [63]:
# Obtain the database engine from the project helper.
engine = get_db_engine()

# DDL for the target table: a surrogate BIGSERIAL key plus the three cleaned
# columns, with CHECK constraints rejecting non-positive ids and empty strings.
# IF NOT EXISTS makes the statement safe to run again.
sql = """CREATE TABLE IF NOT EXISTS actors (
        id BIGSERIAL PRIMARY KEY,
        id_movie INTEGER CHECK (id_movie>0),
        name TEXT CHECK (name<>''),
        role TEXT CHECK (role<>'')
        );
"""
execute_query(sql)

Query Executed Successfully!


Populate the table using the DataFrame.to_sql method

In [None]:
# Append the cleaned rows to the "actors" table. The DataFrame index is not
# written (index=False); the table's own BIGSERIAL "id" supplies the key.
# chunksize writes the rows in bounded batches instead of all at once, which
# keeps memory use in check for a frame with millions of rows.
# NOTE: if_exists="append" means re-running this cell inserts the same rows again.
df_actors.to_sql(
    "actors",
    engine,
    if_exists="append",
    index=False,
    chunksize=50_000,
)