Import libraries for data cleaning 

In [16]:
import csv
from tokenize import String

from notebooks.db_connection import get_db_engine
from notebooks.db_connection import execute_query
import numpy as np
import pandas as pd
import seaborn as sns
import os
import re

Reading CSV file

In [17]:
df_releases = pd.read_csv(r"C:\Users\ricca\Desktop\CSV\releases.csv\releases.csv")

Top records of CSV file: releases.csv

In [18]:
df_releases

Unnamed: 0,id,country,date,type,rating
0,1000001,Andorra,2023-07-21,Theatrical,
1,1000001,Argentina,2023-07-20,Theatrical,ATP
2,1000001,Australia,2023-07-19,Theatrical,PG
3,1000001,Australia,2023-10-01,Digital,PG
4,1000001,Austria,2023-07-20,Theatrical,
...,...,...,...,...,...
1332777,1940967,USA,1909-01-01,Theatrical,
1332778,1940968,Sweden,1908-11-11,Theatrical,
1332779,1940969,France,1902-01-01,Theatrical,
1332780,1940970,France,1902-01-01,Theatrical,


Checking for total dimension of the csv

In [19]:
df_releases.shape

(1332782, 5)

Rename 'id' column in 'id_release'. "inplace = True" modifies directly the df without creating a copy

In [20]:
df_releases.rename(columns = {"id":"id_movie"} , inplace = True)


In [21]:
df_releases.columns

Index(['id_movie', 'country', 'date', 'type', 'rating'], dtype='object')

Check info about column type

In [22]:
df_releases.dtypes

id_movie     int64
country     object
date        object
type        object
rating      object
dtype: object

Correction of data type of the records

In [23]:
df_releases['id_movie'] = df_releases['id_movie'].astype('Int64')
df_releases['country'] = df_releases['country'].fillna(pd.NA).astype('string')
df_releases["date"] = pd.to_datetime(df_releases["date"], errors="coerce")
df_releases['type'] = df_releases['type'].fillna(pd.NA).astype('string')
df_releases['rating'] = df_releases['rating'].fillna(pd.NA).astype('string')
df_releases.dtypes

id_movie             Int64
country     string[python]
date        datetime64[ns]
type        string[python]
rating      string[python]
dtype: object

Check if there are duplicates

In [24]:
df_releases.duplicated(subset=["id_movie", "country", "date", "type"]).sum()


np.int64(0)

In [25]:
df_releases.drop_duplicates(subset=["id_movie", "country", "date", "type"], inplace=True)
df_releases.duplicated().sum()

np.int64(0)

Check the missing values

In [26]:
df_releases.isnull().sum()

id_movie         0
country          0
date             0
type             0
rating      998802
dtype: int64

Dropping records with missing value 

In [27]:
df_releases.dropna(subset=['id_movie', 'country', 'date', 'type'], inplace = True)
df_releases.isnull().sum()

id_movie         0
country          0
date             0
type             0
rating      998802
dtype: int64

Remove releases where id_movie doesn't exist

In [28]:
valid_movies = pd.read_csv("movies_valid_ids.csv")
df_releases = df_releases[df_releases["id_movie"].isin(valid_movies["id_movie"])]
df_releases.shape

(1332780, 5)

Connecting to PostgreSQL database and creating the table

In [29]:
engine = get_db_engine()
sql = \
    """CREATE TABLE IF NOT EXISTS releases (
            id BIGSERIAL PRIMARY KEY,
            id_movie INTEGER CHECK (id_movie>=0),
            country TEXT CHECK (country<>''),
            date DATE,
            type TEXT CHECK (type<>''),
            rating TEXT
            );
    """
execute_query(sql)

Query Executed Successfully!


Population of the table using .to_sql function

In [30]:
df_releases.to_sql("releases", engine, if_exists="replace", index=False)

780