Import libraries for data cleaning 

In [17]:
import csv
from tokenize import String

from notebooks.db_connection import get_db_engine
from notebooks.db_connection import execute_query
import numpy as np
import pandas as pd
import seaborn as sns
import os
import re

Reading CSV file

In [18]:
# Location of the raw movies CSV. Set the MOVIES_CSV_PATH environment variable to
# run the notebook on another machine; the original local path remains the default.
movies_csv_path = os.environ.get(
    "MOVIES_CSV_PATH",
    r"C:\Users\ricca\Desktop\CSV\movies.csv\movies.csv",
)
df_movies = pd.read_csv(movies_csv_path)

Preview of the CSV file movies.csv (first and last records)

In [19]:
# Bare expression as the last line of the cell: the notebook renders a truncated
# preview of the frame (first and last rows, with the middle elided).
df_movies

Unnamed: 0,id,name,date,tagline,description,minute,rating
0,1000001,Barbie,2023.0,She's everything. He's just Ken.,Barbie and Ken are having the time of their li...,114.0,3.86
1,1000002,Parasite,2019.0,Act like you own the place.,"All unemployed, Ki-taek's family takes peculia...",133.0,4.56
2,1000003,Everything Everywhere All at Once,2022.0,The universe is so much bigger than you realize.,An aging Chinese immigrant is swept up in an i...,140.0,4.30
3,1000004,Fight Club,1999.0,Mischief. Mayhem. Soap.,A ticking-time-bomb insomniac and a slippery s...,139.0,4.27
4,1000005,La La Land,2016.0,Here's to the fools who dream.,"Mia, an aspiring actress, serves lattes to mov...",129.0,4.09
...,...,...,...,...,...,...,...
941592,1941593,神笛,,,,,
941593,1941594,蟲極道蜜団子抗争編 壱ノ巻,,,Shinjuku forest at night. In the sap taverns o...,30.0,
941594,1941595,蟲極道蜜団子抗争編 弐ノ巻,,,"The city that never sleeps, where insects gath...",30.0,
941595,1941596,重生,,,"In a world where order has broken down, darkne...",,


Check the total dimensions (rows, columns) of the CSV

In [20]:
# (number of rows, number of columns) of the loaded frame.
df_movies.shape

(941597, 7)

Rename the 'id' column to 'id_movie'. `inplace=True` modifies the DataFrame directly instead of returning a renamed copy.

In [21]:
# Source header -> column name used from here on (it matches the key column of
# the `movies` table). The rename is applied to df_movies itself.
column_renames = {"id": "id_movie"}
df_movies.rename(columns=column_renames, inplace=True)


In [22]:
# Confirm the rename by listing the current column labels.
df_movies.columns

Index(['id_movie', 'name', 'date', 'tagline', 'description', 'minute',
       'rating'],
      dtype='object')

Check the data type of each column

In [23]:
# Data type of each column as inferred by read_csv.
df_movies.dtypes

id_movie         int64
name            object
date           float64
tagline         object
description     object
minute         float64
rating         float64
dtype: object

Correct the data types of the columns

In [24]:
# Target dtype per column. The nullable extension dtypes ("Int64", "string")
# represent missing values as <NA> on their own, so no fillna step is needed
# before casting; float64 columns keep NaN for missing values.
movies_dtypes = {
    "id_movie": "Int64",
    "name": "string",
    "date": "Int64",        # read as float64 (e.g. 2023.0) because of the missing values
    "tagline": "string",
    "description": "string",
    # NOTE(review): minute stays float64 although the `movies` table declares it
    # as INTEGER -- confirm all values are whole numbers before switching to Int64.
    "minute": "float64",
    "rating": "float64",
}
# A single astype call casts every listed column and returns the converted frame.
df_movies = df_movies.astype(movies_dtypes)
df_movies.dtypes

id_movie                Int64
name           string[python]
date                    Int64
tagline        string[python]
description    string[python]
minute                float64
rating                float64
dtype: object

Check if there are duplicates

In [25]:
# Number of rows that are exact duplicates (across all columns) of an earlier row.
df_movies.duplicated().sum()


np.int64(0)

In [26]:
# Keep the first occurrence of every fully duplicated row, then re-count to
# confirm that no duplicates remain.
df_movies = df_movies.drop_duplicates()
remaining_duplicates = df_movies.duplicated().sum()
remaining_duplicates

np.int64(0)

Check the missing values

In [27]:
# Missing values per column (isna is the same check as isnull).
df_movies.isna().sum()

id_movie            0
name               10
date            91913
tagline        802210
description    160812
minute         181570
rating         850598
dtype: int64

Drop records with a missing `id_movie` or `name`

In [28]:
# A movie row is only usable with both an id and a name; the other columns may
# stay empty. Drop the incomplete rows, then show the remaining gaps per column.
required_columns = ['id_movie', 'name']
df_movies = df_movies.dropna(subset=required_columns)
df_movies.isna().sum()

id_movie            0
name                0
date            91905
tagline        802200
description    160804
minute         181563
rating         850588
dtype: int64

Save a new CSV file containing only valid movie IDs
This file will be used in other notebooks to filter out invalid references in related datasets

In [29]:
# Write only the id_movie column (header included, no index) for the other notebooks.
df_movies.to_csv("movies_valid_ids.csv", columns=["id_movie"], index=False)

Connecting to PostgreSQL database and creating the table

In [30]:
# Engine from the project's db_connection helper; it is reused by the to_sql
# cell that populates the table.
engine = get_db_engine()
# DDL for the target table. IF NOT EXISTS makes the statement a no-op when the
# table is already present. The CHECK constraints reject negative ids, empty
# strings, years before 1870, runtimes below 1 and ratings outside [0, 5];
# NULL values satisfy a CHECK, so missing fields are still accepted.
sql = \
    """CREATE TABLE IF NOT EXISTS movies (
            id_movie INTEGER CHECK (id_movie>=0),
            name TEXT CHECK (name<>''),
            date INTEGER CHECK (date>=1870),
            tagline TEXT CHECK (tagline<>''),
            description TEXT CHECK (description<>''),
            minute INTEGER CHECK (minute>=1),
            rating DECIMAL(3,2) CHECK (rating >=0 AND rating <=5),

            PRIMARY KEY (id_movie)
            );
    """
# NOTE(review): execute_query is not given the engine, so it presumably opens
# its own connection inside notebooks.db_connection -- confirm.
execute_query(sql)

✅ Query Executed Successfully!


Populate the table using the DataFrame `.to_sql` method

In [31]:
# Append the cleaned rows to the existing `movies` table. chunksize writes the
# rows in batches instead of sending the whole (~940k-row) frame at once.
# NOTE(review): with if_exists="append" and the PRIMARY KEY on id_movie,
# re-running this cell against an already populated table will fail on
# duplicate keys -- confirm that a one-shot load is intended.
df_movies.to_sql("movies", engine, if_exists="append", index=False, chunksize=10_000)

587