First, we import all the necessary modules.

In [None]:
import logging
import os

import pandas as pd

We use the following cell for configuration only. We use logging instead of print statements because the output can be silenced by choosing a different log level.

In [None]:
# Logger used by every cell below; named after the running module.
logger = logging.getLogger(__name__)
# Root configuration: show everything from DEBUG upwards. Raise the level
# here to quieten the notebook output.
logging.basicConfig(level=logging.DEBUG)

We load all the raw data files in memory for processing.

In [None]:
PATH_RAW = "../Data"
# List the directory once and sort it: os.listdir returns entries in
# arbitrary order, and a single snapshot keeps the logged listing and the
# set of files actually loaded consistent with each other.
available_files = sorted(os.listdir(PATH_RAW))
logger.info("All the available datasets: %s", available_files)

# Only the CSV files directly inside PATH_RAW are treated as datasets.
datasets_names = [filename for filename in available_files if filename.endswith(".csv")]
# One DataFrame per raw CSV, in file-name order.
all_data = [pd.read_csv(os.path.join(PATH_RAW, dir_)) for dir_ in datasets_names]

# Log all column names for each dataset.
# NOTE(review): this reads a "channel_streaming" column from each file, so the
# raw CSVs are assumed to already contain it (the line that derived it from the
# file name was commented out) - confirm against the data.
for data in all_data:
    logger.info("Columns in %s Data: \n %s", data["channel_streaming"].iloc[0], data.columns)

In [None]:
# For every dataset, log each column that contains at least one missing
# value together with the percentage of rows that are missing.
for frame in all_data:
    null_mask = frame.isnull()
    affected_columns = frame.columns[null_mask.any()]

    logger.info("NaN Columns and Percentages in %s Data:", frame["channel_streaming"].iloc[0])
    for name in affected_columns:
        # Mean of a boolean mask is the fraction of True (missing) entries.
        logger.info("\t %s: %.2f%%", name, null_mask[name].mean() * 100)
    logger.info("\n")

It is evident that only the director, cast, country, date_added, rating and duration columns have NaN values in each dataset. Let's fill all the NaN values in these fields with the placeholder "unknown".

In [None]:
# Fill NaN values in each dataset with the "unknown" placeholder.
#
# The filled columns are assigned back to the DataFrame instead of calling
# `data[col].fillna(..., inplace=True)`: that form modifies an intermediate
# Series, is deprecated by pandas, and leaves the DataFrame untouched when
# Copy-on-Write is enabled.
COLUMNS_TO_FILL = ["cast", "director", "country", "date_added", "rating", "duration"]
for data in all_data:
    data[COLUMNS_TO_FILL] = data[COLUMNS_TO_FILL].fillna("unknown")
    # Normalise rating labels to upper case (the placeholder becomes "UNKNOWN").
    data["rating"] = data["rating"].str.upper()

Print all the unique ratings.

In [None]:
# unique_ratings = set()
# for data in all_data:
#     ratings = data["rating"].unique()
#     unique_ratings.update(ratings)

# for ratings in unique_ratings:
#     print(ratings)

We now save all the modified datasets into the processed folder.

In [None]:
# Processed files live in a "processed" sub-folder of the raw data directory.
output_directory = os.path.join(PATH_RAW, "processed")
os.makedirs(output_directory, exist_ok=True)

for frame in all_data:
    # The first row's channel name decides the output file name.
    service = frame["channel_streaming"].iloc[0]
    target_path = os.path.join(output_directory, f"{service}.csv")
    # The channel column is dropped from the saved file; the original frame
    # in all_data keeps it.
    frame.drop(columns=["channel_streaming"]).to_csv(target_path, index=False)

logger.info("Datasets saved into separate CSV files.")

In [None]:
# PATH_RAW = "../Data/processed/"
# # Log all available datasets
# logger.info("All the available datasets: %s", os.listdir(PATH_RAW))

# datasets_names = [filename for filename in os.listdir(PATH_RAW) if filename.endswith(".csv")]
# processed_data = []
# for dir_ in datasets_names:
#     read_pd = pd.read_csv(os.path.join(PATH_RAW, dir_))
#     read_pd["channel_streaming"] = dir_.split("_")[0]
#     processed_data.append(read_pd)
# # Log all column names for each dataset
# for data in processed_data:
#     logger.info("Columns in %s Data: \n %s", data["channel_streaming"].iloc[0], data.columns)

In [None]:
# processed_data[0].head(5)

In [None]:
# for data in processed_data:
#     nan_columns = data.columns[data.isnull().any()]
#     nan_columns_with_percentage = {
#         column: data[column].isnull().mean() * 100 for column in nan_columns
#     }  # Dict comprehension

#     logger.info("NaN Columns and Percentages in %s Data:", data["channel_streaming"].iloc[0])
#     for column, percentage in nan_columns_with_percentage.items():
#         logger.info("\t %s: %.2f%%", column, percentage)
#     logger.info("\n")

### Now we fill our database with the processed data.

In [None]:
import os

from dotenv import load_dotenv
from sqlalchemy import create_engine, text

In [None]:
# The .env file is expected one level above the current working directory.
parent_directory = os.path.dirname(os.getcwd())
dotenv_path = os.path.join(parent_directory, ".env")
# Load the variables defined in that file into the process environment.
load_dotenv(dotenv_path)

# Connection string for the target database; None when the variable is unset.
DATABASE_URL = os.getenv("DATABASE_URL")

In [None]:
# Fail early with a clear message instead of passing None to create_engine.
if not DATABASE_URL:
    raise RuntimeError(
        "DATABASE_URL is not set; define it in the environment or in the .env file."
    )

engine = create_engine(DATABASE_URL)
file_directory = os.path.join(PATH_RAW, "processed")
# Sorted so the upload order is deterministic (os.listdir order is arbitrary).
datasets_names = sorted(
    filename for filename in os.listdir(file_directory) if filename.endswith(".csv")
)
for dir_ in datasets_names:
    service_data = pd.read_csv(os.path.join(file_directory, dir_))
    # Strip only the ".csv" extension: splitting on the first "." would
    # truncate names containing extra dots and could make two files target
    # the same table.
    table_name = os.path.splitext(dir_)[0]
    # An existing table with the same name is replaced; the DataFrame index
    # is not written as a column.
    service_data.to_sql(table_name, con=engine, if_exists="replace", index=False)

### Now all the tables are uploaded to our database and ready to be used.
Please check your database to verify that all the tables were uploaded.