First, we import all the necessary modules.

In [5]:
import os

import pandas as pd

We load all the raw data files into memory for data cleaning.
### Data cleaning and transformation
- Convert the date/time format
- Clean NaN values
- Add principal country instead of country

In [6]:
# Directory holding the raw CSV exports, relative to the notebook's working directory.
PATH_RAW = "../Data"

# List the directory once and reuse the result for both the log line and the
# dataset filter, so the two always describe the same snapshot of the folder.
available_files = os.listdir(PATH_RAW)
# print() does not interpolate "%s" placeholders the way logging calls do,
# so the listing is formatted with an f-string instead.
print(f"All the available datasets: {available_files}")

# Keep only CSV files, skipping any file whose name starts with "IMDb".
datasets_names = [
    filename
    for filename in available_files
    if filename.endswith(".csv") and not filename.startswith("IMDb")
]

# Load every selected CSV and tag each frame with the text before the first
# underscore of its file name (e.g. "netflix_titles.csv" -> "netflix").
all_data = []
for file_name in datasets_names:
    read_pd = pd.read_csv(os.path.join(PATH_RAW, file_name))
    read_pd["channel_streaming"] = file_name.split("_")[0]
    all_data.append(read_pd)

# Log all column names for each dataset.
for data in all_data:
    # Pull the label out first: reusing double quotes inside an f-string
    # replacement field is only valid syntax on Python 3.12+.
    channel_name = data["channel_streaming"].iloc[0]
    print(f"Columns in {channel_name} Data: \n {data.columns}")

All the available datasets: %s ['processed', 'disney_plus_titles.csv', 'netflix_titles.csv', 'amazon_prime_titles.csv']
Columns in disney Data: 
 Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description',
       'channel_streaming'],
      dtype='object')
Columns in netflix Data: 
 Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description',
       'channel_streaming'],
      dtype='object')
Columns in amazon Data: 
 Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description',
       'channel_streaming'],
      dtype='object')


In [3]:
# NOTE: despite its name, this set accumulates the UNION of all columns that
# contain at least one NaN in any dataset (not the intersection). The name is
# kept because the NaN-filling cell further down reads it.
common_nan_columns = set()
for data in all_data:
    # Pull the label out first: reusing double quotes inside an f-string
    # replacement field is only valid syntax on Python 3.12+.
    channel_name = data["channel_streaming"].iloc[0]

    # Columns of this dataset that have at least one NaN value.
    nan_columns = data.columns[data.isnull().any(axis=0)]
    common_nan_columns.update(nan_columns)

    # Share of missing values per affected column, as a percentage.
    nan_columns_with_percentage = {
        column: data[column].isnull().mean() * 100 for column in nan_columns
    }

    print(f"NaN Columns and Percentages in {channel_name} Data:")
    for column, percentage in nan_columns_with_percentage.items():
        print(f"\t {column}, {percentage}")
    print("\n")
print(f"Common NaN Columns Across All Data:{common_nan_columns}")

NaN Columns and Percentages in disney Data:
	 director, 32.62068965517241
	 cast, 13.10344827586207
	 country, 15.10344827586207
	 date_added, 0.20689655172413793
	 rating, 0.20689655172413793


NaN Columns and Percentages in netflix Data:
	 director, 29.908027705234474
	 cast, 9.367548540933349
	 country, 9.435676166685592
	 date_added, 0.11354604292040424
	 rating, 0.04541841716816169
	 duration, 0.034063812876121265


NaN Columns and Percentages in amazon Data:
	 director, 21.54530409598676
	 cast, 12.753413322300371
	 country, 93.04923458833264
	 date_added, 98.39677285891601
	 rating, 3.485726106743897


Common NaN Columns Across All Data:{'duration', 'director', 'date_added', 'cast', 'rating', 'country'}


It is evident that only the director, cast, country, date_added, rating and duration columns contain NaN values in at least one of the datasets. Let's fill all of these NaN values with "UNKNOWN".

In [4]:
# Fill NaN values in each dataset with the "UNKNOWN" placeholder.
for data in all_data:
    for column in common_nan_columns:
        # Assign the filled column back to the frame. Calling
        # fillna(inplace=True) on `data[column]` is chained assignment: it
        # emits a FutureWarning on pandas 2.x and leaves the frame unchanged
        # when Copy-on-Write is enabled.
        data[column] = data[column].fillna("UNKNOWN")

Print all the unique ratings.

In [5]:
# Gather every distinct value of the "rating" column across all datasets.
unique_ratings = {
    rating for frame in all_data for rating in frame["rating"].unique()
}

# Show one rating per line.
for rating in unique_ratings:
    print(rating)

TV-NR
66 min
TV-PG
UR
PG-13
UNRATED
ALL
TV-Y7-FV
16
TV-Y
TV-14
84 min
TV-MA
13+
AGES_16_
PG
74 min
R
ALL_AGES
16+
UNKNOWN
18+
G
NOT_RATE
TV-Y7
NC-17
TV-G
7+
NR
AGES_18_


In [None]:
import plotly.express as px

# Select the Netflix frame by its channel label rather than by position:
# the order of `all_data` follows os.listdir(), which is arbitrary, so
# index 0 is not guaranteed to be Netflix.
netflix_frames = [
    frame
    for frame in all_data
    if not frame.empty and frame["channel_streaming"].iloc[0] == "netflix"
]
if not netflix_frames:
    raise ValueError("No dataset labelled 'netflix' was found in all_data.")
netflix_data = netflix_frames[0]

# Remove rows where 'country' is the "UNKNOWN" placeholder.
df = netflix_data[netflix_data["country"] != "UNKNOWN"]

# One row per (title, country): split on commas, trim surrounding whitespace
# and drop empty fragments so that irregular separators such as a stray
# leading/trailing comma do not produce blank or duplicated country labels.
countries = df["country"].str.split(",").explode().str.strip()
countries = countries[countries != ""]

# Count occurrences and keep the 20 most frequent countries.
top_countries = countries.value_counts().head(20).reset_index()
top_countries.columns = ["country", "size"]

# Create a treemap using Plotly Express.
fig = px.treemap(
    top_countries,
    path=["country"],
    values="size",
    color_discrete_sequence=px.colors.sequential.Rainbow_r,
    title="Top 20 show producing countries",
    labels={"size": "Production Size"},
    template="plotly_dark",
)
fig.show()

We now save all the modified datasets into the `processed` folder.

In [9]:
# Destination folder for the cleaned datasets; created on first use.
output_directory = os.path.join(PATH_RAW, "processed")
os.makedirs(output_directory, exist_ok=True)

# Write one CSV per streaming service, named after its channel label and
# without the helper "channel_streaming" column.
for frame in all_data:
    service = frame["channel_streaming"].iloc[0]
    destination = os.path.join(output_directory, f"{service}.csv")
    frame.drop(columns=["channel_streaming"]).to_csv(destination, index=False)
print("Datasets saved into separate CSV files.")

Datasets saved into separate CSV files.


In [None]:
# PATH_RAW = "../Data/processed/"
# # Log all available datasets
# print("All the available datasets: %s", os.listdir(PATH_RAW))

# datasets_names = [filename for filename in os.listdir(PATH_RAW) if filename.endswith(".csv")]
# processed_data = []
# for dir_ in datasets_names:
#     read_pd = pd.read_csv(os.path.join(PATH_RAW, dir_))
#     read_pd["channel_streaming"] = dir_.split("_")[0]
#     processed_data.append(read_pd)
# # Log all column names for each dataset
# for data in processed_data:
#     print("Columns in %s Data: \n %s", data["channel_streaming"].iloc[0], data.columns)

In [None]:
# processed_data[0].head(5)

In [None]:
# for data in processed_data:
#     nan_columns = data.columns[data.isnull().any()]
#     nan_columns_with_percentage = {
#         column: data[column].isnull().mean() * 100 for column in nan_columns
#     }  # Dict comprehension

#     print("NaN Columns and Percentages in %s Data:", data["channel_streaming"].iloc[0])
#     for column, percentage in nan_columns_with_percentage.items():
#         print("\t %s: %.2f%%", column, percentage)
#     print("\n")

### Now we fill our database with the processed data.

In [2]:
import os


In [7]:

from dotenv import load_dotenv
from sqlalchemy import create_engine

# Load environment variables from the .env file located in the parent of the
# current working directory.
current_directory = os.getcwd()
parent_directory = os.path.dirname(current_directory)
dotenv_path = os.path.join(parent_directory, ".env")
load_dotenv(dotenv_path)

# Fail early with a clear message: os.getenv() returns None when the variable
# is missing, and passing None to create_engine() gives an opaque error.
DATABASE_URL = os.getenv("DATABASE_URL")
if not DATABASE_URL:
    raise RuntimeError(
        f"DATABASE_URL is not set; define it in {dotenv_path} or in the environment."
    )
engine = create_engine(DATABASE_URL)

# Upload every processed CSV as its own table, named after the part of the
# file name before the first dot. An existing table with that name is replaced.
file_directory = os.path.join(PATH_RAW, "processed")
datasets_names = [filename for filename in os.listdir(file_directory) if filename.endswith(".csv")]
for dir_ in datasets_names:
    service_data = pd.read_csv(os.path.join(file_directory, dir_))
    table_name = dir_.split(".")[0]
    service_data.to_sql(table_name, con=engine, if_exists="replace", index=False)

### Now we have all the tables uploaded to our database and ready to be used.
Please check your database and verify that all the tables have been uploaded.