In [1]:
# Colab-only helper module for moving files between the browser and the runtime.
from google.colab import files
# Prompt for a file upload; the recorded output of this cell shows
# MoviesOnStreamingPlatforms.csv being saved under the same name.
uploaded = files.upload()


Saving MoviesOnStreamingPlatforms.csv to MoviesOnStreamingPlatforms.csv


In [2]:
# Same import as the first upload cell; repeating it keeps this cell runnable on its own.
from google.colab import files
# Second upload prompt; the recorded output shows TVShowsOnStreamingPlatforms.csv
# being saved. Note that this rebinds `uploaded`, replacing the first upload's result.
uploaded = files.upload()


Saving TVShowsOnStreamingPlatforms.csv to TVShowsOnStreamingPlatforms.csv


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

In [5]:
# Read the two raw catalogues from the CSV files saved by the upload cells.
df_movie = pd.read_csv("MoviesOnStreamingPlatforms.csv")
df_tv = pd.read_csv("TVShowsOnStreamingPlatforms.csv")

# Print (rows, columns) of each frame as a quick sanity check on the load.
print("Movies shape:", df_movie.shape)
print("TV Shows shape:", df_tv.shape)

# Last expression of the cell: rendered as a preview of the first movie rows.
df_movie.head()


Movies shape: (9515, 15)
TV Shows shape: (5368, 15)


Unnamed: 0,ID,Title,Year,Age,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type,Genre,Country,Language,IMDb,IMDb_ID
0,1,The Irishman,2019,18+,98/100,1,0,0,0,0,"Biography, Crime, Drama",United States,"English, Italian, Latin, Spanish, German",7.8,tt1302006
1,2,Dangal,2016,7+,97/100,1,0,0,0,0,"Action, Biography, Drama","India, United States","Hindi, English",8.3,tt5074352
2,3,David Attenborough: A Life on Our Planet,2020,7+,95/100,1,0,0,0,0,"Documentary, Biography",United Kingdom,English,8.9,tt11989890
3,4,Lagaan: Once Upon a Time in India,2001,7+,94/100,1,0,0,0,0,"Drama, Musical, Sport","India, United States","Hindi, English",8.1,tt0169102
4,5,Roma,2018,18+,94/100,1,0,0,0,0,Drama,"Mexico, United States","Spanish, Mixtec, English, Japanese, German, Fr...",7.6,tt6155172


In [6]:
def _first_integer(series):
    """Return the first run of digits found in each value as a numeric Series.

    Values are stringified before matching; entries without any digit
    (including missing values) come back as NaN.
    """
    # expand=False yields a Series rather than a one-column DataFrame.
    digits = series.astype(str).str.extract(r"(\d+)", expand=False)
    return pd.to_numeric(digits, errors="coerce")


def clean_streaming_data(df):
    """Return a cleaned copy of a streaming-catalogue DataFrame.

    The input frame is not modified. Every step is skipped when the column
    it works on is absent. Steps, in order:

    1. Strip surrounding whitespace from the column names.
    2. Strip surrounding whitespace from ``Title`` values (missing titles
       stay missing at this point).
    3. Coerce ``Year`` to numeric; unparseable values become NaN.
    4. Add ``Age_Min`` (first run of digits in ``Age``) and
       ``RottenTomatoes_Score`` (first run of digits in ``Rotten Tomatoes``).
    5. Coerce the platform flag columns ``Netflix``, ``Hulu``,
       ``Prime Video`` and ``Disney+`` to ``int``; missing or unparseable
       flags become 0.
    6. Drop rows that are identical in every column.
    7. Impute remaining gaps: numeric columns with their median, all other
       columns with their most frequent value.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw catalogue table.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy. A column whose values are all missing cannot be
        imputed and is returned still containing NaN.
    """
    df = df.copy()
    df.columns = df.columns.str.strip()

    if "Title" in df.columns:
        titles = df["Title"]
        # Stringify only real values: astype(str) on a missing title would
        # store placeholder text (e.g. "nan") and hide the gap from the
        # imputation step below.
        df["Title"] = titles.where(titles.isna(), titles.astype(str).str.strip())

    if "Year" in df.columns:
        df["Year"] = pd.to_numeric(df["Year"], errors="coerce")

    if "Age" in df.columns:
        df["Age_Min"] = _first_integer(df["Age"])

    if "Rotten Tomatoes" in df.columns:
        df["RottenTomatoes_Score"] = _first_integer(df["Rotten Tomatoes"])

    platform_cols = ["Netflix", "Hulu", "Prime Video", "Disney+"]
    for col in platform_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0).astype(int)

    df = df.drop_duplicates()

    num_cols = df.select_dtypes(include=[np.number]).columns
    cat_cols = df.select_dtypes(exclude=[np.number]).columns

    for col in num_cols:
        df[col] = df[col].fillna(df[col].median())

    for col in cat_cols:
        modes = df[col].mode()
        # mode() is empty when every value in the column is missing; indexing
        # it would raise, so such a column is left as-is.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

    return df


In [7]:
# Run the shared cleaning routine on each raw frame; results are bound to new
# names so the raw frames stay available for comparison.
df_movie_clean = clean_streaming_data(df_movie)
df_tv_clean = clean_streaming_data(df_tv)

# Print the cleaned shapes; the recorded output shows 17 columns for each
# frame versus 15 in the raw frames (the two derived score/age columns).
print("Movies cleaned shape:", df_movie_clean.shape)
print("TV cleaned shape:", df_tv_clean.shape)

Movies cleaned shape: (9515, 17)
TV cleaned shape: (5368, 17)


In [8]:
def quick_check(df, name):
    """Print a short sanity report for a cleaned DataFrame.

    The report starts with a header containing ``name`` and the total count
    of missing cells. When a ``Year`` column is present its min and max are
    printed as integers; when a ``RottenTomatoes_Score`` column is present
    its min and max are printed as-is. Nothing is returned.
    """
    print(f"\n---- {name} ----")
    missing_total = df.isna().sum().sum()
    print("Missing values total:", missing_total)

    available = df.columns

    if "Year" in available:
        years = df["Year"]
        earliest = int(years.min())
        latest = int(years.max())
        print("Year range:", earliest, "-", latest)

    if "RottenTomatoes_Score" in available:
        scores = df["RottenTomatoes_Score"]
        lowest = scores.min()
        highest = scores.max()
        print("Rotten Tomatoes range:", lowest, "-", highest)

# Print the sanity report for each cleaned frame under its own label.
quick_check(df_movie_clean, "Movies")
quick_check(df_tv_clean, "TV Shows")



---- Movies ----
Missing values total: 0
Year range: 1914 - 2021
Rotten Tomatoes range: 10.0 - 98.0

---- TV Shows ----
Missing values total: 0
Year range: 1904 - 2021
Rotten Tomatoes range: 10 - 100


In [9]:
# Write the cleaned frames to CSV in the working directory; index=False keeps
# the DataFrame index out of the files.
df_movie_clean.to_csv("Movies_Cleaned.csv", index=False)
df_tv_clean.to_csv("TVShows_Cleaned.csv", index=False)

In [10]:
# Same import as the upload cells; repeating it keeps this cell runnable on its own.
from google.colab import files

# Send the two cleaned CSV files written by the export cell to the browser.
files.download("Movies_Cleaned.csv")
files.download("TVShows_Cleaned.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>