First I read the .ndjson file line by line instead of loading it all at once, to avoid memory issues.
Because the dataset is big, I only load a subset of 1000 shows into a dataframe, which makes the analysis faster and easier.

In [1]:
import json
import pandas as pd
sample_size = 1000  # maximum number of records to load from the file


def read_ndjson_sample(path, limit):
    """Parse at most ``limit`` JSON records from a newline-delimited JSON file.

    The file is streamed line by line, so it is never held in memory as a
    whole. Blank or whitespace-only lines are skipped instead of being
    handed to ``json.loads`` (which would raise ``JSONDecodeError``), and
    they do not count towards ``limit``.

    Args:
        path: Location of the .ndjson file.
        limit: Maximum number of records to return.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    records = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Stop before reading further once enough records are collected.
            if len(records) >= limit:
                break
            line = line.strip()
            if not line:
                continue
            records.append(json.loads(line))
    return records


data = read_ndjson_sample("data.ndjson", sample_size)

df = pd.DataFrame(data)

Here I print the columns of the dataset and its shape, as well as the number of missing values in each column, in order to see what data needs to be cleaned.

In [7]:
# Overview of the loaded sample: column names first, then (rows, columns).
column_names = df.columns.to_list()
print("Columns:", column_names)
print("Shape:", df.shape)

# Last expression of the cell: number of missing values in each column.
df.isna().sum()

Columns: ['id', '_rid', '_self', '_etag', '_attachments', 'url', 'name', 'type', 'language', 'genres', 'status', 'runtime', 'averageRuntime', 'premiered', 'ended', 'officialSite', 'schedule', 'rating', 'weight', 'network', 'webChannel', 'dvdCountry', 'externals', 'image', 'summary', 'updated', '_links', '_embedded', 'seasons', 'wikipedia_url', 'wikiquote_url', 'metacritic_url', 'eztv_url', '_ts', 'wikipedia', 'wikiquotes']
Shape: (1000, 36)


id                   0
_rid                 0
_self                0
_etag                0
_attachments         0
url                  0
name                 0
type                 0
language             7
genres               0
status               0
runtime            179
averageRuntime      24
premiered           13
ended              303
officialSite       330
schedule             0
rating               0
weight               0
network            199
webChannel         796
dvdCountry        1000
externals            0
image               38
summary             17
updated              0
_links               0
_embedded            1
seasons              0
wikipedia_url      661
wikiquote_url      860
metacritic_url     588
eztv_url           359
_ts                  0
wikipedia          661
wikiquotes         907
dtype: int64

Here I am cleaning the columns that I want to use for my prediction model. 

In [8]:
from bs4 import BeautifulSoup

def clean_html(text):
    """Return the text content of an HTML string with the markup removed.

    Any value that is not a ``str`` (for example ``None`` or ``NaN``)
    produces an empty string.
    """
    if not isinstance(text, str):
        return ""
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def _rating_average(rating):
    """Return the "average" entry of a rating dict, or None for non-dict values."""
    if not isinstance(rating, dict):
        return None
    return rating.get("average")


# Summary text with the HTML markup stripped.
df["summary_clean"] = df["summary"].apply(clean_html)

# Numeric rating taken out of the nested rating dict.
df["rating_value"] = df["rating"].apply(_rating_average)

# Premiere dates that cannot be parsed become NaT, so their year is missing.
premiere_dates = pd.to_datetime(df["premiered"], errors="coerce")
df["release_year"] = premiere_dates.dt.year

# Preview only the columns relevant to the model.
preview_columns = ["name", "genres", "rating_value", "release_year", "summary_clean"]
df[preview_columns].head()


Unnamed: 0,name,genres,rating_value,release_year,summary_clean
0,Carol Burnett & Company,[Comedy],,1979.0,"Music, songs, and comedy sketches."
1,Carla Cametti PD,[],,2009.0,This six-part Australian crime series is cente...
2,The Carol Burnett Show,"[Comedy, Music]",,1991.0,CBS brought back The Carol Burnett Show for an...
3,Carrier,[],,2008.0,A character-driven immersion in the high-stake...
4,Carnival Cravings with Anthony Anderson,[Food],,2015.0,There isn't much you can't wrap with bacon or ...


This is just a test to see which genres are the most common among the 1000 sampled shows.

In [9]:
from collections import Counter

# Tally every genre label across all shows; each show carries a list of genres.
genre_counts = Counter(
    label
    for show_genres in df["genres"].dropna()
    for label in show_genres
)

# Ten most frequent genres with their counts.
genre_counts.most_common(10)


[('Drama', 280),
 ('Comedy', 249),
 ('Crime', 125),
 ('Adventure', 87),
 ('Action', 86),
 ('Romance', 71),
 ('Children', 62),
 ('Fantasy', 46),
 ('Thriller', 44),
 ('Anime', 40)]

In [ ]:
import pandas as pd

# Read the dataset from CSV.
# NOTE(review): no cell visible here writes dataset.csv, and this rebinds `df`
# (previously the 1000-row sample) — confirm where the file comes from.
df = pd.read_csv("dataset.csv")

# Number of rows in the dataset
total_entries = df.shape[0]

# Number of rows whose rating is missing
missing_ratings = df["rating_value"].isnull().sum()

print(f"🔢 Total entries in dataset: {total_entries}")
print(f"❌ Entries with missing ratings: {missing_ratings}")
