## Netflix IMDB Scores Data Processing

In [115]:
from pandas_datareader import data, wb
import pandas as pd
import numpy as np
import datetime
import seaborn as sns
import kagglehub
import matplotlib.pyplot as plt
import plotly
import plotly.express as px
import plotly.graph_objects as go
%matplotlib inline

### Data Cleaning

In [116]:
path = kagglehub.dataset_download("thedevastator/netflix-imdb-scores")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\lixin\.cache\kagglehub\datasets\thedevastator\netflix-imdb-scores\versions\2


In [117]:
import os

dataset_path = r"C:\Users\lixin\.cache\kagglehub\datasets\thedevastator\netflix-imdb-scores\versions\2"

files = os.listdir(dataset_path)
print(files)

['Netflix TV Shows and Movies.csv']


In [118]:
file_path = os.path.join(dataset_path, "Netflix TV Shows and Movies.csv")
df = pd.read_csv(file_path)

In [119]:
df.head()

Unnamed: 0,index,id,title,type,description,release_year,age_certification,runtime,imdb_id,imdb_score,imdb_votes
0,0,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,113,tt0075314,8.3,795222.0
1,1,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,tt0071853,8.2,530877.0
2,2,tm70993,Life of Brian,MOVIE,"Brian Cohen is an average young Jewish man, bu...",1979,R,94,tt0079470,8.0,392419.0
3,3,tm190788,The Exorcist,MOVIE,12-year-old Regan MacNeil begins to adapt an e...,1973,R,133,tt0070047,8.1,391942.0
4,4,ts22164,Monty Python's Flying Circus,SHOW,A British sketch comedy series with the shows ...,1969,TV-14,30,tt0063929,8.8,72895.0


In [120]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5283 entries, 0 to 5282
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index              5283 non-null   int64  
 1   id                 5283 non-null   object 
 2   title              5283 non-null   object 
 3   type               5283 non-null   object 
 4   description        5278 non-null   object 
 5   release_year       5283 non-null   int64  
 6   age_certification  2998 non-null   object 
 7   runtime            5283 non-null   int64  
 8   imdb_id            5283 non-null   object 
 9   imdb_score         5283 non-null   float64
 10  imdb_votes         5267 non-null   float64
dtypes: float64(2), int64(3), object(6)
memory usage: 454.1+ KB


In [121]:
df.describe()

Unnamed: 0,index,release_year,runtime,imdb_score,imdb_votes
count,5283.0,5283.0,5283.0,5283.0,5267.0
mean,2641.0,2015.879992,79.199886,6.533447,23407.19
std,1525.215067,7.346098,38.915974,1.160932,87134.32
min,0.0,1953.0,0.0,1.5,5.0
25%,1320.5,2015.0,45.0,5.8,521.0
50%,2641.0,2018.0,87.0,6.6,2279.0
75%,3961.5,2020.0,106.0,7.4,10144.0
max,5282.0,2022.0,235.0,9.6,2268288.0


In [122]:
df.isnull().sum()

index                   0
id                      0
title                   0
type                    0
description             5
release_year            0
age_certification    2285
runtime                 0
imdb_id                 0
imdb_score              0
imdb_votes             16
dtype: int64

In [123]:
df['description'] = df['description'].fillna("No description available")
df['age_certification'] = df['age_certification'].fillna('Unknown')
df['imdb_votes'] = df['imdb_votes'].fillna(0)

In [124]:
df["release_date"] = pd.to_datetime(df["release_year"].astype(str) + "-01-01")
df["year_month"] = df["release_date"].dt.to_period("M")

In [125]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

genre_keywords = {
    "Action": ["fight", "battle", "war", "explosion", "combat", "hero", "chase", "attack"],
    "Comedy": ["funny", "comedy", "hilarious", "humor", "joke", "laugh"],
    "Drama": ["drama", "emotional", "tragedy", "romance", "family", "life"],
    "Sci-Fi": ["future", "alien", "space", "robot", "sci-fi", "technology", "cyber"],
    "Horror": ["horror", "ghost", "scary", "creepy", "monster", "fear", "murder", "blood"],
    "Fantasy": ["magic", "fantasy", "wizard", "dragon", "supernatural", "legend"],
    "Thriller": ["thriller", "suspense", "mystery", "detective", "crime", "spy", "chase"],
    "Romance": ["love", "relationship", "couple", "wedding", "heart"],
    "Adventure": ["adventure", "explore", "journey", "quest", "treasure"],
    "Animation": ["animation", "cartoon", "animated", "pixar", "disney"],
    "Musical": ["music", "singing", "dance", "musical"],
    "Crime": ["crime", "police", "detective", "gangster", "prison", "mafia"]
}

def extract_genre(description):
    matched_genres = []
    if pd.notna(description):
        for genre, keywords in genre_keywords.items():
            if any(word in description.lower() for word in keywords):
                matched_genres.append(genre)
    return ", ".join(matched_genres) if matched_genres else "Unknown"

df["genres"] = df["description"].apply(extract_genre)

unknown_desc = df[df["genres"] == "Unknown"]["description"].dropna()

vectorizer = TfidfVectorizer(stop_words="english", max_features=500)
X = vectorizer.fit_transform(unknown_desc)

num_clusters = 10
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(X)

df_clusters = df[df["genres"] == "Unknown"].copy()
df_clusters["cluster"] = cluster_labels

print(df_clusters.groupby("cluster")["title"].apply(list))

cluster
0    [Stand by Me, The Texas Chainsaw Massacre: The...
1    [The Guns of Navarone, The Queen, Saladin the ...
2    [The Blue Lagoon, Professor, Charlie and the C...
3    [When Harry Met Sally..., The Quick and the De...
4    [My Fair Lady, The George McKenna Story, Quiet...
5    [Prince, Teenage Mutant Ninja Turtles, Danger ...
6    [The Exorcist, Too Young The Hero, The Ryan Wh...
7    [Tim Allen: Men Are Pigs, Tim Allen: Rewires A...
8    [Herod's Law, Code Geass: Lelouch of the Rebel...
9    [Major Dad, Sam Kinison: Breaking the Rules, P...
Name: title, dtype: object


In [126]:
cluster_to_genre = {
    0: "Action",
    1: "Comedy",
    2: "Drama",
    3: "Horror",
    4: "Sci-Fi",
    5: "Romance",
    6: "Thriller",
    7: "Adventure",
    8: "Fantasy",
    9: "Crime"
}

df.loc[df["genres"] == "Unknown", "genres"] = df_clusters["cluster"].map(cluster_to_genre)

print(df[["title", "genres"]].head())

                             title         genres
0                      Taxi Driver         Action
1  Monty Python and the Holy Grail         Action
2                    Life of Brian  Comedy, Drama
3                     The Exorcist       Thriller
4     Monty Python's Flying Circus         Comedy


In [127]:
df.head()

Unnamed: 0,index,id,title,type,description,release_year,age_certification,runtime,imdb_id,imdb_score,imdb_votes,release_date,year_month,genres
0,0,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,113,tt0075314,8.3,795222.0,1976-01-01,1976-01,Action
1,1,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,tt0071853,8.2,530877.0,1975-01-01,1975-01,Action
2,2,tm70993,Life of Brian,MOVIE,"Brian Cohen is an average young Jewish man, bu...",1979,R,94,tt0079470,8.0,392419.0,1979-01-01,1979-01,"Comedy, Drama"
3,3,tm190788,The Exorcist,MOVIE,12-year-old Regan MacNeil begins to adapt an e...,1973,R,133,tt0070047,8.1,391942.0,1973-01-01,1973-01,Thriller
4,4,ts22164,Monty Python's Flying Circus,SHOW,A British sketch comedy series with the shows ...,1969,TV-14,30,tt0063929,8.8,72895.0,1969-01-01,1969-01,Comedy


In [128]:
df.isnull().sum()

index                0
id                   0
title                0
type                 0
description          0
release_year         0
age_certification    0
runtime              0
imdb_id              0
imdb_score           0
imdb_votes           0
release_date         0
year_month           0
genres               0
dtype: int64

In [None]:
df.to_csv("cleaned_Netflix_IMDB.csv", index=False)

### EDA

In [129]:
fig = px.histogram(df, x="release_year", nbins=30, title="Distribution of Release Years", labels={"release_year": "Release Year"})
fig.update_traces(marker=dict(line=dict(color="white", width=1)))
fig.show()


In [130]:
fig = px.histogram(df, x="imdb_score", nbins=20, title="IMDb Score Distribution", labels={"imdb_score": "IMDb Score"}, marginal="box")
fig.update_traces(marker=dict(line=dict(color="white", width=1)))
fig.show()

In [131]:
df_type_counts = df["type"].value_counts().reset_index()
df_type_counts.columns = ["Type", "Count"]

fig = px.bar(df_type_counts, x="Type", y="Count", title="Count of Movies vs TV Shows", labels={"Type": "Type", "Count": "Count"})
fig.show()

In [132]:
fig = px.histogram(df, x="runtime", nbins=30, title="Movie Runtime Distribution", labels={"runtime": "Runtime (minutes)"}, marginal="box")
fig.update_traces(marker=dict(line=dict(color="white", width=1)))
fig.show()

In [133]:
fig = px.scatter(df, x="runtime", y="imdb_score", title="Relationship Between Runtime and IMDb Score", labels={"runtime": "Runtime (minutes)", "imdb_score": "IMDb Score"})
fig.show()

In [134]:
avg_score_per_year = df.groupby("release_year")["imdb_score"].mean().reset_index()

fig = px.line(avg_score_per_year, x="release_year", y="imdb_score",title="Trend of IMDb Scores Over the Years",labels={"release_year": "Release Year", "imdb_score": "Average IMDb Score"})
fig.show()

In [143]:
df_exploded = df.assign(genres=df["genres"].str.split(", ")).explode("genres")

df_genres = df_exploded.groupby("genres")["imdb_score"].mean().reset_index()

fig = px.bar(df_genres, x="genres", y="imdb_score", title="Average IMDb Score by Movie Genre", labels={"genres": "Genres", "imdb_score": "Average IMDb Score"},color="imdb_score")

fig.update_layout(xaxis_tickangle=-45)

fig.show()


In [None]:
df_exploded = df.assign(genres=df["genres"].str.split(", ")).explode("genres")

df_genres_stats = df_exploded.groupby("genres")["imdb_score"].agg(["mean", "std"]).reset_index()

fig = px.bar(df_genres_stats, x="genres", y="std", 
             title="IMDb Score Standard Deviation by Genre",
             labels={"std": "Standard Deviation", "genres": "Genres"},
             color="std")

fig.show()

In [145]:
fig = px.box(df_exploded, x="genres", y="imdb_score", 
             title="IMDb Score Distribution by Genre", 
             labels={"genres": "Genres", "imdb_score": "IMDb Score"},
             color="genres")
fig.show()
