In [None]:
# Declare the list of tasks whose products you want to use as inputs
upstream = None


In [None]:
%pip install duckdb pandas numpy matplotlib requests python-dotenv jupysql duckdb-engine scikit-learn fastapi


# EDA for a Content-Based Recommendation System

In [None]:
import os
import duckdb
import pandas as pd
import matplotlib.pyplot as plt

# Locate the DuckDB database file.
# - Inside the Docker/Ploomber pipeline the 'extract' task creates it under
#   movie_rec_system/; when that file is absent, fall back to the working directory.
# - Setting the DUCKDB_PATH environment variable overrides both locations.
_default_db = "movie_rec_system/movies_data.duckdb"
_fallback_db = "movies_data.duckdb"
if os.path.exists(_default_db):
    _db_candidate = _default_db
else:
    _db_candidate = _fallback_db
DB_PATH = os.getenv("DUCKDB_PATH", _db_candidate)

# Single shared connection used by every query in this notebook.
con = duckdb.connect(DB_PATH)

def q(sql: str) -> pd.DataFrame:
    """Execute ``sql`` on the notebook's DuckDB connection.

    Parameters
    ----------
    sql : str
        The query text, passed unchanged to ``con.sql``.

    Returns
    -------
    pd.DataFrame
        The query result converted to a pandas DataFrame.
    """
    relation = con.sql(sql)
    return relation.df()


In [None]:
# Inspect schema
q("DESCRIBE movies")

In [None]:
q("DESCRIBE genres")

In [None]:
# Preview data
q("SELECT * FROM movies LIMIT 5")

In [None]:
q("SELECT * FROM genres LIMIT 5")

In [None]:
# Resolve each movie's genre ids to genre names:
#   1. UNNEST expands movies.genre_ids into one row per (movie id, genre id).
#   2. The join to genres swaps each genre id for its name; being an inner
#      join, pairs whose genre id has no match in genres are dropped.
#   3. STRING_AGG collapses the names back to one ', '-separated string per movie.
# NOTE(review): STRING_AGG has no ORDER BY here, so the order of names within a
# movie's string is presumably not guaranteed -- confirm if order matters.
movie_genres = q("""
WITH ExpandedGenres AS (
    SELECT 
        m.id AS movie_id,
        mg.movie_genre_id,
        g.name AS genre_name
    FROM 
        (SELECT UNNEST(movies.genre_ids) AS movie_genre_id, movies.id FROM movies) AS mg
    JOIN movies m ON mg.id = m.id
    JOIN genres g ON mg.movie_genre_id = g.id
)
SELECT
    movie_id,
    STRING_AGG(genre_name, ', ') AS genre_names
FROM ExpandedGenres
GROUP BY movie_id;
""")
# Show the first few rows as the cell output.
movie_genres.head()

## Expanded genres per movie

(Computed above as `movie_genres`.)

Movies with a `vote_count` of 0 are usually unreleased. Filtering them out restricts recommendations to movies that are actually available.

In [None]:
# Materialize a helper table used for EDA plots.
# The query builds a ', '-separated genre_names string per movie (UNNEST the
# genre ids, join to genres for names, STRING_AGG per movie), joins it back to
# the movie's descriptive columns, and keeps only movies whose vote_count is
# not 0.
# NOTE(review): because of IF NOT EXISTS, an existing movie_genre_data table is
# left untouched, so re-running this cell does not pick up changes made to
# movies/genres after the table was first created.
con.execute("""CREATE TABLE IF NOT EXISTS movie_genre_data AS
WITH ExpandedGenres AS (
    SELECT 
        m.id AS movie_id,
        mg.movie_genre_id,
        g.name AS genre_name
    FROM 
        (SELECT UNNEST(movies.genre_ids) as movie_genre_id, movies.id FROM movies) AS mg
    JOIN 
        movies m ON mg.id = m.id
    JOIN 
        genres g ON mg.movie_genre_id = g.id
),
genre_names AS (
    SELECT
    movie_id,
    STRING_AGG(genre_name, ', ') AS genre_names
FROM 
    ExpandedGenres
GROUP BY 
    movie_id
)
SELECT gn.genre_names, m.id, m.original_language,
       m.overview, m.popularity, m.release_date,
       m.title, m.vote_average, m.vote_count
FROM genre_names gn
JOIN movies m
ON gn.movie_id = m.id
WHERE m.vote_count != 0""")
# Display how many rows the helper table holds.
q("SELECT COUNT(*) AS n_rows FROM movie_genre_data")

In [None]:
df = q("SELECT * FROM movie_genre_data")
df.head()

In [None]:
# plt.figure(figsize=(10,4))
# plt.hist(df["popularity"].dropna(), bins=20)
# plt.grid(True)
# plt.title("Movie Popularity")
# plt.xlabel("Popularity")
# plt.ylabel("Count")
# plt.tight_layout()
# plt.show()

In [None]:
# plt.figure(figsize=(10,4))
# plt.hist(df["vote_average"].dropna(), bins=20)
# plt.grid(True)
# plt.title("Vote Average")
# plt.xlabel("Vote Score")
# plt.ylabel("Count")
# plt.tight_layout()
# plt.show()

In [None]:
# plt.figure(figsize=(10,4))
# plt.hist(df["vote_count"].dropna(), bins=20)
# plt.grid(True)
# plt.title("Movie Vote Count")
# plt.xlabel("Votes")
# plt.ylabel("Count")
# plt.tight_layout()
# plt.show()

In [None]:
# Parse the release dates once, then store both the parsed dates and the
# release year derived from them on the frame.
release_dates = pd.to_datetime(df["release_date"])
df["release_date"] = release_dates
df["release_year"] = release_dates.dt.year

# Number of movies per release year.
movie_counts_by_year = df.groupby("release_year").size()

# plt.figure(figsize=(10, 6))
# plt.plot(movie_counts_by_year.index, movie_counts_by_year.values, marker="o")
# plt.xlabel("Release Year")
# plt.ylabel("Number of Movies Released")
# plt.title("Number of Movies Released Per Year")
# plt.tight_layout()
# plt.grid()
# plt.show()

In [None]:
# Cast genre_names to str so that every value supports the .split() call
# made in get_genre_count below.
df["genre_names"] = df["genre_names"].astype(str)


def get_genre_count(df):
    """Count how often each genre occurs across all movies.

    Parameters
    ----------
    df : pandas.DataFrame or any mapping
        ``df["genre_names"]`` must iterate over strings holding
        comma-separated genre names, e.g. ``"Action, Drama"``.

    Returns
    -------
    dict
        Maps each genre name (surrounding whitespace removed) to the number
        of times it occurs, with keys in first-seen order.
    """
    genre_counts = dict()

    for genres in df["genre_names"]:
        for raw_genre in genres.split(","):
            # The names are joined with ", ", so splitting on "," leaves a
            # leading space; strip it so "Drama" and " Drama" share one key.
            genre = raw_genre.strip()
            if not genre:
                # An empty string (or a stray comma) is not a genre.
                continue
            # Start from 0 so the first occurrence counts as exactly 1.
            genre_counts[genre] = genre_counts.get(genre, 0) + 1
    return genre_counts


genre_counts = get_genre_count(df)

# Rank the (genre, count) pairs from most to least frequent; the dict keeps
# that ranking as its insertion order.
ranked_genre_items = sorted(
    genre_counts.items(), key=lambda pair: pair[1], reverse=True
)
sorted_genre_counts = dict(ranked_genre_items)

# Parallel lists (same order) for plotting.
genres = list(sorted_genre_counts)
counts = list(sorted_genre_counts.values())

# plt.figure(figsize=(16, 7))
# plt.bar(genres, counts)
# plt.xlabel("Genres")
# plt.ylabel("Counts")
# plt.title("Genre Counts")
# plt.tight_layout()
# plt.xticks(rotation=60, ha="right")
# plt.grid()
# plt.show()

# Moving forward

Now that the preliminary data wrangling and EDA have been completed, the next step is to implement a content-based recommendation system.

We'll experiment with TF-IDF vectors built from each movie's overview (description) and genres, compared using cosine similarity.