# Visualisation des données

In [25]:
# Importation des librairies nécessaires
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from soumabkar_films_sdk import MovieClient, MovieConfig
import time
import json
from collections import Counter, defaultdict
from pathlib import Path

In [26]:
# Dossiers
output_dir = Path("output")
output_dir.mkdir(exist_ok=True)

In [27]:
# Connexion à l'API via le SDK

config = MovieConfig(movie_base_url="https://movie-backend-cm2v.onrender.com")
client = MovieClient(config=config)

# Vérification que l'API est opérationnelle
client.health_check()

MOVIE_API_BASE_URL in MovieConfig init: https://movie-backend-cm2v.onrender.com


ReadTimeout: The read operation timed out

In [28]:
# Récupération des stats de l'API
analytics = client.get_analytics()
analytics

AnalyticsResponse(movie_count=9742, rating_count=100836, tag_count=3683, link_count=9742)

## Top 10 des genres par nombre de films

In [29]:
# Système sans mise en cache

# Initialisation du compteur de genres
genre_counter = Counter()

# Paramètres pour le batching
limit = 1000
skip = 0

while True:
    batch = client.list_movies(skip=skip, limit=limit, output_format="dict")
    if not batch:
        break
    
    # On extrait les genres du lot et les compte
    for movie in batch:
        genres = movie.get("genres", "")
        genre_list = genres.split("|") if genres else []
        genre_counter.update(genre_list)
    
    skip += limit
    time.sleep(0.5)  # Pour respecter l’API

# Conversion du Counter en DataFrame
genre_df = pd.DataFrame(genre_counter.items(), columns=["genre", "count"])
genre_df = genre_df.sort_values("count", ascending=False).head(10)
genre_df

# Bar chart horizontal
fig = px.bar(
    genre_df,
    x="count",
    y="genre",
    title="Top 10 genres par nombre de films",
    labels={"genre": "Genre", "count": "Nombre de films"},
    color="count",
    color_continuous_scale="viridis",
    orientation='h'  # ← clé pour l'affichage horizontal
)

fig.update_layout(
    yaxis={'categoryorder':'total ascending'},  # trie du haut vers le bas
    height=500
)

fig.show()

0

0

In [None]:
####### Avec Système de mise en cache #########
api_movie_count = analytics.movie_count
print(api_movie_count)

genre_data_file = output_dir / "genre_df.parquet"
meta_file = output_dir / "meta.json"

# Lecture du fichier méta s'il existe
if meta_file.exists():
    with open(meta_file, "r") as f:
        meta = json.load(f)
    cached_movie_count = meta.get("movie_count", 0)
else:
    meta = {}
    cached_movie_count = 0

# Décision : utiliser le cache ou recalculer
if genre_data_file.exists() and cached_movie_count == api_movie_count:
    print("Chargement des données depuis le cache...")
    genre_df = pd.read_parquet(genre_data_file)
else:
    print("Mise à jour des données depuis l'API...")
    # Initialisation du compteur de genres
    genre_counter = Counter()

    # Paramètres pour le batching
    limit = 1000
    skip = 0

    while True:
        batch = client.list_movies(skip=skip, limit=limit, output_format="dict")
        if not batch:
            break

        # On extrait les genres du lot et les compte
        for movie in batch:
            genres = movie.get("genres", "")
            genre_list = genres.split("|") if genres else []
            genre_counter.update(genre_list)

        skip += limit
        time.sleep(0.5)  # Pour respecter l’API

    # Conversion du Counter en DataFrame
    genre_df = pd.DataFrame(genre_counter.items(), columns=["genre", "count"])
    genre_df = genre_df.sort_values("count", ascending=False).head(10)

    # Sauvegarde
    genre_df.to_parquet(genre_data_file, index=False)
    with open(meta_file, "w") as f:
        json.dump({"movie_count": api_movie_count}, f)

# Affichage Plotly
fig = px.bar(
    genre_df,
    x="count",
    y="genre",
    title="Top 10 genres par nombre de films",
    labels={"genre": "Genre", "count": "Nombre de films"},
    color="count",
    color_continuous_scale="viridis",
    orientation='h'
)

fig.update_layout(
    yaxis={'categoryorder':'total ascending'},
    height=500
)

fig.show()

9742
Chargement des données depuis le cache...


## Nombre total de films par année (basée sur le titre)

In [None]:
import re

# === Dossiers ===
#output_dir = Path("output")
#output_dir.mkdir(exist_ok=True)

yearly_data_file = output_dir / "movies_by_year.parquet"
meta_file = output_dir / "meta_movies_by_year.json"

# === Récupération du nombre total de films via analytics ===
#analytics = client.get_analytics()
api_movie_count = analytics.movie_count

# === Lecture du cache s’il existe ===
if meta_file.exists():
    with open(meta_file, "r") as f:
        meta = json.load(f)
    cached_movie_count = meta.get("movie_count", 0)
else:
    cached_movie_count = 0

# === Utilisation du cache ou recalcul ===
if yearly_data_file.exists() and cached_movie_count == api_movie_count:
    print("Chargement des données depuis le cache...")
    df_yearly = pd.read_parquet(yearly_data_file)

else:
    print("Extraction des années depuis l’API...")

    # === Initialisation ===
    year_counter = Counter()
    skip = 0
    limit = 500
    year_pattern = re.compile(r"\((\d{4})\)$")

    while True:
        batch = client.list_movies(skip=skip, limit=limit, output_format="dict")
        if not batch:
            break

        for movie in batch:
            title = movie.get("title", "")
            match = year_pattern.search(title)
            if match:
                year = int(match.group(1))
                year_counter[year] += 1

        skip += limit
        time.sleep(0.5)

    # === Construction du DataFrame ===
    df_yearly = pd.DataFrame(sorted(year_counter.items()), columns=["year", "movie_count"])

    # === Sauvegarde du cache ===
    df_yearly.to_parquet(yearly_data_file, index=False)
    with open(meta_file, "w") as f:
        json.dump({"movie_count": api_movie_count}, f)

# === Affichage avec Plotly ===
fig = px.bar(
    df_yearly,
    x="year",
    y="movie_count",
    title="Nombre total de films par année (basé sur le titre)",
    labels={"year": "Année", "movie_count": "Nombre de films"},
)

fig.update_layout(
    xaxis_title="Année",
    yaxis_title="Nombre de films",
    height=500
)

fig.show()

Chargement des données depuis le cache...


In [None]:
df_yearly

Unnamed: 0,year,movie_count
0,1902,1
1,1903,1
2,1908,1
3,1915,1
4,1916,4
...,...,...
101,2014,277
102,2015,274
103,2016,218
104,2017,147


## Top 20 des films par nombre d'évaluations

In [None]:
# === Dossiers ===
#output_dir = Path("output")
#output_dir.mkdir(exist_ok=True)

top_movies_file = output_dir / "top_movies_by_ratings.parquet"
meta_file = output_dir / "meta_top_movies.json"

# === Récupération des métriques API ===
#analytics = client.get_analytics()
api_movie_count = analytics.movie_count
api_rating_count = analytics.rating_count

# === Vérification du cache ===
if meta_file.exists():
    with open(meta_file, "r") as f:
        meta = json.load(f)
    cached_movie_count = meta.get("movie_count", 0)
    cached_rating_count = meta.get("rating_count", 0)
else:
    cached_movie_count = 0
    cached_rating_count = 0

# === Utilisation du cache ou recalcul ===
if (
    top_movies_file.exists()
    and cached_movie_count == api_movie_count
    and cached_rating_count == api_rating_count
):
    print("Chargement des données depuis le cache...")
    top_movies_df = pd.read_parquet(top_movies_file)

else:
    print("Récupération des évaluations depuis l’API...")

    # === Initialisation des compteurs ===
    movie_rating_count = defaultdict(int)
    movie_rating_sum = defaultdict(float)

    # === Batching des ratings ===
    limit = 500
    skip = 0

    while True:
        batch = client.list_ratings(skip=skip, limit=limit, output_format="dict")
        if not batch:
            break

        for rating in batch:
            movie_id = rating["movieId"]
            score = rating["rating"]
            movie_rating_count[movie_id] += 1
            movie_rating_sum[movie_id] += score

        skip += limit
        time.sleep(0.5)

    # === Construction DataFrame des stats ===
    stats = [
        {
            "movieId": movie_id,
            "rating_count": movie_rating_count[movie_id],
            "avg_rating": movie_rating_sum[movie_id] / movie_rating_count[movie_id]
        }
        for movie_id in movie_rating_count
    ]

    stats_df = pd.DataFrame(stats)
    top_movies_df = stats_df.sort_values("rating_count", ascending=False).head(20)

    # === Ajout des titres de films via l’API ===
    movie_titles = {}
    for movie_id in top_movies_df["movieId"]:
        try:
            movie_data = client.get_movie(movie_id)
            movie_titles[movie_id] = movie_data.title
        except Exception as e:
            print(f"Erreur récupération titre movieId {movie_id} : {e}")
            movie_titles[movie_id] = f"Movie {movie_id}"

    top_movies_df["title"] = top_movies_df["movieId"].map(movie_titles)

    # === Sauvegarde dans le cache ===
    top_movies_df.to_parquet(top_movies_file, index=False)
    with open(meta_file, "w") as f:
        json.dump(
            {
                "movie_count": api_movie_count,
                "rating_count": api_rating_count
            },
            f
        )

# === Affichage avec Plotly ===
fig = px.bar(
    top_movies_df.sort_values("rating_count", ascending=True),  # Pour affichage de bas en haut
    x="rating_count",
    y="title",
    color="avg_rating",
    orientation="h",
    title="Top 20 des films par nombre d'évaluations",
    labels={
        "title": "Titre du film",
        "rating_count": "Nombre d'évaluations",
        "avg_rating": "Note moyenne"
    },
    color_continuous_scale="viridis"
)

fig.update_layout(
    yaxis={'categoryorder': 'total ascending'},
    height=700
)

fig.show()

Récupération des évaluations depuis l’API...


In [None]:
top_movies_df

Unnamed: 0,movieId,rating_count,avg_rating,title
20,356,329,4.164134,Forrest Gump (1994)
232,318,317,4.429022,"Shawshank Redemption, The (1994)"
16,296,307,4.197068,Pulp Fiction (1994)
34,593,279,4.16129,"Silence of the Lambs, The (1991)"
166,2571,278,4.192446,"Matrix, The (1999)"
15,260,251,4.231076,Star Wars: Episode IV - A New Hope (1977)
26,480,238,3.75,Jurassic Park (1993)
7,110,237,4.031646,Braveheart (1995)
478,589,224,3.970982,Terminator 2: Judgment Day (1991)
28,527,220,4.225,Schindler's List (1993)


## Top tags utilisés par les utilisateurs de la plateforme

In [None]:
#output_dir = Path("output")
#output_dir.mkdir(exist_ok=True)

tag_usage_file = output_dir / "user_tag_stats.parquet"
meta_file = output_dir / "meta_users_behavior.json"

# Récupération des métriques d’API pour surveiller les changements :
#analytics = client.get_analytics()
api_rating_count = analytics.rating_count
api_tag_count = analytics.tag_count

if meta_file.exists():
    with open(meta_file, "r") as f:
        meta = json.load(f)
else:
    meta = {}

cached_rating_count = meta.get("rating_count", 0)
cached_tag_count = meta.get("tag_count", 0)


# Tags souvent utilisés par certains utilisateurs

if tag_usage_file.exists() and cached_tag_count == api_tag_count:
    print("Chargement du cache : tags utilisés")
    tag_df = pd.read_parquet(tag_usage_file)
else:
    print("Recalcul : tags utilisés")
    tag_counter = Counter()
    limit = 500
    skip = 0

    while True:
        batch = client.list_tags(skip=skip, limit=limit, output_format="dict")
        if not batch:
            break
        for tag in batch:
            tag_text = tag.get("tag", "")
            tag_counter[tag_text] += 1
        skip += limit
        time.sleep(0.5)

    tag_df = pd.DataFrame(tag_counter.items(), columns=["tag", "count"])
    tag_df = tag_df[tag_df["tag"].str.strip() != ""]
    tag_df = tag_df.sort_values("count", ascending=False).head(20)
    tag_df.to_parquet(tag_usage_file, index=False)

fig4 = px.bar(
    tag_df, x="count", y="tag", orientation="h",
    title="Top tags utilisés par les utilisateurs",
    labels={"count": "Nombre d’utilisations", "tag": "Tag"},
    color="count", color_continuous_scale="viridis"
)
fig4.update_layout(yaxis={'categoryorder': 'total ascending'})
fig4.show()

with open(meta_file, "w") as f:
    json.dump({
        "rating_count": api_rating_count,
        "tag_count": api_tag_count
    }, f)

Recalcul : tags utilisés


In [None]:
tag_df

Unnamed: 0,tag,count
0,In Netflix queue,131
1,atmospheric,36
2,thought-provoking,24
3,superhero,24
4,surreal,23
5,funny,23
6,Disney,23
7,religion,22
8,quirky,21
9,sci-fi,21


## Autres Insights sur les Tags

In [None]:
import pickle
import os

analytics_path = os.path.join(output_dir, "analytics.pkl")
tags_by_genre_path = os.path.join(output_dir, "tags_by_genre.parquet")
tags_good_rating_path = os.path.join(output_dir, "tags_good_rating.parquet")
tags_compare_path = os.path.join(output_dir, "tags_compare.parquet")

# Récupérer les statistiques actuelles de l’API
current_stats = client.get_analytics().__dict__

# Fonction utilitaire pour charger ou recalculer un cache
def use_or_generate(path, current_stats, compute_fn):
    if os.path.exists(path) and os.path.exists(analytics_path):
        with open(analytics_path, "rb") as f:
            saved_stats = pickle.load(f)
        if saved_stats == current_stats:
            return pd.read_parquet(path)

    df = compute_fn()
    df.to_parquet(path, index=False)
    with open(analytics_path, "wb") as f:
        pickle.dump(current_stats, f)
    return df

# -------------------------------
# 1. Tags les plus utilisés par genre
# -------------------------------
def compute_tags_by_genre():
    genre_tag_counter = defaultdict(Counter)

    # Chargement par lots
    skip = 0
    limit = 500
    while True:
        movies = client.list_movies(skip=skip, limit=limit, output_format="dict")
        if not movies:
            break
        movie_dict = {m["movieId"]: m["genres"].split("|") if m["genres"] else [] for m in movies}
        
        tags = client.list_tags(skip=skip, limit=limit, output_format="dict")
        for tag in tags:
            genres = movie_dict.get(tag["movieId"], [])
            for genre in genres:
                genre_tag_counter[genre][tag["tag"]] += 1

        skip += limit
        time.sleep(0.5)

    records = []
    for genre, tag_counter in genre_tag_counter.items():
        for tag, count in tag_counter.items():
            records.append({"genre": genre, "tag": tag, "count": count})
    df = pd.DataFrame(records)
    df = df.sort_values(["genre", "count"], ascending=[True, False])
    return df

In [None]:
tags_by_genre_df = use_or_generate(tags_by_genre_path, current_stats, compute_tags_by_genre)
# Top 3 tags par genre
top_tags_by_genre = tags_by_genre_df.groupby("genre").apply(lambda g: g.nlargest(3, 'count')).reset_index(drop=True)
# Concatène genre + tag pour lisibilité
top_tags_by_genre["tag_label"] = top_tags_by_genre["tag"] + " (" + top_tags_by_genre["genre"] + ")"
tags_by_genre_df





Unnamed: 0,genre,tag,count
290,Action,sci-fi,4
325,Action,superhero,4
309,Action,aliens,3
342,Action,boxing,3
287,Action,classic,2
...,...,...,...
735,Western,James Fennimore Cooper,1
736,Western,music,1
737,Western,dark humor,1
738,Western,easygoing,1


In [None]:
fig = px.bar(
    top_tags_by_genre.sort_values("count"),
    x="count",
    y="tag_label",
    color="genre",
    orientation="h",
    title="Top 3 Tags les plus utilisés par Genre",
    labels={"count": "Nombre d'occurrences", "tag_label": "Tag (Genre)"},
    height=800
)
fig.update_layout(yaxis=dict(categoryorder='total ascending'))
fig.show()

In [None]:
# -------------------------------
# 2. Tags les plus fréquents dans les films bien notés (>= 4)
# -------------------------------
def compute_tags_for_good_ratings():
    good_ratings = []
    tags_by_movie = defaultdict(list)

    # Charger les ratings >= 4
    skip = 0
    limit = 500
    while True:
        ratings = client.list_ratings(skip=skip, limit=limit, output_format="dict")
        if not ratings:
            break
        good_ratings += [r for r in ratings if r["rating"] >= 4]
        skip += limit
        time.sleep(0.5)

    # Associer les tags aux movieId bien notés
    movie_ids = set([r["movieId"] for r in good_ratings])
    skip = 0
    limit = 500
    while True:
        tags = client.list_tags(skip=skip, limit=limit, output_format="dict")
        if not tags:
            break
        for tag in tags:
            if tag["movieId"] in movie_ids:
                tags_by_movie[tag["tag"]].append(tag["movieId"])
        skip += limit
        time.sleep(0.5)

    df = pd.DataFrame([(tag, len(movies)) for tag, movies in tags_by_movie.items()],
                      columns=["tag", "count"])
    df = df.sort_values("count", ascending=False).head(20)
    return df

tags_good_rating_df = use_or_generate(tags_good_rating_path, current_stats, compute_tags_for_good_ratings)
tags_good_rating_df

Unnamed: 0,tag,count
570,In Netflix queue,102
69,atmospheric,33
354,thought-provoking,23
0,funny,23
54,Disney,23
141,surreal,22
80,superhero,22
50,quirky,21
26,sci-fi,21
348,psychology,21


In [None]:
# Visualisation : Tags les plus fréquents dans films bien notés
fig2 = px.bar(
    tags_good_rating_df,
    x="count",
    y="tag",
    orientation="h",
    title="Tags les plus fréquents dans les films bien notés (note ≥ 4)",
    labels={"count": "Nombre d’occurrences", "tag": "Tag"},
    color="count",
    color_continuous_scale="viridis"
)
fig2.update_layout(yaxis={'categoryorder':'total ascending'})
fig2.show()

In [None]:
# -------------------------------
# 3. Comparaison : Tags dans films bien notés vs mal notés
# -------------------------------
def compute_tags_compare():
    tag_counter_good = Counter()
    tag_counter_bad = Counter()

    # Ratings par lots
    skip = 0
    limit = 500
    rating_map = {}
    while True:
        ratings = client.list_ratings(skip=skip, limit=limit, output_format="dict")
        if not ratings:
            break
        for r in ratings:
            rating_map[r["movieId"]] = rating_map.get(r["movieId"], []) + [r["rating"]]
        skip += limit
        time.sleep(0.5)

    # Moyenne par film
    movie_avg_rating = {
        mid: sum(ratings)/len(ratings)
        for mid, ratings in rating_map.items()
    }

    # Tags par lots
    skip = 0
    while True:
        tags = client.list_tags(skip=skip, limit=limit, output_format="dict")
        if not tags:
            break
        for tag in tags:
            avg_rating = movie_avg_rating.get(tag["movieId"])
            if avg_rating is not None:
                if avg_rating >= 4:
                    tag_counter_good[tag["tag"]] += 1
                elif avg_rating < 3:
                    tag_counter_bad[tag["tag"]] += 1
        skip += limit
        time.sleep(0.5)

    tags = set(tag_counter_good.keys()) | set(tag_counter_bad.keys())
    data = []
    for tag in tags:
        data.append({
            "tag": tag,
            "count_good": tag_counter_good.get(tag, 0),
            "count_bad": tag_counter_bad.get(tag, 0)
        })
    df = pd.DataFrame(data)
    df["total"] = df["count_good"] + df["count_bad"]
    df = df[df["total"] > 5].sort_values("total", ascending=False).head(20)
    return df

tags_compare_df = use_or_generate(tags_compare_path, current_stats, compute_tags_compare)
tags_compare_df

Unnamed: 0,tag,count_good,count_bad,total
142,In Netflix queue,54,0,54
898,atmospheric,16,0,16
121,thought-provoking,13,1,14
318,dark comedy,13,0,13
245,suspense,11,1,12
408,religion,7,4,11
914,dark,9,1,10
50,emotional,10,0,10
273,superhero,1,9,10
661,surreal,10,0,10


In [None]:
# Visualisation : Comparaison des tags
fig3 = px.bar(
    tags_compare_df.melt(id_vars="tag", value_vars=["count_good", "count_bad"],
                         var_name="Type", value_name="count"),
    x="count",
    y="tag",
    color="Type",
    barmode="group",
    title="Comparaison des Tags : Films bien notés vs mal notés",
    labels={"count": "Nombre d’occurrences", "tag": "Tag"}
)
fig3.update_layout(yaxis={'categoryorder':'total ascending'}, height=600)
fig3.show()