In [18]:
import googleapiclient.discovery
import pandas as pd
import time
import os
from dotenv import load_dotenv
import requests
import re

In [2]:
# Load the API key from the environment (load_dotenv also reads a local .env file)
load_dotenv()
API_KEY = os.getenv('API_KEY')
# Fail fast with a clear message instead of sending unauthenticated requests later on.
if not API_KEY:
    raise RuntimeError("Variable d'environnement API_KEY introuvable (vérifier le fichier .env)")

In [3]:
# Load the video-level dataset from the working directory
df = pd.read_csv("videos.csv")

In [None]:
# Exclude old videos (published before 2025).
# TODO: this filter belongs in the cleaning step.
df["published_at"] = pd.to_datetime(df["published_at"])
# Inclusive bound: a video published at exactly 2025-01-01 00:00:00 is not "before 2025".
df = df[df["published_at"] >= '2025-01-01']

In [None]:
# Exclude channels whose videos total fewer than 10K views
MIN_CHANNEL_VIEWS = 10_000
df_top_chaines = df.groupby('channel_id')['views'].sum().reset_index()
# Inclusive bound: a channel with exactly 10,000 views does not have "fewer than 10K".
df_top_chaines = df_top_chaines[df_top_chaines['views'] >= MIN_CHANNEL_VIEWS]
df_top_chaines.info()

<class 'pandas.DataFrame'>
Index: 3228 entries, 0 to 6634
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   channel_id  3228 non-null   str  
 1   views       3228 non-null   int64
dtypes: int64(1), str(1)
memory usage: 75.7 KB


In [None]:
# Fetch channel information from the YouTube Data API (channels endpoint)
CHANNELS_URL = "https://www.googleapis.com/youtube/v3/channels"
BATCH_SIZE = 50  # number of channel IDs sent per request
all_channel_data = []

# Walk through the DataFrame in batches of BATCH_SIZE channel IDs
for i in range(0, len(df_top_chaines), BATCH_SIZE):
    batch_ids = df_top_chaines['channel_id'].iloc[i:i + BATCH_SIZE].tolist()
    ids_string = ",".join(batch_ids)

    params = {
        "part": "snippet,contentDetails,statistics,topicDetails,status",
        "id": ids_string,
        "key": API_KEY
    }

    # The timeout keeps a stalled connection from blocking the notebook indefinitely.
    r = requests.get(CHANNELS_URL, params=params, timeout=30)
    if not r.ok:
        # Report the failed batch instead of silently losing it.
        # r.url is deliberately not printed: it contains the API key.
        print(f"Erreur HTTP {r.status_code} ({r.reason}) pour le lot commençant à l'index {i} : lot ignoré")
        continue
    data = r.json()

    # A response without "items" simply contributes no rows.
    for item in data.get("items", []):
        snippet = item.get("snippet", {})
        statistics = item.get("statistics", {})
        contentDetails = item.get("contentDetails", {})
        topicDetails = item.get("topicDetails", {})

        channel_info = {
            "id": item.get("id"),
            "title": snippet.get("title"),
            "description": snippet.get("description"),
            "country": snippet.get("country"),
            # Counters arrive as strings; a missing counter is stored as 0
            "views": int(statistics.get("viewCount", 0)),
            "subscribers": int(statistics.get("subscriberCount", 0)),
            "nb_videos": int(statistics.get("videoCount", 0)),
            "uploads_playlist": contentDetails.get("relatedPlaylists", {}).get("uploads", ""),
            "topics": topicDetails.get("topicCategories", [])
        }
        all_channel_data.append(channel_info)

# Build the final DataFrame (one row per channel returned by the API)
df_channels = pd.DataFrame(all_channel_data)
df_channels.info()


<class 'pandas.DataFrame'>
RangeIndex: 3220 entries, 0 to 3219
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                3220 non-null   str   
 1   title             3220 non-null   str   
 2   description       3220 non-null   str   
 3   country           2435 non-null   str   
 4   views             3220 non-null   int64 
 5   subscribers       3220 non-null   int64 
 6   nb_videos         3220 non-null   int64 
 7   uploads_playlist  3220 non-null   str   
 8   topics            3220 non-null   object
dtypes: int64(3), object(1), str(5)
memory usage: 226.5+ KB


In [None]:
# Keep only the channels whose country is FR, US, GB or CA
# (rows with a missing country never match and are dropped too)
df_channels = df_channels.loc[
    df_channels['country'].isin(["FR", "US", "GB", "CA"])
].copy()
df_channels.info()

<class 'pandas.DataFrame'>
Index: 1463 entries, 2 to 3219
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                1463 non-null   str   
 1   title             1463 non-null   str   
 2   description       1463 non-null   str   
 3   country           1463 non-null   str   
 4   views             1463 non-null   int64 
 5   subscribers       1463 non-null   int64 
 6   nb_videos         1463 non-null   int64 
 7   uploads_playlist  1463 non-null   str   
 8   topics            1463 non-null   object
dtypes: int64(3), object(1), str(5)
memory usage: 114.3+ KB


In [36]:
# Fetch the 10 most recent videos of each channel,
# compute the average engagement rate over those recent videos,
# and take the mode of the category id as the channel's dominant category.

# NOTE(review): playlist_ids is not referenced by any visible cell; it looks like
# a leftover test value. Confirm before removing.
playlist_ids = ["UU-0DG6pPrn62aP3EDiDOXng"] 
# API client used by get_stats_recent_videos below
youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=API_KEY)

def get_stats_recent_videos(playlist_list, client=None):
    """Collect statistics for up to 10 videos of each playlist.

    For every playlist ID, one playlistItems().list call retrieves up to 10
    video IDs, then a single videos().list call retrieves their snippet and
    statistics. A playlist that raises is reported and skipped so that the
    remaining playlists are still processed.

    Args:
        playlist_list: iterable of playlist IDs. Empty or non-string entries
            are skipped without any API call.
        client: optional API client exposing playlistItems() and videos();
            defaults to the module-level ``youtube`` client.

    Returns:
        A list of dicts with the keys playlist_id, video_id, title,
        category_id, views, likes, comments and published_at. Counters that
        the API does not return are stored as 0.
    """
    api = youtube if client is None else client
    all_video_data = []

    for p_id in playlist_list:
        # Skip empty/missing IDs rather than spend a request that can only fail
        if not isinstance(p_id, str) or not p_id:
            continue

        try:
            # 1. Fetch up to 10 items of the playlist
            pl_response = api.playlistItems().list(
                part="contentDetails",
                playlistId=p_id,
                maxResults=10
            ).execute()

            video_ids = [item['contentDetails']['videoId'] for item in pl_response.get('items', [])]

            if not video_ids:
                continue

            # 2. Fetch stats + category in ONE call (quota cost: 1 unit)
            v_response = api.videos().list(
                part="statistics,snippet",
                id=",".join(video_ids)
            ).execute()

            for video in v_response.get('items', []):
                snippet = video['snippet']
                # A video without a statistics block is kept with zeroed
                # counters instead of aborting the rest of the playlist.
                stats = video.get('statistics', {})
                all_video_data.append({
                    "playlist_id": p_id,
                    "video_id": video['id'],
                    "title": snippet['title'],
                    "category_id": snippet['categoryId'],
                    "views": int(stats.get('viewCount', 0)),
                    "likes": int(stats.get('likeCount', 0)),
                    "comments": int(stats.get('commentCount', 0)),
                    "published_at": snippet['publishedAt']
                })

        except Exception as e:
            # The text of an HTTP error embeds the request URL, API key
            # included: mask the key before it reaches the notebook output.
            print(f"Erreur sur la playlist {p_id}: {_mask_api_key(str(e))}")

    return all_video_data


def _mask_api_key(message):
    """Replace the value of any ``key=`` query parameter in message by ``***``."""
    return re.sub(r"\bkey=[^&\s]+", "key=***", message)

# Run the collection once per distinct uploads playlist
playlists = df_channels['uploads_playlist'].drop_duplicates().tolist()
data = get_stats_recent_videos(playlists)

# Store the collected records in a DataFrame (one row per video)
df_videos = pd.DataFrame(data=data)

Erreur sur la playlist UUOrSBBE5jDADy9mjBeSkpvA: <HttpError 404 when requesting https://youtube.googleapis.com/youtube/v3/playlistItems?part=contentDetails&playlistId=UUOrSBBE5jDADy9mjBeSkpvA&maxResults=10&key=REDACTED&alt=json returned "The playlist identified with the request's <code>playlistId</code> parameter cannot be found.". Details: "[{'message': "The playlist identified with the request's <code>playlistId</code> parameter cannot be found.", 'domain': 'youtube.playlistItem', 'reason': 'playlistNotFound', 'location': 'playlistId', 'locationType': 'parameter'}]">


In [38]:
# Summary (row count, dtypes, non-null counts) of the collected per-video data
df_videos.info()

<class 'pandas.DataFrame'>
RangeIndex: 14553 entries, 0 to 14552
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   playlist_id   14553 non-null  str  
 1   video_id      14553 non-null  str  
 2   title         14553 non-null  str  
 3   category_id   14553 non-null  str  
 4   views         14553 non-null  int64
 5   likes         14553 non-null  int64
 6   comments      14553 non-null  int64
 7   published_at  14553 non-null  str  
dtypes: int64(3), str(5)
memory usage: 909.7 KB


In [42]:
# Dominant category of each playlist: the most frequent category_id among its
# videos (when several values tie, the first one returned by mode() is kept)
df_categories = (
    df_videos
    .groupby('playlist_id')['category_id']
    .agg(lambda ids: ids.mode().iloc[0])
    .reset_index()
)

# Per-playlist totals; the video_id count is the number of videos analysed
df_stats = (
    df_videos
    .groupby('playlist_id')
    .agg(
        views=('views', 'sum'),
        likes=('likes', 'sum'),
        comments=('comments', 'sum'),
        video_id=('video_id', 'count'),
    )
    .reset_index()
)



In [43]:
# 4. Merge both aggregates to get one tidy row per channel playlist.
# engagement_rate is (likes + comments) / views in percent; a view total of 0
# is replaced by 1 so the division never hits zero. The raw totals are dropped
# once the rate is computed.
df_final = (
    df_categories
    .merge(df_stats, on='playlist_id')
    .rename(columns={'video_id': 'nb_videos_analysed', 'category_id': 'main_category_id'})
    .assign(
        engagement_rate=lambda t: (t["likes"] + t["comments"]) / t["views"].replace(0, 1) * 100
    )
    .drop(columns=['views', 'likes', 'comments'])
)

In [47]:
# Attach the per-playlist aggregates to the channel table (left join: channels
# without analysed videos are kept with missing values).
# Columns coming from df_final are dropped first so that re-running this cell
# replaces them instead of creating duplicated *_x / *_y columns.
# validate="many_to_one" raises if df_final ever holds a playlist twice, which
# would otherwise duplicate channel rows.
df_channels = (
    df_channels
    .drop(columns=[col for col in df_final.columns if col in df_channels.columns])
    .merge(
        df_final,
        how='left',
        left_on="uploads_playlist",
        right_on="playlist_id",
        validate="many_to_one",
    )
)

In [48]:
# Check the merged channel table (columns, dtypes, non-null counts)
df_channels.info()

<class 'pandas.DataFrame'>
RangeIndex: 1463 entries, 0 to 1462
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  1463 non-null   str    
 1   title               1463 non-null   str    
 2   description         1463 non-null   str    
 3   country             1463 non-null   str    
 4   views               1463 non-null   int64  
 5   subscribers         1463 non-null   int64  
 6   nb_videos           1463 non-null   int64  
 7   uploads_playlist    1463 non-null   str    
 8   topics              1463 non-null   object 
 9   hashtags            95 non-null     object 
 10  playlist_id         1462 non-null   str    
 11  main_category_id    1462 non-null   str    
 12  nb_videos_analysed  1462 non-null   float64
 13  engagement_rate     1462 non-null   float64
dtypes: float64(2), int64(3), object(2), str(7)
memory usage: 160.1+ KB


In [49]:
# HASHTAGS COLUMN

# Extract the hashtags found in the title and description columns
text_title_description = df_channels['title'].fillna('') + " " + df_channels['description'].fillna('')
# sorted() makes the order deterministic (iteration order of a set of strings
# can differ from one interpreter run to the next, which made the exported
# file vary between runs). Rows without any hashtag get None.
df_channels['hashtags'] = text_title_description.apply(
    lambda x: sorted(set(re.findall(r"#(\w+)", x))) or None
)



In [50]:
# TOPICS COLUMN - extract the topic names

def extract_topic_name(topic_list):
    """Turn a list of topic URLs into a sorted list of readable topic names.

    Each name is the last '/'-separated segment of the URL with underscores
    replaced by spaces. Duplicates are removed and the result is sorted so the
    output is identical from one run to the next.

    Args:
        topic_list: list of URL strings. Any value that is not a list
            (None, NaN, a plain string, ...) is treated as "no topics".

    Returns:
        A sorted list of unique topic names, or the empty string "" when
        topic_list is not a list (kept as-is for backward compatibility).
    """
    # Anything that is not a list carries no topics
    if not isinstance(topic_list, list):
        return ""

    # Clean every URL of the list; the set removes duplicates
    names = {url.split('/')[-1].replace('_', ' ') for url in topic_list}

    return sorted(names)

# Replace each row's topics value by the output of extract_topic_name
df_channels['topics'] = df_channels['topics'].apply(extract_topic_name)

In [None]:
# Split the 'Tags' strings on commas, then produce one row per tag.
# NOTE(review): this cell works on df (not df_channels) and df_exploded is not
# used by any visible cell. Confirm that df really has a 'Tags' column and
# that this cell is still needed.
df['Tags'] = df['Tags'].str.split(',')
df_exploded =  df.explode('Tags').copy()

In [53]:
# playlist_id was only needed as the merge key: remove it from the channel table.
# errors='ignore' keeps the cell re-runnable once the column is already gone
# (the in-place drop raised KeyError on a second execution).
df_channels = df_channels.drop(columns=['playlist_id'], errors='ignore')

In [55]:
# Export the channel table without the index. 'utf-8-sig' writes a UTF-8 BOM at
# the start of the file; quoting=1 is csv.QUOTE_ALL (every field is quoted).
df_channels.to_csv("chaines.csv",index=False,sep=',', encoding='utf-8-sig',quoting=1)