In [1]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from langdetect import detect , LangDetectException

import pandas as pd
import seaborn as sns

from datetime import datetime
import time
import re

pd.set_option('display.max_rows', None)

In [3]:
api_key = 'your_api_key'

youtube = build("youtube", "v3", developerKey=api_key)

# Channels

## Getting all Channels with key words
keyWords: crime, agression, mysterious, cold case, اختفاء, criminal, disparition, جرائم, murder

In [None]:
def process_channel_crime(response):
  channel_id = []

  for channel in response["items"]:
    channel_id.append(channel["id"]["channelId"])

  return channel_id


def get_all_channels(youtube, keyword):
    all_channels_id = []
    page_token = None

    while True:
      request = youtube.search().list(
        part="snippet",
        maxResults=50,
        q=keyword,
        type="channel",
        pageToken = page_token
      )
      response = request.execute()
      all_channels_id.extend(process_channel_crime(response))

      if response.get("nextPageToken"):
        page_token = response["nextPageToken"]
      else:
        break


    return all_channels_id


The YouTube API has a daily quota limit, so we divide the data fetching process into smaller chunks.

By doing this, we can spread the API calls across different keywords and avoid exceeding the quota.

In [None]:
chunk1 = get_all_channels(youtube, "crime")
chunk2 = get_all_channels(youtube, "agression")
chunk3 = get_all_channels(youtube, "mysterious")

In [None]:
chunk4 = get_all_channels(youtube, "cold case")
chunk5 = get_all_channels(youtube, "اختفاء")

In [None]:
chunk6 = get_all_channels(youtube, "Criminal")
chunk7 = get_all_channels(youtube, "disparition")

In [None]:
chunk8 = get_all_channels(youtube, "جرائم")
chunk9 = get_all_channels(youtube, "murder")

In [None]:
chunk10 = get_all_channels(youtube, "अपराध")

In [None]:
chunks = [chunk1, chunk2, chunk3, chunk4, chunk5, chunk6, chunk7, chunk8, chunk9,chunk10]
all_channel_ids = []
for chunk in chunks:

  all_channel_ids.extend(chunk)

len(all_channel_ids)

3965

In [None]:
# Selecting unique channel_ids
all_channel_id = list(set(all_channel_ids))
len(all_channel_id)

3627

In case of knowing some channels that weren't added to the list

In [None]:
# Adding some channel_ids
all_channel_id.extend(["UCVRtufWJKDFI9WH1IVRE9PQ","UC0cTE2t_6udTuM1aggjMqzw"])

In [None]:
# Save in a file 
with open("crime_channel_ids.txt", 'w') as file:
    for item in all_channel_id:
        file.write(item + '\n')

## Getting channel stats
After getting the ids of channels about crime, we'll use the saved file with ids to get more informations

In [None]:
with open('crime_channel_ids.txt', 'r') as file:
    all_channel_id = []
    for line in file:
        all_channel_id.append(line.strip())

In [None]:
def process_channel(response):

  all_stats = []

  for channel in response["items"]:
    dict_stats = dict(
        id = channel["id"],
        name = channel["snippet"]["title"],
        description = channel["snippet"]["description"],
        #customUrl = channel["snippet"]["customUrl"],
        publishedAt = channel["snippet"]["publishedAt"],
        #language = channel["snippet"]["defaultLanguage"],
        total_videos = channel["statistics"]["videoCount"],
        nb_subscribers = channel["statistics"]["subscriberCount"],
        nb_views = channel["statistics"]["viewCount"],
        playlist_id = channel["contentDetails"]["relatedPlaylists"]["uploads"],

    )

    if 'country' in channel['snippet']:
      dict_stats["country"] = channel["snippet"]["country"]
    elif 'country' in channel['brandingSettings']['channel']:
      dict_stats["country"] = channel["brandingSettings"]["channel"]["country"]
    else :
      dict_stats["country"] = None


    if 'defaultLanguage' in channel['snippet']:
      dict_stats["defaultLanguage"] = channel["snippet"]["defaultLanguage"]
    else:
      dict_stats["defaultLanguage"] = None

    if 'topicDetails' in channel:
      if 'topicCategories' in channel['topicDetails']:
        dict_stats["topicCategories"] = channel["topicDetails"]["topicCategories"]
      else :
        dict_stats["topicCategories"]  = None



    all_stats.append(dict_stats)

  return all_stats


#Getting the channels statistics

def get_channel_stats(youtube, channel_ids):
  all_stats = []

  for i in range(0,len(channel_ids),50):
    try:

      request = youtube.channels().list(
        part = "snippet,contentDetails,statistics,brandingSettings,topicDetails",
        id = ','.join(channel_ids[i:i+50])
        )
      response = request.execute()

      all_stats.extend(process_channel(response))

    except HttpError as e:
      print(f"Error retrieving channel details for channel ID {id}: {e}")

  return all_stats

In [None]:
channel_stats = get_channel_stats(youtube, all_channel_id)

In [None]:
len(channel_stats)

3627

In [None]:
# Put the fetched info in a DataFrame
df_channel_stats = pd.DataFrame(channel_stats)
df_channel_stats.head()

Unnamed: 0,id,name,description,publishedAt,total_videos,nb_subscribers,nb_views,playlist_id,country,defaultLanguage,topicCategories
0,UCZWMrtFt6IVPZu6kloo7J4g,Cold Case - Topic,,2021-08-26T11:23:08.684823Z,12,33,895,UUZWMrtFt6IVPZu6kloo7J4g,,,"[https://en.wikipedia.org/wiki/Hip_hop_music, ..."
1,UCBFV7W4f1kRgwUf3pCWWlQg,SFSU Department of Criminal Justice Studies,,2020-06-09T19:00:22.293132Z,12,30,2850,UUBFV7W4f1kRgwUf3pCWWlQg,,,"[https://en.wikipedia.org/wiki/Knowledge, http..."
2,UCImlTJDmo1hn1EDgohNM37g,Red Stories | قصص ريد,Red Stories |قصص ريد\nنشر قصص واقعيه .. \n,2015-01-18T13:13:10Z,543,88000,484371,UUImlTJDmo1hn1EDgohNM37g,SA,ar,[https://en.wikipedia.org/wiki/Society]
3,UC8TIxyIwc91WQ044Vibkscw,TSoV Agression,,2014-02-07T22:35:25Z,8,15,277,UU8TIxyIwc91WQ044Vibkscw,,,[https://en.wikipedia.org/wiki/Role-playing_vi...
4,UCQWTgc4R3VAHPendd85oLlw,BC Brooklyn Criminal,BC AKA BROOKLYN CRIMINAL,2018-11-28T11:27:07Z,2,121,1415,UUQWTgc4R3VAHPendd85oLlw,US,,"[https://en.wikipedia.org/wiki/Music, https://..."


In [None]:
df_channel_stats.shape

(3627, 11)

In [None]:
df_channel_stats.to_csv("channel_stats.txt")

## Data Exploration

In [None]:
df = pd.read_csv("channel_stats.txt", index_col = 0)
df.shape

(3627, 11)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3627 entries, 0 to 3626
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               3627 non-null   object
 1   name             3627 non-null   object
 2   description      2102 non-null   object
 3   publishedAt      3627 non-null   object
 4   total_videos     3627 non-null   int64 
 5   nb_subscribers   3627 non-null   int64 
 6   nb_views         3627 non-null   int64 
 7   playlist_id      3627 non-null   object
 8   country          1351 non-null   object
 9   defaultLanguage  125 non-null    object
 10  topicCategories  2750 non-null   object
dtypes: int64(3), object(8)
memory usage: 340.0+ KB


In [None]:
df = df.astype({ 'total_videos':'int', 'nb_subscribers':'int', 'nb_views':'int' })
df.dtypes

In [None]:
# Convert to date and time

df[["date_published", "time_published"]] =  df["publishedAt"].str.split(pat='T', expand=True)

df['time_published'] = df['time_published'].str.replace('Z','')
df['date_published'] = pd.to_datetime(df['date_published'])
df['time_published'] = pd.to_datetime(df['time_published']).dt.time

df.drop('publishedAt', axis = 1 , inplace=True)

df.info()

  df['time_published'] = pd.to_datetime(df['time_published']).dt.time


<class 'pandas.core.frame.DataFrame'>
Index: 3627 entries, 0 to 3626
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   id               3627 non-null   object        
 1   name             3627 non-null   object        
 2   description      2102 non-null   object        
 3   total_videos     3627 non-null   int64         
 4   nb_subscribers   3627 non-null   int64         
 5   nb_views         3627 non-null   int64         
 6   playlist_id      3627 non-null   object        
 7   country          1351 non-null   object        
 8   defaultLanguage  125 non-null    object        
 9   topicCategories  2750 non-null   object        
 10  date_published   3627 non-null   datetime64[ns]
 11  time_published   3627 non-null   object        
dtypes: datetime64[ns](1), int64(3), object(8)
memory usage: 368.4+ KB


In [None]:
df['time_published'] = pd.to_datetime(df['time_published'], format='%H:%M:%S').dt.time

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3627 entries, 0 to 3626
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   id               3627 non-null   object        
 1   name             3627 non-null   object        
 2   description      2102 non-null   object        
 3   total_videos     3627 non-null   int64         
 4   nb_subscribers   3627 non-null   int64         
 5   nb_views         3627 non-null   int64         
 6   playlist_id      3627 non-null   object        
 7   country          1351 non-null   object        
 8   defaultLanguage  125 non-null    object        
 9   topicCategories  2750 non-null   object        
 10  date_published   3627 non-null   datetime64[ns]
 11  time_published   3627 non-null   object        
dtypes: datetime64[ns](1), int64(3), object(8)
memory usage: 368.4+ KB


## Clean topic column

In [None]:
import ast

def extract_topics(x):
  topics = []
  if pd.isnull(x):
    return None
  urls = ast.literal_eval(x)
  for url in urls:
    topic = url.split('/')[-1]
    topics.append(topic)


  return topics

df["topics"] = df["topicCategories"].apply(extract_topics)

In [None]:
df.head()

Unnamed: 0,id,name,description,total_videos,nb_subscribers,nb_views,playlist_id,country,defaultLanguage,topicCategories,date_published,time_published,topics
0,UCZWMrtFt6IVPZu6kloo7J4g,Cold Case - Topic,,12,33,895,UUZWMrtFt6IVPZu6kloo7J4g,,,['https://en.wikipedia.org/wiki/Hip_hop_music'...,2021-08-26,11:23:08,"[Hip_hop_music, Rock_music, Electronic_music, ..."
1,UCBFV7W4f1kRgwUf3pCWWlQg,SFSU Department of Criminal Justice Studies,,12,30,2850,UUBFV7W4f1kRgwUf3pCWWlQg,,,"['https://en.wikipedia.org/wiki/Knowledge', 'h...",2020-06-09,19:00:22,"[Knowledge, Society]"
2,UCImlTJDmo1hn1EDgohNM37g,Red Stories | قصص ريد,Red Stories |قصص ريد\nنشر قصص واقعيه .. \n,543,88000,484371,UUImlTJDmo1hn1EDgohNM37g,SA,ar,['https://en.wikipedia.org/wiki/Society'],2015-01-18,13:13:10,[Society]
3,UC8TIxyIwc91WQ044Vibkscw,TSoV Agression,,8,15,277,UU8TIxyIwc91WQ044Vibkscw,,,['https://en.wikipedia.org/wiki/Role-playing_v...,2014-02-07,22:35:25,"[Role-playing_video_game, Action_game, Video_g..."
4,UCQWTgc4R3VAHPendd85oLlw,BC Brooklyn Criminal,BC AKA BROOKLYN CRIMINAL,2,121,1415,UUQWTgc4R3VAHPendd85oLlw,US,,"['https://en.wikipedia.org/wiki/Music', 'https...",2018-11-28,11:27:07,"[Music, Hip_hop_music, Pop_music]"


**Detecting the unique topics**

In [None]:
all_topics = []
for index, row in df.iterrows():
  if row["topics"] is not None:
    all_topics.extend(row["topics"])


set_topics = set(all_topics)
set_topics

{'Action-adventure_game',
 'Action_game',
 'Association_football',
 'Basketball',
 'Boxing',
 'Business',
 'Casual_game',
 'Christian_music',
 'Classical_music',
 'Country_music',
 'Cricket',
 'Electronic_music',
 'Entertainment',
 'Fashion',
 'Film',
 'Food',
 'Health',
 'Hip_hop_music',
 'Hobby',
 'Humour',
 'Independent_music',
 'Jazz',
 'Knowledge',
 'Lifestyle_(sociology)',
 'Military',
 'Mixed_martial_arts',
 'Music',
 'Music_of_Asia',
 'Music_of_Latin_America',
 'Music_video_game',
 'Performing_arts',
 'Pet',
 'Physical_fitness',
 'Politics',
 'Pop_music',
 'Professional_wrestling',
 'Puzzle_video_game',
 'Racing_video_game',
 'Reggae',
 'Religion',
 'Rhythm_and_blues',
 'Rock_music',
 'Role-playing_video_game',
 'Simulation_video_game',
 'Society',
 'Soul_music',
 'Sport',
 'Sports_game',
 'Strategy_video_game',
 'Technology',
 'Television_program',
 'Tourism',
 'Vehicle',
 'Video_game_culture',
 'Volleyball'}

In [None]:
# drop rows with channel not about crimes

def delete_no_crime(keys):
  key_to_del = ['Food', 'Music', 'Humour', 'Fashion', 'Sport', 'Hobby', 'Technology', 'Health', 'Sport', 'Tourism', 'Religion', "Lifestyle_(sociology)","Vehicle",'Cricket',"Volleyball","Physical_fitness","Pet"]

  if keys is None:
    return None

  for key in keys :
    if key in key_to_del:
      return "Yes"

    if 'game' in key or 'art' in key or 'music' in key or 'Music' in key:
      return "Yes"

  return "No"

df["del"] = df["topics"].apply(delete_no_crime)

In [None]:
df_crime = df[~(df["del"]=="Yes")]
df_crime = df_crime[~(df["total_videos"]==0)]

df_crime.shape

  df_crime = df_crime[~(df["total_videos"]==0)]


(1646, 14)

In [None]:
all_topics = []
for index, row in df_crime.iterrows():
  if row["topics"] is not None:
    all_topics.extend(row["topics"])


set_topics = set(all_topics)
set_topics

{'Business',
 'Entertainment',
 'Film',
 'Knowledge',
 'Military',
 'Politics',
 'Society',
 'Television_program'}

In [None]:
df_crime["del"].value_counts(dropna=False)

del
No      947
None    699
Name: count, dtype: int64

In [None]:
df_crime.sort_values(by="nb_subscribers", ascending=False).head()

Unnamed: 0,id,name,description,total_videos,nb_subscribers,nb_views,playlist_id,country,defaultLanguage,topicCategories,date_published,time_published,topics,del
3392,UCzXpAx4v6rFLYXkJZ5nmSvQ,LIV Crime,Love crime shows? This is just the destination...,14908,25400000,13453654141,UUzXpAx4v6rFLYXkJZ5nmSvQ,IN,,['https://en.wikipedia.org/wiki/Entertainment'...,2020-08-18,11:32:26,"[Entertainment, Television_program, Film]",No
2706,UCsNdeLwEZf86swPD3qJJ7Dw,News Nation,News Nation is a leading Hindi news channel in...,122412,13100000,3169472857,UUsNdeLwEZf86swPD3qJJ7Dw,IN,,,2012-12-27,06:52:12,,
163,UCpDxPj3sm40ISX5hn-TlYcw,Crime Tak,आज वक़्त के जिस दौर में हम जी रहे हैं उसमें आन...,24651,9080000,3973510942,UUpDxPj3sm40ISX5hn-TlYcw,IN,,['https://en.wikipedia.org/wiki/Society'],2018-06-11,08:05:33,[Society],No
1988,UCcjk-KvDJBEvo8fJS29fekg,Mysterious Dunia,"Mysterious Dunia brings Fun, technical, educat...",849,6540000,891910511,UUcjk-KvDJBEvo8fJS29fekg,IN,,['https://en.wikipedia.org/wiki/Entertainment'...,2015-06-01,19:35:31,"[Entertainment, Society]",No
3454,UCGnCvNgWZ3T7hJJajjGYucA,Kwili,( محتوى هذه القناة غير موجه للأطفال للبالغين ف...,244,6380000,569648475,UUGnCvNgWZ3T7hJJajjGYucA,KW,,"['https://en.wikipedia.org/wiki/Society', 'htt...",2012-08-04,21:38:35,"[Society, Entertainment]",No


In [None]:
# Testing the detect language function
text = "THANK YOU SO MUCH FOR YOU."
language = detect(text)

print("Detected language:", language)

Detected language: en


In [None]:
def detect_language(text):
    try:
      language = detect(text)
      return language
    except TypeError:
      return None
    except LangDetectException:
      return None
    
# If the 'defaultLanguage' column is missing, automatically detect the language from the 'description' field. Otherwise, keep the 'defaultLanguage' as it is.
df_crime['detectedLanguage'] = df_crime.apply(lambda row: detect_language(row["description"]) if pd.isnull(row["defaultLanguage"]) else row["defaultLanguage"] , axis=1)

# If the 'detectedLanguage' is still missing after the first step, detect the language from the 'name' field. If 'detectedLanguage' is already populated, keep its current value.
df_crime['detectedLanguage'] = df_crime.apply(lambda row: detect_language(row["name"]) if pd.isnull(row["detectedLanguage"]) else row["detectedLanguage"] , axis=1)

df_crime.head(10)

the detected language from the description and the name isn't representative of the language of the country. So i will fetch the audio langage of videos in the channels' playlist

In [None]:
df_crime[["name","country","detectedLanguage"]][df_crime["country"]=='IN']

Unnamed: 0,name,country,detectedLanguage
6,MIRROR POST,IN,hi
10,Cyber Safety,IN,hi
25,ADVOCATE AMIT RAVAL,IN,en
31,Gyanam Online,IN,en
39,Adv Aashi Sharma,IN,so
...,...,...,...
3569,CYBER POLICE GORAKHPUR,IN,en
3574,The अपराध,IN,en
3597,Desawar Baba,IN,en
3603,Hitech News Telangana,IN,hi


## Getting audio language in the first video in a channel playlist

**Getting audio language in the first video in a channel playlist**

In [None]:
def get_default_audio_language(video_id):
  try:
    request = youtube.videos().list(
        part="snippet",
        id=video_id
    )
    response = request.execute()

    if "defaultAudioLanguage" in response["items"][0]["snippet"]:
      return response["items"][0]["snippet"]["defaultAudioLanguage"]
    return None

  except HttpError as e:
    print(f"Error retrieving video details for video ID {id}: {e}")
    return None



def get_videoId_from_playlist(playlist_id):
  try:
    request = youtube.playlistItems().list(
        part="snippet",
        maxResults=1,
        playlistId=playlist_id
    )
    response = request.execute()
    video_id = response["items"][0]["snippet"]["resourceId"]["videoId"]

    return video_id

  except HttpError as e:
      print(f"Error retrieving channel details for channel ID {id}: {e}")
      return None

In [None]:
for index, row in df_crime.iterrows():

    playlistID = row["playlist_id"]
    videoID = get_videoId_from_playlist(playlistID)
    if videoID is not None:
      language = get_default_audio_language(videoID)
      df_crime.at[index, 'audio_language'] = language
    else:
      df_crime.at[index, 'audio_language'] = None


Error retrieving channel details for channel ID <built-in function id>: <HttpError 404 when requesting https://youtube.googleapis.com/youtube/v3/playlistItems?part=snippet&maxResults=1&playlistId=UUt4J3j3RMoPgAny0eVQgyhg&key=AIzaSyCksKBZv1MNpW78giE7OOcUR8XmUicHZWk&alt=json returned "The playlist identified with the request's <code>playlistId</code> parameter cannot be found.". Details: "[{'message': "The playlist identified with the request's <code>playlistId</code> parameter cannot be found.", 'domain': 'youtube.playlistItem', 'reason': 'playlistNotFound', 'location': 'playlistId', 'locationType': 'parameter'}]">
Error retrieving channel details for channel ID <built-in function id>: <HttpError 404 when requesting https://youtube.googleapis.com/youtube/v3/playlistItems?part=snippet&maxResults=1&playlistId=UUWHHvJTtqdL66CZV7XS3c_Q&key=AIzaSyCksKBZv1MNpW78giE7OOcUR8XmUicHZWk&alt=json returned "The playlist identified with the request's <code>playlistId</code> parameter cannot be found."

In [None]:
# df_crime[df_crime["audio_language"].isna()]
# df_crime["audio_language"].isna().sum()
df_crime.head(10)

Unnamed: 0,id,name,description,total_videos,nb_subscribers,nb_views,playlist_id,country,defaultLanguage,topicCategories,date_published,time_published,topics,del,detectedLanguage,audio_language
4,UC5qbKHUFVWgSURtTsn_qJbg,Mix Media,#قناة_ميكس_ميديا تقدم العديد من البرامج المختل...,21,4150,172643,UU5qbKHUFVWgSURtTsn_qJbg,SA,,['https://en.wikipedia.org/wiki/Knowledge'],2013-10-28,20:23:48,[Knowledge],No,ar,ar
6,UCVHrHOoTKt9MU8ji7Dwo5rw,Crime Reporter,WBIJAJCIE NA NASZEGO FACEBOOKA!\nhttps://www.f...,539,79600,12171241,UUVHrHOoTKt9MU8ji7Dwo5rw,US,,"['https://en.wikipedia.org/wiki/Politics', 'ht...",2022-07-16,18:20:30,"[Politics, Society]",No,en,pl
17,UCS0EIzdO03YnTefgrrLOtvw,True Crime বাংলা,,1,1,217,UUS0EIzdO03YnTefgrrLOtvw,BD,,,2024-05-09,05:24:58,,,bn,
20,UCyEoJImyvuIwRxVIJ0kVyMA,mystery crime by emna | جرائم غامضة,,11,15,984,UUyEoJImyvuIwRxVIJ0kVyMA,,,,2021-06-08,13:21:56,,,en,
23,UC0grc5Poc8SB5dPdIVbaayg,Mysterious Chap remixed by M & M : Ezemro - Topic,,1,0,10,UU0grc5Poc8SB5dPdIVbaayg,,,,2020-11-04,12:02:30,,,en,
30,UCNvi9OArR7VGoJPZTraNN2Q,Missing People - Cold Case,"Welcome to MISSING PEOPLE - COLD CASE, the cha...",35,15,3623,UUNvi9OArR7VGoJPZTraNN2Q,,,['https://en.wikipedia.org/wiki/Society'],2023-07-12,15:37:49,[Society],No,en,en
31,UCfbrPxNOM2rMBjC_BcxUKBg,The Bad Lieutenant,🚨THE BAD LIEUTENANT BRINGS YOU DOWN AND DIRTY ...,28,179,59108,UUfbrPxNOM2rMBjC_BcxUKBg,,,['https://en.wikipedia.org/wiki/Society'],2024-04-27,17:47:07,[Society],No,en,
33,UCMWzwWM9-WLiOvlnECTOClw,Fatenah Dwairi,🌻\nMy Instagram: selflove_unconditional \n,145,7860,302009,UUMWzwWM9-WLiOvlnECTOClw,JO,,['https://en.wikipedia.org/wiki/Knowledge'],2017-02-24,04:13:14,[Knowledge],No,en,ar
34,UCo78MUaamZzP768_cA3JAVQ,Batool's Sories،قصص بتول,"""مرحبًا بكم في قناة قصص بتول Batool's Stories ...",913,44100,12448552,UUo78MUaamZzP768_cA3JAVQ,US,,['https://en.wikipedia.org/wiki/Society'],2020-11-13,21:36:22,[Society],No,ar,ar
36,UC42566QQPPkFCq4At00JfRg,Crime Kahani,"Real story sunne ke liye hamare channel ""CRIME...",2,0,0,UU42566QQPPkFCq4At00JfRg,,,,2024-05-04,01:28:24,,,en,


Now we have the detected Language and the audio Language. In order to fill the NaN, I'm going to select the more suitable language.

In [None]:
df_crime["selected_language"] = df_crime.apply(lambda row: row["detectedLanguage"] if pd.isnull(row["audio_language"]) else row["audio_language"], axis=1 )

In [None]:
df_crime["selected_language"].value_counts(dropna=False)

selected_language
ar       469
en       420
hi       259
fr        95
en-US     67
        ... 
lv         1
en-CA      1
ase        1
ps         1
fi         1
Name: count, Length: 63, dtype: int64

In [None]:
df_crime.to_csv("channel_stats.txt")

# Videos

## Getting all videos with keywords
keyWords: crime, agression, mysterious, cold case, اختفاء, criminal, disparition, جرائم, murder

In [None]:
import time

def process_video_crime(response):
  video_id = []

  for video in response["items"]:
    video_id.append(video["id"]["videoId"])

  return video_id


def get_all_videos(youtube, keyword):
    all_video_ids = []
    page_token = None
    try :
      # while True:
      for i in range(20):
        request = youtube.search().list(
          part="snippet",
          maxResults=50,
          order="relevance",
          q=keyword,
          type="video",
          pageToken = page_token
        )
        response = request.execute()
        all_video_ids.extend(process_video_crime(response))

        if response.get("nextPageToken"):
          page_token = response["nextPageToken"]
          time.sleep(2)
        else:
          break

    except HttpError as e:
      print(f"Error retrieving channel details for channel ID {id}: {e}")


    return all_video_ids

The YouTube API has a daily quota limit, so we divide the data fetching process into smaller chunks.

By doing this, we can spread the API calls across different keywords and avoid exceeding the quota.

In [None]:
chunk1 = get_all_videos(youtube, 'crime -music -food -humour -health -sport -game -art -lifestyle')
chunk2 = get_all_videos(youtube, 'agression -music -food -humour -health -sport -game -art -lifestyle,')
chunk3 = get_all_videos(youtube, 'جرائم -music -food -humour -health -sport -game -art -lifestyle,')

In [None]:
chunk4 = get_all_videos(youtube, 'murder -music -food -humour -health -sport -game -art -lifestyle,')
chunk5 = get_all_videos(youtube, 'اختفاء -music -food -humour -health -sport -game -art -lifestyle,')
chunk6 = get_all_videos(youtube, '"अपराध" -music -food -humour -health -sport -game -art -lifestyle,')

In [None]:
chunk7 = get_all_videos(youtube, 'cold case -music -food -humour -health -sport -game -art -lifestyle,')
chunk8 = get_all_videos(youtube, 'Criminal -music -food -humour -health -sport -game -art -lifestyle,')

In [None]:
all_video_ids = []

chunks = [chunk1, chunk2, chunk3, chunk4, chunk5, chunk6, chunk7,chunk8]
for chunk in chunks:

  all_video_ids.extend(chunk)

len(all_video_ids)

4269

In [None]:
all_video_id = list(set(all_video_ids))
len(all_video_id)

3426

In [None]:
with open("crime_video_ids.txt", 'w') as file:
    for item in all_video_id:
        file.write(item + '\n')

## Getting Video Stats

In [None]:
with open('crime_video_ids.txt', 'r') as file:
    all_video_id = []
    for line in file:
        all_video_id.append(line.strip())

In [None]:
len(all_video_id)

3426

In [None]:
def process_video(response):

  all_stats = []

  for video in response["items"]:
    dict_stats = dict(
        video_id = video["id"],
        title = video["snippet"]["title"],
        description = video["snippet"]["description"],
        publishedAt = video["snippet"]["publishedAt"],
        categoryId = video["snippet"]["categoryId"],
        duration = video["contentDetails"]["duration"],
        # nb_views = video["statistics"]["viewCount"],
        channel_id = video["snippet"]["channelId"],
    )

    if 'commentCount' in video["statistics"]:
      dict_stats['nb_comments'] = video["statistics"]["commentCount"]
    else:
      dict_stats['nb_comments'] = None

    if 'viewCount' in video["statistics"]:
      dict_stats['nb_views'] = video["statistics"]["viewCount"]
    else:
      dict_stats['nb_views'] = None

    if 'likeCount' in video["statistics"]:
      dict_stats['nb_likes'] = video["statistics"]["likeCount"]
    else:
      dict_stats['nb_likes'] = None

    if 'defaultAudioLanguage' in video['snippet']:
      dict_stats["defaultAudioLanguage"] = video["snippet"]["defaultAudioLanguage"]
    else:
      dict_stats["defaultAudioLanguage"] = None

    if 'topicDetails' in video:
      if 'topicCategories' in video['topicDetails']:
        dict_stats["topicCategories"] = video["topicDetails"]["topicCategories"]
      else :
        dict_stats["topicCategories"]  = None

    all_stats.append(dict_stats)

  return all_stats


#Getting the channels statistics

def get_video_stats(youtube, video_ids):
  all_stats = []

  for i in range(0,len(video_ids),50):
    try:

      request = youtube.videos().list(
        part="snippet,contentDetails,statistics,topicDetails",
        id= ','.join(video_ids[i:i+50])
      )

      response = request.execute()

      all_stats.extend(process_video(response))

    except HttpError as e:
      print(f"Error retrieving video details for video ID {id}: {e}")

  return all_stats

In [None]:
video_stats = get_video_stats(youtube, all_video_id)

In [None]:
len(video_stats)

3419

In [None]:
df_video_stats = pd.DataFrame(video_stats)
df_video_stats.head()

Unnamed: 0,video_id,title,description,publishedAt,categoryId,duration,channel_id,nb_comments,nb_views,nb_likes,defaultAudioLanguage,topicCategories
0,TaEuPHkfcS0,وثائقي | جرائم النازية - اختطاف الأطفال | وثائ...,عندما كانوا أطفالًا، اختطفهم النازيون من بولند...,2020-04-27T19:00:11Z,27,PT42M27S,UCET6sWl4Xcu-U8Ka9PJPrwA,613,531167,5466,ar,[https://en.wikipedia.org/wiki/Society]
1,ESN_wDg0DUo,Man found guilty in 30-year-old cold case invo...,Thirty years after single mother Barbara Brodk...,2023-02-25T15:17:49Z,25,PT1M58S,UCxUD8G1jO8T-Ef2tuADCZOA,15,7068,173,en-CA,[https://en.wikipedia.org/wiki/Society]
2,7_8V-u9WJXY,Asi es como murieron estos asesinos seriales #...,,2022-04-07T16:44:56Z,20,PT40S,UCJXiGyoWSDJhIJ1ztB0yChQ,2458,4983804,354463,,
3,2B4emTu7YUs,El POLICÍA ARGENTINO que ASESlNÒ a su ESPOSA y...,¡Hola a todos! \nMi nombre es María y cada sem...,2024-05-13T20:34:27Z,25,PT22M10S,UCK_-_4AP8QCJLuoVG8mxPMA,97,56634,3171,es,[https://en.wikipedia.org/wiki/Society]
4,-ETWNxemJWk,Encore une agression verbale...,Voilà ce qu'il s'est passé vendredi dernier. E...,2024-05-27T08:28:49Z,22,PT1M41S,UCYYIOXsDpE2Zy2rjOClx6nA,0,7,1,,[https://en.wikipedia.org/wiki/Society]


In [None]:
df_video_stats.shape

(3419, 12)

In [None]:
df_video_stats.to_csv("video_stats.csv")

## Videos stat cleaning


In [None]:
df_vid = pd.read_csv('video_stats.csv', index_col = 0)
df_vid.head()

Unnamed: 0,video_id,title,description,publishedAt,categoryId,duration,channel_id,nb_comments,nb_views,nb_likes,defaultAudioLanguage,topicCategories
0,TaEuPHkfcS0,وثائقي | جرائم النازية - اختطاف الأطفال | وثائ...,عندما كانوا أطفالًا، اختطفهم النازيون من بولند...,2020-04-27T19:00:11Z,27,PT42M27S,UCET6sWl4Xcu-U8Ka9PJPrwA,613.0,531167.0,5466.0,ar,['https://en.wikipedia.org/wiki/Society']
1,ESN_wDg0DUo,Man found guilty in 30-year-old cold case invo...,Thirty years after single mother Barbara Brodk...,2023-02-25T15:17:49Z,25,PT1M58S,UCxUD8G1jO8T-Ef2tuADCZOA,15.0,7068.0,173.0,en-CA,['https://en.wikipedia.org/wiki/Society']
2,7_8V-u9WJXY,Asi es como murieron estos asesinos seriales #...,,2022-04-07T16:44:56Z,20,PT40S,UCJXiGyoWSDJhIJ1ztB0yChQ,2458.0,4983804.0,354463.0,,
3,2B4emTu7YUs,El POLICÍA ARGENTINO que ASESlNÒ a su ESPOSA y...,¡Hola a todos! \nMi nombre es María y cada sem...,2024-05-13T20:34:27Z,25,PT22M10S,UCK_-_4AP8QCJLuoVG8mxPMA,97.0,56634.0,3171.0,es,['https://en.wikipedia.org/wiki/Society']
4,-ETWNxemJWk,Encore une agression verbale...,Voilà ce qu'il s'est passé vendredi dernier. E...,2024-05-27T08:28:49Z,22,PT1M41S,UCYYIOXsDpE2Zy2rjOClx6nA,0.0,7.0,1.0,,['https://en.wikipedia.org/wiki/Society']


In [None]:
df_vid.shape

(3419, 12)

In [None]:
df_vid["topics"] = df_vid["topicCategories"].apply(extract_topics) # The function 'extract_topics' is defined in the section before
df_vid.head()

Unnamed: 0,video_id,title,description,publishedAt,categoryId,duration,channel_id,nb_comments,nb_views,nb_likes,defaultAudioLanguage,topicCategories,topics
0,TaEuPHkfcS0,وثائقي | جرائم النازية - اختطاف الأطفال | وثائ...,عندما كانوا أطفالًا، اختطفهم النازيون من بولند...,2020-04-27T19:00:11Z,27,PT42M27S,UCET6sWl4Xcu-U8Ka9PJPrwA,613.0,531167.0,5466.0,ar,['https://en.wikipedia.org/wiki/Society'],[Society]
1,ESN_wDg0DUo,Man found guilty in 30-year-old cold case invo...,Thirty years after single mother Barbara Brodk...,2023-02-25T15:17:49Z,25,PT1M58S,UCxUD8G1jO8T-Ef2tuADCZOA,15.0,7068.0,173.0,en-CA,['https://en.wikipedia.org/wiki/Society'],[Society]
2,7_8V-u9WJXY,Asi es como murieron estos asesinos seriales #...,,2022-04-07T16:44:56Z,20,PT40S,UCJXiGyoWSDJhIJ1ztB0yChQ,2458.0,4983804.0,354463.0,,,
3,2B4emTu7YUs,El POLICÍA ARGENTINO que ASESlNÒ a su ESPOSA y...,¡Hola a todos! \nMi nombre es María y cada sem...,2024-05-13T20:34:27Z,25,PT22M10S,UCK_-_4AP8QCJLuoVG8mxPMA,97.0,56634.0,3171.0,es,['https://en.wikipedia.org/wiki/Society'],[Society]
4,-ETWNxemJWk,Encore une agression verbale...,Voilà ce qu'il s'est passé vendredi dernier. E...,2024-05-27T08:28:49Z,22,PT1M41S,UCYYIOXsDpE2Zy2rjOClx6nA,0.0,7.0,1.0,,['https://en.wikipedia.org/wiki/Society'],[Society]


**detecting all topics**

In [None]:
all_topics_vid = []
for index, row in df_vid.iterrows():
  if row["topics"] is not None:
    all_topics_vid.extend(row["topics"])


set_topics_vid = set(all_topics_vid)
set_topics_vid

{'Action-adventure_game',
 'Action_game',
 'Association_football',
 'Boxing',
 'Casual_game',
 'Classical_music',
 'Cricket',
 'Electronic_music',
 'Entertainment',
 'Film',
 'Health',
 'Hip_hop_music',
 'Hobby',
 'Humour',
 'Independent_music',
 'Jazz',
 'Knowledge',
 'Lifestyle_(sociology)',
 'Military',
 'Mixed_martial_arts',
 'Motorsport',
 'Music',
 'Music_of_Asia',
 'Music_of_Latin_America',
 'Performing_arts',
 'Pet',
 'Politics',
 'Pop_music',
 'Professional_wrestling',
 'Reggae',
 'Religion',
 'Rhythm_and_blues',
 'Rock_music',
 'Role-playing_video_game',
 'Simulation_video_game',
 'Society',
 'Soul_music',
 'Sport',
 'Strategy_video_game',
 'Technology',
 'Television_program',
 'Tourism',
 'Vehicle',
 'Video_game_culture',
 'Volleyball'}

In [None]:
df_vid["del"] = df_vid["topics"].apply(delete_no_crime)
df_vid.head()

Unnamed: 0,video_id,title,description,publishedAt,categoryId,duration,channel_id,nb_comments,nb_views,nb_likes,defaultAudioLanguage,topicCategories,topics,del
0,TaEuPHkfcS0,وثائقي | جرائم النازية - اختطاف الأطفال | وثائ...,عندما كانوا أطفالًا، اختطفهم النازيون من بولند...,2020-04-27T19:00:11Z,27,PT42M27S,UCET6sWl4Xcu-U8Ka9PJPrwA,613.0,531167.0,5466.0,ar,['https://en.wikipedia.org/wiki/Society'],[Society],No
1,ESN_wDg0DUo,Man found guilty in 30-year-old cold case invo...,Thirty years after single mother Barbara Brodk...,2023-02-25T15:17:49Z,25,PT1M58S,UCxUD8G1jO8T-Ef2tuADCZOA,15.0,7068.0,173.0,en-CA,['https://en.wikipedia.org/wiki/Society'],[Society],No
2,7_8V-u9WJXY,Asi es como murieron estos asesinos seriales #...,,2022-04-07T16:44:56Z,20,PT40S,UCJXiGyoWSDJhIJ1ztB0yChQ,2458.0,4983804.0,354463.0,,,,
3,2B4emTu7YUs,El POLICÍA ARGENTINO que ASESlNÒ a su ESPOSA y...,¡Hola a todos! \nMi nombre es María y cada sem...,2024-05-13T20:34:27Z,25,PT22M10S,UCK_-_4AP8QCJLuoVG8mxPMA,97.0,56634.0,3171.0,es,['https://en.wikipedia.org/wiki/Society'],[Society],No
4,-ETWNxemJWk,Encore une agression verbale...,Voilà ce qu'il s'est passé vendredi dernier. E...,2024-05-27T08:28:49Z,22,PT1M41S,UCYYIOXsDpE2Zy2rjOClx6nA,0.0,7.0,1.0,,['https://en.wikipedia.org/wiki/Society'],[Society],No


In [None]:
df_vid.shape

(3419, 14)

In [None]:
df_video = df_vid[~(df_vid["del"]=="Yes")]

df_video.shape

(2707, 14)

In [None]:
df_video.reset_index(inplace=True, drop=True)
df_video.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2707 entries, 0 to 2706
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   video_id              2707 non-null   object 
 1   title                 2707 non-null   object 
 2   description           2502 non-null   object 
 3   publishedAt           2707 non-null   object 
 4   categoryId            2707 non-null   int64  
 5   duration              2707 non-null   object 
 6   channel_id            2707 non-null   object 
 7   nb_comments           2603 non-null   float64
 8   nb_views              2646 non-null   float64
 9   nb_likes              2631 non-null   float64
 10  defaultAudioLanguage  1744 non-null   object 
 11  topicCategories       2508 non-null   object 
 12  topics                2508 non-null   object 
 13  del                   2508 non-null   object 
dtypes: float64(3), int64(1), object(10)
memory usage: 296.2+ KB


**Handle missing values for nb_comments and nb_likes**

In [None]:
df_video[['nb_comments', 'nb_likes','nb_views']] = df_video[['nb_comments', 'nb_likes','nb_views']].fillna(0)

df_video.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2707 entries, 0 to 2706
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   video_id              2707 non-null   object 
 1   title                 2707 non-null   object 
 2   description           2502 non-null   object 
 3   publishedAt           2707 non-null   object 
 4   categoryId            2707 non-null   int64  
 5   duration              2707 non-null   object 
 6   channel_id            2707 non-null   object 
 7   nb_comments           2707 non-null   float64
 8   nb_views              2707 non-null   float64
 9   nb_likes              2707 non-null   float64
 10  defaultAudioLanguage  1744 non-null   object 
 11  topicCategories       2508 non-null   object 
 12  topics                2508 non-null   object 
 13  del                   2508 non-null   object 
dtypes: float64(3), int64(1), object(10)
memory usage: 296.2+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_video[['nb_comments', 'nb_likes','nb_views']] = df_video[['nb_comments', 'nb_likes','nb_views']].fillna(0)


**Columns type Conversion**

In [None]:
df_video = df_video.astype({ 'nb_comments':'int', 'nb_likes':'int' , 'nb_views':'int' })

df_video[["date_published", "time_published"]] =  df_video["publishedAt"].str.split(pat='T', expand=True)

df_video['time_published'] = df_video['time_published'].str.replace('Z','')
df_video['date_published'] = pd.to_datetime(df_video['date_published'])
df_video['time_published'] = pd.to_datetime(df_video['time_published']).dt.time

df_video.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2707 entries, 0 to 2706
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   video_id              2707 non-null   object        
 1   title                 2707 non-null   object        
 2   description           2502 non-null   object        
 3   publishedAt           2707 non-null   object        
 4   categoryId            2707 non-null   int64         
 5   duration              2707 non-null   object        
 6   channel_id            2707 non-null   object        
 7   nb_comments           2707 non-null   int64         
 8   nb_views              2707 non-null   int64         
 9   nb_likes              2707 non-null   int64         
 10  defaultAudioLanguage  1744 non-null   object        
 11  topicCategories       2508 non-null   object        
 12  topics                2508 non-null   object        
 13  del               

  df_video['time_published'] = pd.to_datetime(df_video['time_published']).dt.time


**Clean the duration column and convert it to time**

In [None]:
def extract_duration(duration_str):
    # Define regular expression pattern and use it
    pattern = r'PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?'
    match = re.match(pattern, duration_str)

    if match:
        hours = int(match.group(1) or 0)
        minutes = int(match.group(2) or 0)
        seconds = int(match.group(3) or 0)
    else:
        hours, minutes, seconds = 0, 0, 0

    duration = datetime.strptime(f'{hours}:{minutes}:{seconds}', '%H:%M:%S').time()

    return duration

df_video['duration_time'] = df_video['duration'].apply(extract_duration)


In [None]:
df_video.head()

Unnamed: 0,video_id,title,description,publishedAt,categoryId,duration,channel_id,nb_comments,nb_views,nb_likes,defaultAudioLanguage,topicCategories,topics,del,date_published,time_published,duration_time
0,TaEuPHkfcS0,وثائقي | جرائم النازية - اختطاف الأطفال | وثائ...,عندما كانوا أطفالًا، اختطفهم النازيون من بولند...,2020-04-27T19:00:11Z,27,PT42M27S,UCET6sWl4Xcu-U8Ka9PJPrwA,613,531167,5466,ar,['https://en.wikipedia.org/wiki/Society'],[Society],No,2020-04-27,19:00:11,00:42:27
1,ESN_wDg0DUo,Man found guilty in 30-year-old cold case invo...,Thirty years after single mother Barbara Brodk...,2023-02-25T15:17:49Z,25,PT1M58S,UCxUD8G1jO8T-Ef2tuADCZOA,15,7068,173,en-CA,['https://en.wikipedia.org/wiki/Society'],[Society],No,2023-02-25,15:17:49,00:01:58
2,7_8V-u9WJXY,Asi es como murieron estos asesinos seriales #...,,2022-04-07T16:44:56Z,20,PT40S,UCJXiGyoWSDJhIJ1ztB0yChQ,2458,4983804,354463,,,,,2022-04-07,16:44:56,00:00:40
3,2B4emTu7YUs,El POLICÍA ARGENTINO que ASESlNÒ a su ESPOSA y...,¡Hola a todos! \nMi nombre es María y cada sem...,2024-05-13T20:34:27Z,25,PT22M10S,UCK_-_4AP8QCJLuoVG8mxPMA,97,56634,3171,es,['https://en.wikipedia.org/wiki/Society'],[Society],No,2024-05-13,20:34:27,00:22:10
4,-ETWNxemJWk,Encore une agression verbale...,Voilà ce qu'il s'est passé vendredi dernier. E...,2024-05-27T08:28:49Z,22,PT1M41S,UCYYIOXsDpE2Zy2rjOClx6nA,0,7,1,,['https://en.wikipedia.org/wiki/Society'],[Society],No,2024-05-27,08:28:49,00:01:41


**Replace categoryId by the definition of the categories retrieved from Youtube API**

In [None]:
def get_video_categories(api_key, region_code='FR', language='en'):

    # Make the API request to retrieve video categories
    request = youtube.videoCategories().list(
        part="snippet",
        regionCode=region_code,
        hl=language
    )
    response = request.execute()

    categories_dict = {}

    for item in response['items']:
        category_id = int(item['id'])
        category_title = item['snippet']['title']
        categories_dict[category_id] = category_title

    return categories_dict

video_categories = get_video_categories(api_key, region_code='FR', language='en')

print(video_categories)


{1: 'Film & Animation', 2: 'Autos & Vehicles', 10: 'Music', 15: 'Pets & Animals', 17: 'Sports', 18: 'Short Movies', 19: 'Travel & Events', 20: 'Gaming', 21: 'Videoblogging', 22: 'People & Blogs', 23: 'Comedy', 24: 'Entertainment', 25: 'News & Politics', 26: 'Howto & Style', 27: 'Education', 28: 'Science & Technology', 30: 'Movies', 31: 'Anime/Animation', 32: 'Action/Adventure', 33: 'Classics', 34: 'Comedy', 35: 'Documentary', 36: 'Drama', 37: 'Family', 38: 'Foreign', 39: 'Horror', 40: 'Sci-Fi/Fantasy', 41: 'Thriller', 42: 'Shorts', 43: 'Shows', 44: 'Trailers'}


In [None]:
# I will convert the keys to int but i wont use it after

int_dict = {int(key): value for key, value in video_categories.items()}

In [None]:
df_video['categories'] = df_video['categoryId'].map(video_categories)

**Handling the missing values in defaultAudioLanguage**

In [None]:
# I will detect the language from the description and if null from the title

df_video['Language'] = df_video.apply(lambda row: detect_language(row["description"]) if pd.isnull(row["defaultAudioLanguage"]) else row["defaultAudioLanguage"] , axis=1)

df_video['Language'] = df_video.apply(lambda row: detect_language(row["title"]) if pd.isnull(row["Language"]) else row["Language"] , axis=1)

In [None]:
df_video.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2707 entries, 0 to 2706
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   video_id              2707 non-null   object        
 1   title                 2707 non-null   object        
 2   description           2502 non-null   object        
 3   publishedAt           2707 non-null   object        
 4   categoryId            2707 non-null   int64         
 5   duration              2707 non-null   object        
 6   channel_id            2707 non-null   object        
 7   nb_comments           2707 non-null   int64         
 8   nb_views              2707 non-null   int64         
 9   nb_likes              2707 non-null   int64         
 10  defaultAudioLanguage  1744 non-null   object        
 11  topicCategories       2508 non-null   object        
 12  topics                2508 non-null   object        
 13  del               

In [None]:
# Save the cleaned version of my video stats

df_video.to_csv("video_stats.csv")

# Comments

## Getting comments from the chosen video

In [4]:
def process_comments(response):
  all_data = []

  for comment in response["items"]:
    data = dict(
        channel_id = comment["snippet"]["channelId"],
        video_id = comment["snippet"]["videoId"],
        comment_id = comment["snippet"]["topLevelComment"]["id"],
        comment = comment["snippet"]["topLevelComment"]["snippet"]["textOriginal"],
        like = comment["snippet"]["topLevelComment"]["snippet"]["likeCount"],
        author = comment["snippet"]["topLevelComment"]["snippet"]["authorDisplayName"],
        publishedAt = comment["snippet"]["topLevelComment"]["snippet"]["publishedAt"],
        parent_id = None
    )
    all_data.append(data)

    if 'replies' in comment.keys():
      for reply in comment['replies']['comments']:
        data = dict(
            channel_id = reply["snippet"]["channelId"],
            video_id = reply["snippet"]["videoId"],
            comment_id = reply["id"],
            comment = reply["snippet"]["textOriginal"],
            author = reply['snippet']['authorDisplayName'],
            like = reply["snippet"]["likeCount"],
            publishedAt = reply["snippet"]["publishedAt"],
            parent_id = reply["snippet"]["parentId"]
        )
        all_data.append(data)

  return all_data

In [5]:
def get_comment_threads(youtube, video_id):

    all_comments = []

    request = youtube.commentThreads().list(
      part = "snippet,replies",
      # allThreadsRelatedToChannelId=channel_id
      videoId = video_id
    )
    response = request.execute()
    all_comments.extend(process_comments(response))

    while response.get("nextPageToken",None):
        request = youtube.commentThreads().list(
          part = "snippet,replies",
          # allThreadsRelatedToChannelId=channel_id,
          videoId = video_id,
          pageToken = response["nextPageToken"]
        )
        response = request.execute()
        all_comments.extend(process_comments(response))

    return all_comments

In [6]:
video_comments = get_comment_threads(youtube, "NV1Kh7wvuAM")

In [7]:
df_comments = pd.DataFrame(video_comments)
df_comments.head()

Unnamed: 0,channel_id,video_id,comment_id,comment,like,author,publishedAt,parent_id
0,UCz8K1occVvDTYDfFo7N5EZw,NV1Kh7wvuAM,UgyF78L-ERyJb0bfyJt4AaABAg,"If you’re ever injured in an accident, you can...",416,@LawAndCrime,2024-04-25T14:25:25Z,
1,UCz8K1occVvDTYDfFo7N5EZw,NV1Kh7wvuAM,UgyF78L-ERyJb0bfyJt4AaABAg.A2eWnwtfiVYA2lZje06vFW,40:51 i think she meant to say “we no longer h...,19,@geminienergy,2024-04-28T08:05:43Z,UgyF78L-ERyJb0bfyJt4AaABAg
2,UCz8K1occVvDTYDfFo7N5EZw,NV1Kh7wvuAM,UgyF78L-ERyJb0bfyJt4AaABAg.A2eWnwtfiVYA2p-20Bnt1k,I absolutely agree with what you actually said...,10,@SH-hu4cl,2024-04-29T16:02:01Z,UgyF78L-ERyJb0bfyJt4AaABAg
3,UCz8K1occVvDTYDfFo7N5EZw,NV1Kh7wvuAM,UgyF78L-ERyJb0bfyJt4AaABAg.A2eWnwtfiVYA2pkRL8WCmw,You are.....,0,@ihateyoutubecomments8100,2024-04-29T23:04:54Z,UgyF78L-ERyJb0bfyJt4AaABAg
4,UCz8K1occVvDTYDfFo7N5EZw,NV1Kh7wvuAM,UgyF78L-ERyJb0bfyJt4AaABAg.A2eWnwtfiVYA2rW_BHL-z8,"I have no need at this time, but you have or s...",3,@cynthiatolman326,2024-04-30T15:33:31Z,UgyF78L-ERyJb0bfyJt4AaABAg


In [13]:
df_comments.drop_duplicates(inplace=True)

In [14]:
df_comments.reset_index(drop=True)
df_comments[df_comments.duplicated(keep=False)]

Unnamed: 0,channel_id,video_id,comment_id,comment,like,author,publishedAt,parent_id


In [15]:
df_comments[df_comments.duplicated(subset=['comment_id'],keep=False)]

Unnamed: 0,channel_id,video_id,comment_id,comment,like,author,publishedAt,parent_id


In [16]:
df_comments.to_csv("video_comments.csv")