# GetYoutubeComments

This notebook extracts and processes YouTube comments using the *YouTube Data API v3*. First, the video IDs for the different election cycles are defined. Then the API is used to retrieve the comments and the video metadata, and the extracted data is stored in CSV files for each election cycle.

*Important: An API key is required to execute the code (see .env.example)*

# Imports, API and Videos

In [1]:
# Import Modules
import googleapiclient.discovery
import pandas as pd
import csv
import os
from dotenv import load_dotenv

In [2]:
# API credentials
api_service_name = 'youtube'
api_version = 'v3'

# Load environment variables from a local .env file (see .env.example)
load_dotenv()
DEVELOPER_KEY = os.getenv('DEVELOPER_KEY')

# Fail fast with a clear message: without a key the client below would still
# be built, and the problem would only surface later as failing API requests.
if not DEVELOPER_KEY:
    raise RuntimeError(
        'DEVELOPER_KEY is not set. Add it to your .env file (see .env.example).'
    )

# Initialize the YouTube Data API client
youtube = googleapiclient.discovery.build(api_service_name, api_version, developerKey=DEVELOPER_KEY)


In [3]:
# Single video ID used to try out the pipeline before running the per-year lists
video_test = ['u7JMhVI7taQ']

In [4]:
# List per year with all videos that meet the defined criteria.

# videos_2016 = ['SHG0ezLiVGc', '-nQGBZQrtT0','qVMW_1aZXRk','--IS0XiNdpk','-rSDUsMwakI','Qg0pO9VG1J8','BG-_ZDrypec','0SLfCkXDAf8','iXZLnhHhW8A','WLYHu0AG8GI','s7gDXtRS0jo','RXMp9fBomJw','TlqKFlU7YAs','UiWY0iRLV94','9mYVi7WHyiU','nRp1CK_X_Yw','Qsvy10D5rtc','bEskg0Z-NAQ','pfmwGAd1L-o','_tOhUmNpTJE','-TTiDlK4vS8','Pe52Tpjo3AE','pjS6OdY2dBQ', 'e788fOzSSek','9Z4lQ4k_BQ8','khK9fIgoNjQ','KJ1FLhcMDMM','-sslP9LInEM']
# videos_2020 = ['Wsije1KetVw','aj7ELbw8FxM','Oj9GpZB9cuc','bPiofmZGb8o','wW1lY5jFNcQ','xI_lxFv203I','LyC855KdBKo','UCA1A5GqCdQ','ozGr4IsTUng','yGPfKkjDIts','cMz_sTgoydQ','moOxq_8l_34','7Icu6qupf40','ajavsMbCapY','5cathmZFeXs','K8Z9Kqhrh5c','un9x-DjTMT0','P69xdkqBJno','CUJhtZZWZZI','-ZDZtBRTyeI','idPv9zAkL48','U4nBnuv9n9o','AytDzZ2ecCc','c2ScxGsB-ks','Wbj5sDWluEA','xH9Cq3d67fc','7ovJyMwGbb8','UXR_bqyAy4E','CPYFiaL6cTE','mreNn_2-QJ8','KdJ8SDOFrdw']
# videos_2024 = ['g34CFjEFZts','b607aDHUu2I','LS8amqeDbS8','cJKtmTNf8qU','XLiagIdA84c','WI9fbbQ-aTo','oV4sdCSB6ME','5JxELubSgJg','LIV9HM5Yj80','e-1pJkuFTpA','RMprlMfsHAU','eFCiva4m3Zk','uT9s4BXcv6w','qqG96G8YdcE','gK6MD_8S01o','SGRydccYp0c','fmZJ7lhroA8','m1YwZfgJW0s','mrUAHQkLKtY','3SJr44m-w1Y','-v-8wJkmwBY','VgsC_aBquUE','kRh6598RmHM']

# Shortened to one video per year for testing. These assignments are the ones
# actually in effect; the full per-year lists above are commented out.
videos_2016 = ['SHG0ezLiVGc']
videos_2020 = ['Wsije1KetVw']
videos_2024 = ['g34CFjEFZts']

# Configuration of YouTube Data API
The first function (check_comments) checks whether the comments for a video are publicly available. The second function (get_comments_by_video_id) retrieves the comments, including replies, for the passed list of videos. The third function (get_video_details_by_video_ids) retrieves the metadata for the passed videos.

In [5]:
# Check if comments are available for the given video IDs
def check_comments(video_ids):
    """Split video IDs by whether at least one comment thread can be fetched.

    For every ID a minimal ``commentThreads().list`` request (``id`` part,
    one result) is sent through the module-level ``youtube`` client.

    Args:
        video_ids: Iterable of YouTube video ID strings.

    Returns:
        A tuple ``(with_comments, without_comments)`` of lists. An ID lands in
        the second list when the response contains no items or when the
        request fails with a message containing 'disabled comments' or
        'comments are turned off'. IDs failing with any other error are
        printed and appear in neither list.
    """
    has_comments = []
    no_comments = []

    for video_id in video_ids:
        # One result is enough to find out whether any comment is available.
        probe = youtube.commentThreads().list(part='id', videoId=video_id, maxResults=1)

        try:
            response = probe.execute()
            found = 'items' in response and len(response['items']) > 0
            target = has_comments if found else no_comments
            target.append(video_id)
        except Exception as e:
            # A failing request may simply mean that comments are disabled.
            if 'disabled comments' not in str(e) and 'comments are turned off' not in str(e):
                print(f'Error with video ID {video_id}: {e}')
                continue
            no_comments.append(video_id)

    return has_comments, no_comments

In [6]:
# Function to retrieve comments and replies for a list of video IDs

# Column order of the comment table; also used when no comment was collected.
_COMMENT_COLUMNS = [
    'comment_ID',
    'parent_ID',
    'video_ID',
    'author',
    'text',
    'like_count',
    'published_At',
]


def _comment_row(comment_id, parent_id, video_id, snippet):
    """Build one output row from the ``snippet`` dict of a comment resource."""
    return {
        'comment_ID': comment_id,
        'parent_ID': parent_id,
        'video_ID': video_id,
        'author': snippet.get('authorDisplayName'),
        'text': snippet.get('textDisplay'),
        'like_count': snippet.get('likeCount'),
        'published_At': snippet.get('publishedAt'),
    }


def _fetch_all_replies(youtube, parent_comment_id):
    """Return all replies to one top-level comment via paginated ``comments().list``."""
    replies = []
    request = youtube.comments().list(part='snippet', parentId=parent_comment_id, maxResults=100)
    while request:
        response = request.execute()
        replies.extend(response.get('items', []))
        request = youtube.comments().list_next(request, response)
    return replies


def get_comments_by_video_id(video_ids, DEVELOPER_KEY, *, fetch_all_replies=False):
    """Collect top-level comments and replies for a list of videos.

    Every comment thread of every video is paged through with
    ``commentThreads().list`` (100 threads per page). Each top-level comment
    and each reply becomes one row; replies reference their top-level comment
    through ``parent_ID``, top-level comments have ``parent_ID`` set to None.

    Note: according to the YouTube Data API documentation, the replies
    embedded in a comment thread may be only a subset of all replies. Pass
    ``fetch_all_replies=True`` to fetch the complete reply list of such
    threads with additional ``comments().list`` requests (costs extra quota).

    Args:
        video_ids: Iterable of YouTube video ID strings.
        DEVELOPER_KEY: API key used to build the YouTube client.
        fetch_all_replies: If True, threads whose ``totalReplyCount`` exceeds
            the number of embedded replies are completed via
            ``comments().list``. Defaults to False (embedded replies only).

    Returns:
        A DataFrame with the columns comment_ID, parent_ID, video_ID, author,
        text, like_count and published_At. The columns are present even when
        no comment was collected.

    An ``HttpError`` for a video is printed and the video is skipped; rows
    collected for that video before the error are kept.
    """
    # Materialise once so that generators work and the length is known.
    video_ids = list(video_ids)
    if not video_ids:
        # Nothing to fetch: skip building a client, but keep the table schema
        # so that downstream column access and CSV export still work.
        return pd.DataFrame(columns=_COMMENT_COLUMNS)

    results = []
    total_videos = len(video_ids)
    youtube = googleapiclient.discovery.build('youtube', 'v3', developerKey=DEVELOPER_KEY)

    for idx, video_id in enumerate(video_ids, start=1):
        print(f'Video {idx}/{total_videos} - ID: {video_id}')

        try:
            request = youtube.commentThreads().list(
                part='snippet,replies',
                videoId=video_id,
                maxResults=100
            )

            while request:
                response = request.execute()

                for item in response.get('items', []):
                    # Top-level (parent) comment
                    top_level = item['snippet']['topLevelComment']
                    parent_comment_id = top_level['id']
                    results.append(
                        _comment_row(parent_comment_id, None, video_id, top_level['snippet'])
                    )

                    # Replies to the top-level comment
                    replies = item.get('replies', {}).get('comments', [])
                    if fetch_all_replies and item['snippet'].get('totalReplyCount', 0) > len(replies):
                        replies = _fetch_all_replies(youtube, parent_comment_id)

                    results.extend(
                        _comment_row(reply['id'], parent_comment_id, video_id, reply['snippet'])
                        for reply in replies
                    )

                # Next page; list_next returns None after the last page
                request = youtube.commentThreads().list_next(request, response)

        except googleapiclient.errors.HttpError as e:
            print(f'Fehler bei Video ID {video_id}: {e}')
            continue

    # Explicit columns keep the schema stable even if no row was collected.
    return pd.DataFrame(results, columns=_COMMENT_COLUMNS)

In [7]:
# Function to retrieve video details by video IDs
# Column order of the video metadata table; also used when the input is empty.
_VIDEO_COLUMNS = [
    'video_Name',
    'video_ID',
    'publisher',
    'view_Count',
    'comment_Count',
    'published_at',
]


def get_video_details_by_video_ids(video_ids, DEVELOPER_KEY):
    """Fetch title, channel and basic statistics for each video ID.

    One ``videos().list`` request (parts ``snippet,statistics``) is sent per
    video ID.

    Args:
        video_ids: Iterable of YouTube video ID strings.
        DEVELOPER_KEY: API key used to build the YouTube client.

    Returns:
        A DataFrame with one row per requested ID and the columns video_Name,
        video_ID, publisher, view_Count, comment_Count and published_at.
        Missing view/comment counts default to 0. IDs for which the API
        returns no item get a placeholder row ('Not Found' / 'N/A'). The
        columns are present even when ``video_ids`` is empty.
    """
    # Materialise once so that any iterable can be checked for emptiness.
    video_ids = list(video_ids)
    if not video_ids:
        # Nothing to look up: skip building a client, but keep the schema.
        return pd.DataFrame(columns=_VIDEO_COLUMNS)

    youtube = googleapiclient.discovery.build('youtube', 'v3', developerKey=DEVELOPER_KEY)

    results = []

    for video_id in video_ids:
        request = youtube.videos().list(
            part='snippet,statistics',
            id=video_id
        )
        response = request.execute()
        items = response.get('items')

        if not items:
            # The video was not found
            results.append({
                'video_Name': 'Not Found',
                'video_ID': video_id,
                'publisher': 'N/A',
                'view_Count': 'N/A',
                'comment_Count': 'N/A',
                'published_at': 'N/A'
            })
            continue

        snippet = items[0]['snippet']
        statistics = items[0]['statistics']

        results.append({
            'video_Name': snippet['title'],
            'video_ID': video_id,
            'publisher': snippet['channelTitle'],
            'view_Count': statistics.get('viewCount', 0),
            'comment_Count': statistics.get('commentCount', 0),
            'published_at': snippet['publishedAt']
        })

    # Explicit columns keep the schema stable regardless of the rows collected.
    return pd.DataFrame(results, columns=_VIDEO_COLUMNS)

# Get YouTube Comments and Video Details for the different videos in the predefined lists
The comments are retrieved with the previously defined functions (first for the test video, then for the three election cycles) and finally saved in individual CSV files.

### Testing with just one Video

In [8]:
# Split the test video list into IDs with and without retrievable comments
video_test_with_comments, video_test_without_comments = check_comments(video_test)

# Note: the second list also contains videos for which no comment thread was
# returned, not only videos whose comments are disabled (see check_comments).
print('Videos with public comments:', video_test_with_comments)
print('Videos with disabled comments:', video_test_without_comments)

Videos with public comments: ['u7JMhVI7taQ']
Videos with disabled comments: []


In [9]:
# Both helpers already return a DataFrame, so no extra pd.DataFrame() wrapper is needed.
df_test = get_comments_by_video_id(video_test_with_comments, DEVELOPER_KEY)
df_test_videos = get_video_details_by_video_ids(video_test_with_comments, DEVELOPER_KEY)

Video 1/1 - ID: u7JMhVI7taQ


In [10]:
# Replace line breaks inside the comment text with a space so that they do not
# end up in the export. The vectorised .str accessor leaves missing texts
# (None/NaN) untouched instead of raising AttributeError.
df_test['text'] = df_test['text'].str.replace(r'[\r\n]', ' ', regex=True)
# 1-based running number for every comment row
df_test['custom_index'] = range(1, len(df_test) + 1)

In [11]:
# Make sure the export directory exists; to_csv does not create it.
os.makedirs('Export_csv', exist_ok=True)

# Comments: explicit column order, pandas index not written
df_test.to_csv(
    'Export_csv/test_comments.csv',
    encoding='utf-8',
    quoting=csv.QUOTE_ALL,
    index=False,
    columns=['custom_index', 'comment_ID', 'parent_ID', 'video_ID', 'author', 'text', 'like_count', 'published_At'],
)
# Video metadata: written with the default pandas index as first column
df_test_videos.to_csv('Export_csv/test_videos.csv', encoding='utf-8', quoting=csv.QUOTE_ALL)

## 2016

In [12]:
# Split the 2016 video list into IDs with and without retrievable comments
videos_with_comments_2016, videos_without_comments_2016 = check_comments(videos_2016)

# Note: the second list also contains videos for which no comment thread was
# returned, not only videos whose comments are disabled (see check_comments).
print('Videos with public comments:', videos_with_comments_2016)
print('Videos with disabled comments:', videos_without_comments_2016)

Videos with public comments: ['SHG0ezLiVGc']
Videos with disabled comments: []


In [13]:
# 2016 comments (the helper already returns a DataFrame, no re-wrapping needed)
df_2016_comments = get_comments_by_video_id(videos_with_comments_2016, DEVELOPER_KEY)
df_2016_comments.info()

# 2016 video metadata
df_2016_videos = get_video_details_by_video_ids(videos_with_comments_2016, DEVELOPER_KEY)

Video 1/1 - ID: SHG0ezLiVGc
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2825 entries, 0 to 2824
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   comment_ID    2825 non-null   object
 1   parent_ID     425 non-null    object
 2   video_ID      2825 non-null   object
 3   author        2825 non-null   object
 4   text          2825 non-null   object
 5   like_count    2825 non-null   int64 
 6   published_At  2825 non-null   object
dtypes: int64(1), object(6)
memory usage: 154.6+ KB


In [14]:
# Replace line breaks inside the comment text with a space so that they do not
# end up in the export. The vectorised .str accessor leaves missing texts
# (None/NaN) untouched instead of raising AttributeError.
df_2016_comments['text'] = df_2016_comments['text'].str.replace(r'[\r\n]', ' ', regex=True)
# 1-based running number for every comment row
df_2016_comments['custom_index'] = range(1, len(df_2016_comments) + 1)

In [15]:
# Make sure the export directory exists; to_csv does not create it.
os.makedirs('Export_csv/YouTube_Exporte', exist_ok=True)

# Comments: explicit column order, pandas index not written
df_2016_comments.to_csv(
    'Export_csv/YouTube_Exporte/export_comments_2016.csv',
    encoding='utf-8',
    quoting=csv.QUOTE_ALL,
    index=False,
    columns=['custom_index', 'comment_ID', 'parent_ID', 'video_ID', 'author', 'text', 'like_count', 'published_At'],
)
# Video metadata: written with the default pandas index as first column
df_2016_videos.to_csv('Export_csv/YouTube_Exporte/export_videos_2016.csv', encoding='utf-8', quoting=csv.QUOTE_ALL)

## 2020 

In [16]:
# Split the 2020 video list into IDs with and without retrievable comments
videos_with_comments_2020, videos_without_comments_2020 = check_comments(videos_2020)

# Note: the second list also contains videos for which no comment thread was
# returned, not only videos whose comments are disabled (see check_comments).
print('Videos with public comments:', videos_with_comments_2020)
print('Videos with disabled comments:', videos_without_comments_2020)

Videos with public comments: ['Wsije1KetVw']
Videos with disabled comments: []


In [17]:
# 2020 comments (the helper already returns a DataFrame, no re-wrapping needed)
df_2020_comments = get_comments_by_video_id(videos_with_comments_2020, DEVELOPER_KEY)
df_2020_comments.info()

# 2020 video metadata
df_2020_videos = get_video_details_by_video_ids(videos_with_comments_2020, DEVELOPER_KEY)

Video 1/1 - ID: Wsije1KetVw
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53648 entries, 0 to 53647
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   comment_ID    53648 non-null  object
 1   parent_ID     12082 non-null  object
 2   video_ID      53648 non-null  object
 3   author        53648 non-null  object
 4   text          53648 non-null  object
 5   like_count    53648 non-null  int64 
 6   published_At  53648 non-null  object
dtypes: int64(1), object(6)
memory usage: 2.9+ MB


In [18]:
# Replace line breaks inside the comment text with a space so that they do not
# end up in the export. The vectorised .str accessor leaves missing texts
# (None/NaN) untouched instead of raising AttributeError.
df_2020_comments['text'] = df_2020_comments['text'].str.replace(r'[\r\n]', ' ', regex=True)
# 1-based running number for every comment row
df_2020_comments['custom_index'] = range(1, len(df_2020_comments) + 1)

In [19]:
# Make sure the export directory exists; to_csv does not create it.
os.makedirs('Export_csv/YouTube_Exporte', exist_ok=True)

# Comments: explicit column order, pandas index not written
df_2020_comments.to_csv(
    'Export_csv/YouTube_Exporte/export_comments_2020.csv',
    encoding='utf-8',
    quoting=csv.QUOTE_ALL,
    index=False,
    columns=['custom_index', 'comment_ID', 'parent_ID', 'video_ID', 'author', 'text', 'like_count', 'published_At'],
)
# Video metadata: written with the default pandas index as first column
df_2020_videos.to_csv('Export_csv/YouTube_Exporte/export_videos_2020.csv', encoding='utf-8', quoting=csv.QUOTE_ALL)

## 2024

In [20]:
# Split the 2024 video list into IDs with and without retrievable comments
videos_with_comments_2024, videos_without_comments_2024 = check_comments(videos_2024)

# Note: the second list also contains videos for which no comment thread was
# returned, not only videos whose comments are disabled (see check_comments).
print('Videos with public comments:', videos_with_comments_2024)
print('Videos with disabled comments:', videos_without_comments_2024)

Videos with public comments: ['g34CFjEFZts']
Videos with disabled comments: []


In [21]:
# 2024 comments (the helper already returns a DataFrame, no re-wrapping needed)
df_2024_comments = get_comments_by_video_id(videos_with_comments_2024, DEVELOPER_KEY)
df_2024_comments.info()

# 2024 video metadata
df_2024_videos = get_video_details_by_video_ids(videos_with_comments_2024, DEVELOPER_KEY)

Video 1/1 - ID: g34CFjEFZts
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5657 entries, 0 to 5656
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   comment_ID    5657 non-null   object
 1   parent_ID     1573 non-null   object
 2   video_ID      5657 non-null   object
 3   author        5657 non-null   object
 4   text          5657 non-null   object
 5   like_count    5657 non-null   int64 
 6   published_At  5657 non-null   object
dtypes: int64(1), object(6)
memory usage: 309.5+ KB


In [22]:
# Replace line breaks inside the comment text with a space so that they do not
# end up in the export. The vectorised .str accessor leaves missing texts
# (None/NaN) untouched instead of raising AttributeError.
df_2024_comments['text'] = df_2024_comments['text'].str.replace(r'[\r\n]', ' ', regex=True)
# 1-based running number for every comment row
df_2024_comments['custom_index'] = range(1, len(df_2024_comments) + 1)

In [23]:
# Make sure the export directory exists; to_csv does not create it.
os.makedirs('Export_csv/YouTube_Exporte', exist_ok=True)

# Comments: explicit column order, pandas index not written
df_2024_comments.to_csv(
    'Export_csv/YouTube_Exporte/export_comments_2024.csv',
    encoding='utf-8',
    quoting=csv.QUOTE_ALL,
    index=False,
    columns=['custom_index', 'comment_ID', 'parent_ID', 'video_ID', 'author', 'text', 'like_count', 'published_At'],
)
# Video metadata: written with the default pandas index as first column
df_2024_videos.to_csv('Export_csv/YouTube_Exporte/export_videos_2024.csv', encoding='utf-8', quoting=csv.QUOTE_ALL)