<a href="https://colab.research.google.com/github/Us9r/vault/blob/main/scraping/pull_all_comments_and_replies_for_youtube_playlists.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [1]:
from googleapiclient.discovery import build
import pandas as pd
from google.colab import files, drive
import getpass

## User Input

In [2]:
# Prompt for the API key interactively so it is never hardcoded in the notebook source
api_key = getpass.getpass('Please enter your YouTube API key: ')
# IDs of the playlists whose videos will be scanned for comments
playlist_ids = ['PLtLw8q3zeedC8EjDD6xW4NZSIngqftkN9']


Please enter your YouTube API key: ··········


In [3]:
# Build the YouTube Data API v3 client, passing the entered API key as the developer key
youtube = build('youtube', 'v3', developerKey=api_key)

## Get Video IDs for Playlist

In [4]:
def get_all_video_ids_from_playlists(youtube, playlist_ids):
    """Return the video IDs of every item in the given playlists as one flat list.

    Args:
        youtube: API client exposing ``playlistItems().list(...).execute()``.
        playlist_ids: Iterable of playlist ID strings.

    Returns:
        list: Video IDs in playlist order, with the playlists concatenated in
        the order they were given. Duplicates are kept.
    """
    collected_ids = []

    for playlist_id in playlist_ids:
        page_token = None
        more_pages = True

        # Walk the playlist page by page until the API stops handing out a token
        while more_pages:
            response = youtube.playlistItems().list(
                part='contentDetails',
                playlistId=playlist_id,
                maxResults=50,
                pageToken=page_token,
            ).execute()

            page_ids = [entry['contentDetails']['videoId'] for entry in response['items']]
            collected_ids.extend(page_ids)

            page_token = response.get('nextPageToken')
            more_pages = page_token is not None

    return collected_ids

# Collect the video IDs of every playlist in `playlist_ids` into one flat list;
# the comment-collection loop later in the notebook iterates over `video_ids`.
video_ids = get_all_video_ids_from_playlists(youtube, playlist_ids)

In [8]:
# Display the collected video IDs as a quick sanity check
video_ids

['BHkTABfl9fc']

## Get All Comments

In [10]:
# Function to get replies for a specific comment
def get_replies(youtube, parent_id, video_id):
    """Fetch every reply to a top-level comment, following pagination.

    ``comments.list`` returns at most 100 replies per call (20 by default) and
    hands out a ``nextPageToken`` while more remain, so a single request would
    silently drop replies on busy threads. This function keeps requesting
    pages until no token is returned.

    Args:
        youtube: API client exposing ``comments().list(...).execute()``.
        parent_id: ID of the top-level comment whose replies are wanted.
        video_id: ID of the video the thread belongs to; copied into each
            record so replies share the same schema as top-level comments.

    Returns:
        list[dict]: One record per reply with the keys ``Timestamp``,
        ``Username``, ``VideoID``, ``Comment`` and ``Date``.
    """
    replies = []
    next_page_token = None

    while True:
        reply_response = youtube.comments().list(
            part="snippet",
            parentId=parent_id,
            textFormat="plainText",
            maxResults=100,  # largest page size the endpoint accepts
            pageToken=next_page_token
        ).execute()

        for item in reply_response.get('items', []):
            comment = item['snippet']
            replies.append({
                'Timestamp': comment['publishedAt'],
                'Username': comment['authorDisplayName'],
                'VideoID': video_id,
                'Comment': comment['textDisplay'],
                # Fall back to the publish time when no edit time is present
                'Date': comment['updatedAt'] if 'updatedAt' in comment else comment['publishedAt']
            })

        next_page_token = reply_response.get('nextPageToken')
        if not next_page_token:
            break

    return replies


# Function to get all comments (including replies) for a single video
def get_comments_for_video(youtube, video_id):
    """Collect every top-level comment, plus its replies, for one video.

    Pages through ``commentThreads.list`` until no ``nextPageToken`` is
    returned. For each thread that reports at least one reply, the replies
    are fetched through ``get_replies`` and appended right after the
    top-level comment.

    Args:
        youtube: API client exposing ``commentThreads().list(...).execute()``.
        video_id: ID of the video whose comments are wanted.

    Returns:
        list[dict]: One record per comment/reply with the keys ``Timestamp``,
        ``Username``, ``VideoID``, ``Comment`` and ``Date``.
    """
    all_comments = []
    next_page_token = None

    while True:
        comment_request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            pageToken=next_page_token,
            textFormat="plainText",
            # The endpoint documents 1-100 as the valid range; larger values
            # are out of spec, so request the documented maximum instead.
            maxResults=100
        )
        comment_response = comment_request.execute()

        for item in comment_response.get('items', []):
            thread = item['snippet']
            top_comment = thread['topLevelComment']['snippet']
            all_comments.append({
                'Timestamp': top_comment['publishedAt'],
                'Username': top_comment['authorDisplayName'],
                'VideoID': video_id,  # Directly using video_id from function parameter
                'Comment': top_comment['textDisplay'],
                # Fall back to the publish time when no edit time is present
                'Date': top_comment['updatedAt'] if 'updatedAt' in top_comment else top_comment['publishedAt']
            })

            # Fetch replies only when the thread reports any, to save API calls
            if thread['totalReplyCount'] > 0:
                all_comments.extend(get_replies(youtube, thread['topLevelComment']['id'], video_id))

        next_page_token = comment_response.get('nextPageToken')
        if not next_page_token:
            break

    return all_comments

# Accumulate the comment records of every video into a single flat list
all_comments = []
for video_id in video_ids:
    all_comments += get_comments_for_video(youtube, video_id)

# Build the DataFrame once from the collected records (one row per comment/reply)
comments_df = pd.DataFrame(all_comments)


In [11]:
# Preview the first rows of the collected comments
comments_df.head()

Unnamed: 0,Timestamp,Username,VideoID,Comment,Date
0,2024-12-06T03:18:51Z,@AdriantDrian,BHkTABfl9fc,Bus Bus Lintas Sumatera Jawa Kalimantan sudah ...,2024-12-06T03:20:15Z
1,2024-12-05T04:43:30Z,@saddestboy2966,BHkTABfl9fc,rekomendasi power bank untuk starlink dong,2024-12-05T04:43:30Z
2,2024-12-03T21:39:58Z,@Dion-ry8td,BHkTABfl9fc,lanjut bang,2024-12-03T21:39:58Z
3,2024-12-03T06:45:53Z,@tomotomo4066,BHkTABfl9fc,Saya pengin beli tapi mau saya jual lagi. Kala...,2024-12-03T06:45:53Z
4,2024-12-02T15:26:40Z,@nicocs3316,BHkTABfl9fc,"Wahh,,, bagus juga nih untuk yg tinggal di pel...",2024-12-02T15:26:40Z


In [12]:
# Summarize row count, column dtypes and non-null counts
comments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10828 entries, 0 to 10827
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Timestamp  10828 non-null  object
 1   Username   10828 non-null  object
 2   VideoID    10828 non-null  object
 3   Comment    10828 non-null  object
 4   Date       10828 non-null  object
dtypes: object(5)
memory usage: 423.1+ KB


## Output to CSV

In [13]:
# Write the whole dataset (top-level comments and replies) to a CSV file
# in the Colab working directory, without the DataFrame index column.
csv_file = 'comments_with_replies.csv'  # Name your file
comments_df.to_csv(csv_file, index=False)

# Trigger a browser download of the CSV to your local machine.
# `files` is already imported from google.colab in the imports cell at the top.
files.download(csv_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>