In [None]:
import os

from googleapiclient.discovery import build

import time
import json
from typing import Dict
from google.cloud import pubsub_v1

In [None]:
# Point the Google client libraries at a service-account key file.
# setdefault keeps any GOOGLE_APPLICATION_CREDENTIALS value that is already
# configured in the environment; the machine-specific path below is only a fallback.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    os.path.abspath(r"C:\Users\Subrahmanya Joshi\Documents\project_data\credentials\key.json"),
)

In [None]:
class PubSubPublisher:
    """Publishes dictionaries as JSON-encoded messages to a single Pub/Sub topic."""

    def __init__(self, project_id: str, topic_id: str):
        """Create the publisher client and resolve the topic path.

        Args:
            project_id: Google Cloud project that owns the topic.
            topic_id: Identifier of the topic inside that project.
        """
        self.publisher = pubsub_v1.PublisherClient()
        self.topic_path = self.publisher.topic_path(project_id, topic_id)

    def publish(self, message: Dict):
        """Serialize ``message`` to JSON and publish it to the topic.

        Args:
            message: JSON-serializable dictionary to send.

        Returns:
            Whatever the client's ``publish`` call returns (a future). Earlier
            versions discarded it, which made publish failures invisible;
            callers may ignore the return value or wait on it.
        """
        # Pub/Sub payloads must be byte strings, so encode the JSON text as UTF-8.
        payload = json.dumps(message).encode("utf-8")
        return self.publisher.publish(self.topic_path, payload)

In [None]:
# Publisher bound to the topic that receives the collected YouTube comments.
publisher = PubSubPublisher(
    topic_id='yt-comments',
    project_id='text-analysis-323506',
)

In [None]:
# Arguments that need to be passed to the build function.
# An API key needs to be created in the Google Cloud Console - https://console.cloud.google.com/apis/api/youtube.googleapis.com/metrics
# The key is read from the YOUTUBE_API_KEY environment variable so it does not
# have to be stored in the notebook; it falls back to an empty string when unset.
DEVELOPER_KEY = os.environ.get("YOUTUBE_API_KEY", '')
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"

# Create the YouTube resource object used by every request below.
youtube_object = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=DEVELOPER_KEY)

In [None]:
# Free-text query that is sent to the YouTube search request.
keyword = 'Elon Musk'

In [None]:
# Search for the keyword.
# search.list accepts maxResults between 0 and 50, so request the maximum
# allowed page size instead of 100 (which is outside the documented range).
search_keyword = youtube_object.search().list(
    q=keyword,
    part="id,snippet",
    maxResults=50,
).execute()
# Fall back to an empty list when the response carries no "items" key.
results = search_keyword.get("items", [])

### Extract information about videos, playlists, channels

In [None]:
videos = []
playlists = []
channels = []

# Map every handled search-result kind to the list that collects it and to the
# name of the field under result['id'] that holds its identifier.
_targets_by_kind = {
    "youtube#video": (videos, "videoId"),
    "youtube#playlist": (playlists, "playlistId"),
    "youtube#channel": (channels, "channelId"),
}

for result in results:
    target = _targets_by_kind.get(result['id']['kind'])
    if target is None:
        # Result kinds other than video / playlist / channel are ignored.
        continue

    bucket, id_field = target
    snippet = result['snippet']
    # NOTE: the 'decription' key spelling is kept as-is because it is part of
    # the produced records; 'url' holds the default thumbnail URL.
    bucket.append({
        'title': snippet['title'],
        'id': result['id'][id_field],
        'decription': snippet['description'],
        'url': snippet['thumbnails']['default']['url'],
    })

In [None]:
# Report how many search results landed in each category, one per line.
print(
    f"Videos: {len(videos)}",
    f"Playlists: {len(playlists)}",
    f"Channels: {len(channels)}",
    sep="\n",
)

### Extract Comments and replies from videos

In [None]:
def get_comment_threads(youtube, video_id, nextPageToken):
    """Fetch one page of comment threads for a video.

    Args:
        youtube: Client object exposing ``commentThreads().list(...).execute()``.
        video_id: Identifier of the video whose comment threads are requested.
        nextPageToken: Page token forwarded as ``pageToken`` in the request.

    Returns:
        The response returned by executing the request.
    """
    request_params = {
        "part": "snippet,replies",
        "maxResults": 100,
        "videoId": video_id,
        "textFormat": "plainText",
        "pageToken": nextPageToken,
    }
    request = youtube.commentThreads().list(**request_params)
    return request.execute()

In [None]:
def _comment_record(snippet, video_id):
    """Flatten one comment ``snippet`` into the record format used downstream."""
    # The author channel fields are read defensively: when 'authorChannelId'
    # or 'authorChannelUrl' is absent, the corresponding value becomes None
    # instead of raising KeyError and aborting the whole page.
    author_channel = snippet.get('authorChannelId') or {}
    return {
        'timestamp': snippet['publishedAt'],
        'text': snippet['textDisplay'],
        'user_name': snippet['authorDisplayName'],
        'user_id': author_channel.get('value'),
        'user_profile': snippet.get('authorChannelUrl'),
        'video_id': video_id,
    }


def load_comments(video_response, video_id):
    """Extract top-level comments and their replies from one response page.

    Args:
        video_response: Response dict of a comment-threads request; its
            'items' list is iterated (a missing 'items' key yields no records).
        video_id: Identifier stored in every produced record.

    Returns:
        A list of dicts with the keys 'timestamp', 'text', 'user_name',
        'user_id', 'user_profile' and 'video_id'. Each top-level comment is
        followed by the replies included in the same item.
    """
    comms = []
    for item in video_response.get('items', []):
        thread = item['snippet']

        # Top-level comment of the thread.
        comms.append(_comment_record(thread['topLevelComment']['snippet'], video_id))

        # Replies are only read when the thread reports at least one reply and
        # the item actually carries a 'replies' section.
        if thread['totalReplyCount'] > 0 and 'replies' in item:
            for reply in item['replies']['comments']:
                comms.append(_comment_record(reply['snippet'], video_id))
    return comms

In [None]:
comments = []

# Only the first video of the search results is processed (the slice keeps one entry).
for video in videos[:1]:
    video_id = video['id']

    # Start with an empty page token for the first page, then keep requesting
    # pages for as long as the response carries a nextPageToken.
    next_page_token = ''
    while True:
        video_response = get_comment_threads(youtube_object, video_id, next_page_token)
        comments.extend(load_comments(video_response, video_id))
        next_page_token = video_response.get("nextPageToken", None)
        if not next_page_token:
            break

In [None]:
# Display how many records (comments and replies) were collected above.
len(comments)

### Publish comments to PubSub topic

In [None]:
# Send the collected records to the topic one at a time, echoing each text
# and pausing five seconds between messages.
for payload in comments:
    print(payload['text'])
    publisher.publish(message=payload)
    time.sleep(5)