# Fetching YouTube Comments for COVID-19 Mask-Wearing Analysis (US)

This notebook fetches YouTube comments on COVID-19-related videos posted by popular US news publishers (ABC News, CNN, Fox News). Comments that mention wearing masks are collected so that they can later be analyzed for sentiment and emotional intensity related to mask-wearing.

## Import Libraries

In [None]:
import os
import re
import pandas as pd
from googleapiclient.discovery import build

## Define YouTube Comments Fetcher Class

In [None]:
class YouTubeCommentsFetcher:
    """Singleton client that collects mask-related YouTube comments.

    The class wraps a YouTube Data API v3 client (created with ``build``)
    and exposes helpers to search a channel for videos and to download the
    comments of a video, keeping only comments that mention both "wear" and
    "mask" and that were published inside ``COMMENT_DATE_RANGE``.

    Only one instance exists per process. Calling the constructor again with
    the same key returns that instance unchanged; calling it with a different
    key rebuilds the API client on the shared instance so that a corrected
    key actually takes effect.
    """

    _instance = None

    # Channel titles whose videos are eligible for comment collection.
    ALLOWED_PUBLISHERS = ('CNN', 'ABC News', 'Fox News')
    # Inclusive window (ISO ``YYYY-MM-DD`` strings) for comment publish dates.
    COMMENT_DATE_RANGE = ('2020-01-01', '2022-12-31')

    def __new__(cls, api_key: str):
        """Return the shared fetcher, (re)building the API client if needed.

        Args:
            api_key: Developer key passed to ``build('youtube', 'v3', ...)``.
        """
        existing = cls._instance
        if existing is not None and existing.api_key == api_key:
            return existing

        # Build the client before publishing the instance so that a failure
        # in build() never leaves a half-initialised singleton behind.
        client = build('youtube', 'v3', developerKey=api_key)
        instance = existing if existing is not None else super().__new__(cls)
        instance.api_key = api_key
        instance.youtube = client
        cls._instance = instance
        return instance

    def _extract_video_id(self, url):
        """Return the video ID embedded in ``url``, or ``None`` if absent.

        Recognises a ``v=`` query parameter as well as ``youtu.be/<id>``,
        ``/embed/<id>``, ``/shorts/<id>`` and ``/live/<id>`` paths.
        """
        if not isinstance(url, str):
            return None
        patterns = (
            # "v" must be a whole parameter name, so "rev=..." is not matched.
            r'(?:^|[?&])v=([a-zA-Z0-9_-]+)',
            r'(?:youtu\.be/|/(?:embed|shorts|live)/)([a-zA-Z0-9_-]+)',
        )
        for pattern in patterns:
            match = re.search(pattern, url)
            if match:
                return match.group(1)
        return None

    @staticmethod
    def _is_mask_comment(comment):
        """Return True when the text mentions both "wear" and "mask"."""
        lowered = str(comment).lower()
        return "wear" in lowered and "mask" in lowered

    def _is_relevant(self, comment_text, comment_date):
        """Return True when a comment passes the keyword and date filters."""
        start, end = self.COMMENT_DATE_RANGE
        # ISO dates compare correctly as plain strings.
        return self._is_mask_comment(comment_text) and start <= comment_date <= end

    @staticmethod
    def _make_row(country, news_publisher, event_date, video_url, video_title,
                  comment_text, comment_timestamp):
        """Build one output record in the column layout used by the notebook."""
        return {
            'country': country,
            'news_publisher': news_publisher,
            'event_date': event_date,
            'news_url': video_url,
            'news_title': video_title,
            'comment_text': comment_text,
            'comment_timestamp': comment_timestamp
        }

    def get_video_details(self, video_id):
        """Look up title, channel title and publish date of a video.

        Returns:
            A ``(video_title, news_publisher, event_date)`` tuple where
            ``event_date`` is the first 10 characters of ``publishedAt``,
            or ``(None, None, None)`` when the API returns no items.
        """
        video_request = self.youtube.videos().list(part="snippet", id=video_id)
        video_response = video_request.execute()
        items = video_response.get('items')
        if items:
            snippet = items[0]['snippet']
            return snippet['title'], snippet['channelTitle'], snippet['publishedAt'][:10]
        return None, None, None

    def get_replies(self, parent_id, video_id, video_title, country, news_publisher, event_date, video_url):
        """Fetch the replies to one top-level comment, applying the same filters.

        Args:
            parent_id: ID of the top-level comment whose replies are listed.
            video_id: ID of the video; accepted for call-site symmetry and
                not used in the request.
            video_title, country, news_publisher, event_date, video_url:
                Values copied verbatim into every returned record.

        Returns:
            A list of record dicts (same keys as top-level comment records)
            for replies that pass the keyword and date filters.
        """
        replies = []
        page_token = None
        while True:
            response = self.youtube.comments().list(
                part="snippet",
                parentId=parent_id,
                pageToken=page_token,
                textFormat="plainText",
                maxResults=100
            ).execute()

            for item in response.get('items', []):
                reply = item['snippet']
                reply_text = reply['textDisplay']
                reply_date = reply['publishedAt'][:10]
                if self._is_relevant(reply_text, reply_date):
                    replies.append(self._make_row(
                        country, news_publisher, event_date, video_url,
                        video_title, reply_text, reply_date))

            page_token = response.get('nextPageToken')
            if not page_token:
                break
        return replies

    def get_comments_for_video(self, video_url, country, include_replies=False):
        """Collect the mask-related comments of one video.

        Args:
            video_url: URL of the video.
            country: Label stored in the ``country`` field of every record.
            include_replies: When True, replies to each *kept* top-level
                comment are fetched as well (via ``get_replies``).

        Returns:
            A list of record dicts. The list is empty when the video's
            channel title is not in ``ALLOWED_PUBLISHERS`` or when no
            comment passes the filters.

        Raises:
            ValueError: If no video ID can be extracted from ``video_url``
                or the video details cannot be retrieved.
        """
        video_id = self._extract_video_id(video_url)
        if not video_id:
            raise ValueError("Invalid video URL")

        video_title, news_publisher, event_date = self.get_video_details(video_id)
        if not video_title:
            raise ValueError("Unable to retrieve video details")

        if news_publisher not in self.ALLOWED_PUBLISHERS:
            return []

        all_comments = []
        next_page_token = None

        while True:
            comment_request = self.youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                pageToken=next_page_token,
                textFormat="plainText",
                maxResults=100
            )
            comment_response = comment_request.execute()

            for item in comment_response['items']:
                top_comment = item['snippet']['topLevelComment']['snippet']
                comment_text = top_comment['textDisplay']
                comment_timestamp = top_comment['publishedAt'][:10]

                if not self._is_relevant(comment_text, comment_timestamp):
                    continue

                all_comments.append(self._make_row(
                    country, news_publisher, event_date, video_url,
                    video_title, comment_text, comment_timestamp))

                if include_replies and item['snippet']['totalReplyCount'] > 0:
                    all_comments.extend(self.get_replies(
                        item['snippet']['topLevelComment']['id'], video_id,
                        video_title, country, news_publisher, event_date, video_url))

            next_page_token = comment_response.get('nextPageToken')
            if not next_page_token:
                break

        return all_comments

    def search_videos_by_channel_and_keyword(self, channel_id, keyword, max_videos=30):
        """Search one channel for videos matching ``keyword``.

        Args:
            channel_id: ID of the channel to search.
            keyword: Query string passed as ``q``.
            max_videos: Upper bound on the number of videos returned.

        Returns:
            A list of dicts with keys ``video_id``, ``video_title`` and
            ``video_url``, at most ``max_videos`` long.
        """
        videos = []
        next_page_token = None

        while len(videos) < max_videos:
            search_request = self.youtube.search().list(
                part="snippet",
                channelId=channel_id,
                q=keyword,
                # Never ask for more results than are still needed.
                maxResults=min(50, max_videos - len(videos)),
                pageToken=next_page_token,
                type="video"
            )
            search_response = search_request.execute()

            for video in search_response['items']:
                video_id = video['id']['videoId']
                videos.append({
                    'video_id': video_id,
                    'video_title': video['snippet']['title'],
                    'video_url': f"https://www.youtube.com/watch?v={video_id}"
                })
                if len(videos) >= max_videos:
                    break

            next_page_token = search_response.get('nextPageToken')
            if not next_page_token:
                break

        return videos

## Initialize Fetcher

In [None]:
# Read the key from the YOUTUBE_API_KEY environment variable when it is set,
# so the real key does not have to be saved inside the notebook. The
# placeholder remains the fallback for anyone who edits the value in place.
api_key = os.environ.get('YOUTUBE_API_KEY', 'YOUR_API_KEY')
fetcher = YouTubeCommentsFetcher(api_key)

## Define Channel ID, Keyword, and Country

In [None]:
# Label written to the `country` field of every collected comment record.
country = 'US'
# Query string used when searching the channel for videos.
keyword = 'covid mask'
# Channel to search; replace the placeholder with a real channel ID.
news_channel_id = 'YOUR_NEWS_CHANNEL_ID'

## Search Videos by Channel and Keyword

In [None]:
# Ask the fetcher for videos on the configured channel that match the keyword.
news_videos = fetcher.search_videos_by_channel_and_keyword(news_channel_id, keyword)
# Reduce the search results to the set of distinct video URLs.
searched_videos_urls = {entry['video_url'] for entry in news_videos}

## Fetch Comments

In [None]:
# Load (or create) the CSV that accumulates collected comments across runs.
csv_file = 'mask_comments_us.csv'
if os.path.exists(csv_file):
    news_df = pd.read_csv(csv_file)
    print('Original dataframe shape:', news_df.shape)
    news_urls = news_df['news_url'].tolist()
else:
    news_df = pd.DataFrame(columns=['country', 'news_publisher', 'event_date', 'news_url', 'news_title', 'comment_text', 'comment_timestamp'])
    news_df.to_csv(csv_file, index=False)
    news_urls = []

# Load the list of URLs that previously yielded no usable comments.
no_comments_file = 'junk_us_news_url.csv'
if os.path.exists(no_comments_file):
    no_comments_url_df = pd.read_csv(no_comments_file)
else:
    no_comments_url_df = pd.DataFrame(columns=['news_url'])

# Build the skip-set once: set membership is O(1), whereas the comment CSV
# holds one row per comment and would be rescanned for every video.
already_processed_urls = set(news_urls) | set(no_comments_url_df['news_url'])

number_of_fetched_comments = 0
fetched_comments = []
new_junk_urls = []

for video_url in searched_videos_urls:
    if video_url in already_processed_urls:
        print(f'Comments for {video_url} already fetched. Skipping...')
        continue

    print(f'Fetching comments for {video_url}...')
    try:
        comments = fetcher.get_comments_for_video(video_url, country)
    except Exception as e:
        # Best-effort: keep going with the remaining videos, but report why
        # this one failed instead of discarding the error.
        # NOTE(review): every failure is recorded as a junk URL and skipped
        # on later runs; remove the URL from the junk file to retry it.
        print(f'Comments for {video_url} are not accessible. Skipping...')
        print(f'  Reason: {type(e).__name__}: {e}')
        new_junk_urls.append(video_url)
        continue

    if not comments:
        print('No comments found.')
        new_junk_urls.append(video_url)
        continue

    new_comments_df = pd.DataFrame(comments)
    new_comments_df['comment_text'] = new_comments_df['comment_text'].str.slice(0, 255)
    # Validate the dates, then store them back as 'YYYY-MM-DD' strings so new
    # rows use the same format as rows loaded from the CSV.
    new_comments_df['comment_timestamp'] = pd.to_datetime(new_comments_df['comment_timestamp']).dt.strftime('%Y-%m-%d')

    print(f'Number of comments found in {video_url}: {new_comments_df.shape[0]}')

    number_of_fetched_comments += new_comments_df.shape[0]
    fetched_comments.append(new_comments_df)

# Append all newly found junk URLs in one concat instead of once per video.
if new_junk_urls:
    no_comments_url_df = pd.concat([no_comments_url_df, pd.DataFrame({'news_url': new_junk_urls})], ignore_index=True)
no_comments_url_df.to_csv(no_comments_file, index=False)

if fetched_comments:
    print("Preview of new comments DataFrame:")
    all_new_comments_df = pd.concat(fetched_comments, ignore_index=True)
    print(all_new_comments_df.shape)
    print(all_new_comments_df.head())

    add_comments = input("Do you want to add these comments to the existing DataFrame? (yes/no): ")
    # Tolerate surrounding whitespace and the short form "y".
    if add_comments.strip().lower() in ('yes', 'y'):
        news_df = pd.concat([news_df, all_new_comments_df], ignore_index=True)
        news_df.to_csv(csv_file, index=False)
        print(f'Total number of new comments added: {number_of_fetched_comments}')
    else:
        print("No new comments added.")
else:
    print("No new comments found.")

# Re-read the file so de-duplication compares values exactly as stored on disk.
news_df = pd.read_csv(csv_file)
news_df = news_df.drop_duplicates()
print("Final dataframe shape:", news_df.shape)
news_df.to_csv(csv_file, index=False)
print("Duplicates removed, final DataFrame saved.")