# Thai YouTube Comments Fetching Notebook

This notebook demonstrates how to fetch Thai YouTube comments from videos related to COVID-19 mask-wearing using various methods:
1. Fetching videos by keyword
2. Fetching videos by playlist URL
3. Fetching videos by YouTube channel

## Setup

### Import Libraries

In [None]:
import os
import re
import pandas as pd
from googleapiclient.discovery import build
from pythainlp.tokenize import word_tokenize

## Define YouTube Comments Fetcher Class

In [None]:
class YouTubeCommentsFetcher:
    """Fetch videos and mask-related comments through the YouTube Data API v3.

    The class is a process-wide singleton: every call to
    ``YouTubeCommentsFetcher(api_key)`` returns the same object. The API
    client is (re)built only when no instance exists yet or when a different
    ``api_key`` is supplied, so re-running a notebook cell with a corrected
    key takes effect.
    """

    _instance = None

    # Comment keywords used when ``country`` is not "TH" (matched as
    # case-insensitive substrings).
    _DEFAULT_MASK_KEYWORDS = ("mask", "wear mask")

    # Comment keywords used when ``country == "TH"`` (matched against the
    # tokens produced by pythainlp's ``word_tokenize``).
    _THAI_MASK_KEYWORDS = (
        "หน้ากาก", "มาส์ก", "ผ้าปิดหน้า", "แมส",
        "มาส์ค", "การสวมหน้ากาก", "หน้ากากอนามัย", "แมสก์",
    )

    # A search result is kept by ``search_videos`` only when its channel
    # title contains one of these fragments (case-insensitive).
    _NEWS_CHANNEL_KEYWORDS = (
        "ข่าว", "top news", "spring news", "nationtv", "MCOT", "ช่อง",
        "thairath", "TNN", "Thai PBS", "เรื่องเล่า", "matichon tv",
        "สำนักข่าว", "voice", "amarin", "mono29", "ch7hd", "pptv",
        "Bright TV", "one31",
    )

    def __new__(cls, api_key: str):
        """Return the shared instance, building the API client when needed.

        Args:
            api_key: YouTube Data API developer key.
        """
        instance = cls._instance
        is_new = instance is None
        if is_new:
            instance = super().__new__(cls)

        if is_new or instance.api_key != api_key:
            # Build the client first: if ``build`` raises, the previously
            # published singleton (if any) is left untouched and no
            # half-initialised instance is ever stored on the class.
            client = build('youtube', 'v3', developerKey=api_key)
            instance.api_key = api_key
            instance.youtube = client
            cls._instance = instance
        return instance

    def _extract_playlist_id(self, url):
        """Return the value of the ``list=`` parameter in *url*, or ``None``."""
        match = re.search(r'(?:list=)([a-zA-Z0-9_-]+)', url)
        return match.group(1) if match else None

    def _extract_video_id(self, url):
        """Return the video id found in *url*, or ``None``.

        The ``v=`` query parameter is tried first; ``youtu.be/<id>`` short
        links are accepted as a fallback.
        """
        match = re.search(r'(?:v=)([a-zA-Z0-9_-]+)', url)
        if match is None:
            match = re.search(r'youtu\.be/([a-zA-Z0-9_-]+)', url)
        return match.group(1) if match else None

    @staticmethod
    def _contains_mask_keywords(text, country, mask_keywords):
        """Tell whether *text* mentions any of *mask_keywords*.

        For ``country == "TH"`` the text is tokenised with pythainlp
        (``newmm`` engine) and a keyword must equal one of the tokens.
        For any other country a case-insensitive substring test is used.
        """
        if country == "TH":
            tokens = word_tokenize(text, engine='newmm', keep_whitespace=False)
            return any(keyword in tokens for keyword in mask_keywords)
        lowered = str(text).lower()
        return any(keyword in lowered for keyword in mask_keywords)

    def get_video_title(self, video_id):
        """Return the title of *video_id*, or ``None`` if the API returns no item."""
        video_request = self.youtube.videos().list(
            part="snippet",
            id=video_id
        )
        video_response = video_request.execute()
        return video_response['items'][0]['snippet']['title'] if video_response['items'] else None

    def get_all_video_ids_from_playlists(self, playlist_urls):
        """Collect the video ids of every item in the given playlists.

        Args:
            playlist_urls: Iterable of playlist URLs. URLs without a
                ``list=`` parameter are silently ignored.

        Returns:
            A list of video ids, in playlist order, across all playlists.
        """
        playlist_ids = [self._extract_playlist_id(url) for url in playlist_urls]
        playlist_ids = [pid for pid in playlist_ids if pid]
        all_videos = []

        for playlist_id in playlist_ids:
            next_page_token = None
            while True:
                playlist_request = self.youtube.playlistItems().list(
                    part='contentDetails',
                    playlistId=playlist_id,
                    maxResults=50,
                    pageToken=next_page_token
                )
                playlist_response = playlist_request.execute()

                all_videos += [item['contentDetails']['videoId'] for item in playlist_response['items']]
                next_page_token = playlist_response.get('nextPageToken')

                if not next_page_token:
                    break

        return all_videos

    def get_video_details(self, video_id):
        """Return ``(title, channel_title, published_date)`` for *video_id*.

        The third element is the first 10 characters of the snippet's
        ``publishedAt`` value. Returns ``(None, None, None)`` when the API
        returns no item for the id.
        """
        video_request = self.youtube.videos().list(
            part="snippet",
            id=video_id
        )
        video_response = video_request.execute()
        if video_response['items']:
            snippet = video_response['items'][0]['snippet']
            video_title = snippet['title']
            news_publisher = snippet['channelTitle']
            # NOTE: despite the name, this holds the first 10 characters of
            # ``publishedAt`` (a date prefix), not only the year.
            event_year = snippet['publishedAt'][:10]
            return video_title, news_publisher, event_year
        return None, None, None

    def get_replies(self, parent_id, video_id, video_title, country,
                    news_publisher, event_year, video_url, mask_keywords):
        """Fetch the replies to one top-level comment that mention a mask keyword.

        Args:
            parent_id: Id of the top-level comment whose replies are fetched.
            video_id: Id of the video; accepted for call-site symmetry and
                not used by the request.
            video_title: Stored as ``news_title`` in every record.
            country: Stored as ``country``; also selects the matching mode
                (see ``_contains_mask_keywords``).
            news_publisher: Stored as ``news_publisher``.
            event_year: Stored as ``event_year``.
            video_url: Stored as ``news_url``.
            mask_keywords: Keywords a reply must mention to be kept.

        Returns:
            A list of dicts with the same keys as the records produced by
            ``get_comments_for_video``.
        """
        replies = []
        next_page_token = None

        while True:
            reply_request = self.youtube.comments().list(
                part="snippet",
                parentId=parent_id,
                pageToken=next_page_token,
                textFormat="plainText",
                maxResults=100
            )
            reply_response = reply_request.execute()

            for item in reply_response['items']:
                reply = item['snippet']
                reply_text = reply['textDisplay']

                if not self._contains_mask_keywords(reply_text, country, mask_keywords):
                    continue

                replies.append({
                    'country': country,
                    'news_publisher': news_publisher,
                    'event_year': event_year,
                    'news_url': video_url,
                    'news_title': video_title,
                    'comment_text': reply_text,
                    'comment_timestamp': reply['publishedAt']
                })

            next_page_token = reply_response.get('nextPageToken')
            if not next_page_token:
                break

        return replies

    def get_comments_for_video(self, video_url, country, include_replies=False):
        """Return the mask-related comments of one video.

        Args:
            video_url: URL of the video; stored verbatim as ``news_url``.
            country: Country code stored in each record. ``"TH"`` switches
                to the Thai keyword list and token-based matching.
            include_replies: When true, replies to each *matching*
                top-level comment are fetched and filtered as well.

        Returns:
            A list of dicts with keys ``country``, ``news_publisher``,
            ``event_year``, ``news_url``, ``news_title``, ``comment_text``
            and ``comment_timestamp``.

        Raises:
            ValueError: If no video id can be extracted from *video_url*
                or the video details cannot be retrieved.
        """
        video_id = self._extract_video_id(video_url)
        if not video_id:
            raise ValueError("Invalid video URL")

        video_title, news_publisher, event_year = self.get_video_details(video_id)
        if not video_title:
            raise ValueError("Unable to retrieve video details")

        if country == "TH":
            mask_keywords = list(self._THAI_MASK_KEYWORDS)
        else:
            mask_keywords = list(self._DEFAULT_MASK_KEYWORDS)

        all_comments = []
        next_page_token = None

        while True:
            comment_request = self.youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                pageToken=next_page_token,
                textFormat="plainText",
                maxResults=100
            )
            comment_response = comment_request.execute()

            for item in comment_response['items']:
                thread = item['snippet']
                top_comment = thread['topLevelComment']['snippet']
                comment_text = top_comment['textDisplay']

                if not self._contains_mask_keywords(comment_text, country, mask_keywords):
                    continue

                all_comments.append({
                    'country': country,
                    'news_publisher': news_publisher,
                    'event_year': event_year,
                    'news_url': video_url,
                    'news_title': video_title,
                    'comment_text': comment_text,
                    'comment_timestamp': top_comment['publishedAt']
                })

                if include_replies and thread['totalReplyCount'] > 0:
                    all_comments.extend(self.get_replies(
                        thread['topLevelComment']['id'], video_id, video_title,
                        country, news_publisher, event_year, video_url,
                        mask_keywords
                    ))

            next_page_token = comment_response.get('nextPageToken')
            if not next_page_token:
                break

        return all_comments

    def search_videos(self, keyword: str, country: str):
        """Search videos by *keyword* and keep those from news-like channels.

        Args:
            keyword: Search query.
            country: Value passed as the search ``regionCode``.

        Returns:
            A list of dicts with keys ``video_id``, ``video_title`` and
            ``video_url`` for results whose channel title contains one of
            ``_NEWS_CHANNEL_KEYWORDS`` (case-insensitive).
        """
        all_videos = []
        next_page_token = None

        while True:
            search_request = self.youtube.search().list(
                part="snippet",
                q=keyword,
                regionCode=country,
                type="video",
                maxResults=50,
                pageToken=next_page_token
            )
            search_response = search_request.execute()
            for item in search_response['items']:
                video_id = item['id']['videoId']
                video_title = item['snippet']['title']
                channel_title_lower = item['snippet']['channelTitle'].lower()

                if any(fragment.lower() in channel_title_lower for fragment in self._NEWS_CHANNEL_KEYWORDS):
                    video_url = f"https://www.youtube.com/watch?v={video_id}"
                    all_videos.append({'video_id': video_id, 'video_title': video_title, 'video_url': video_url})

            next_page_token = search_response.get('nextPageToken')
            if not next_page_token:
                break

        return all_videos

    def get_channel_id_by_username(self, username):
        """Return the channel id of the first channel search hit for *username*.

        Returns ``None`` when the search yields no items.
        """
        request = self.youtube.search().list(
            part='snippet',
            q=username,
            type='channel'
        )
        response = request.execute()
        if response['items']:
            return response['items'][0]['snippet']['channelId']
        return None

    def get_all_videos_from_channel(self, channel_id):
        """Return every video search result (raw API items) for *channel_id*."""
        videos = []
        next_page_token = None

        while True:
            search_request = self.youtube.search().list(
                part="snippet",
                channelId=channel_id,
                maxResults=50,
                pageToken=next_page_token,
                type="video"
            )
            search_response = search_request.execute()
            videos += search_response['items']
            next_page_token = search_response.get('nextPageToken')

            if not next_page_token:
                break

        return videos

    def search_videos_by_channel_and_keyword(self, channel_id, keyword):
        """Return the channel's videos whose title contains *keyword*.

        The match is a case-insensitive substring test on the full
        *keyword* string.

        Returns:
            A list of dicts with keys ``video_id``, ``video_title`` and
            ``video_url``.
        """
        videos = self.get_all_videos_from_channel(channel_id)
        filtered_videos = []

        for video in videos:
            video_title = video['snippet']['title']
            if keyword.lower() in video_title.lower():
                video_id = video['id']['videoId']
                video_url = f"https://www.youtube.com/watch?v={video_id}"
                filtered_videos.append({'video_id': video_id, 'video_title': video_title, 'video_url': video_url})

        return filtered_videos

### Initialize Fetcher

In [None]:
# Prefer a key supplied through the YOUTUBE_API_KEY environment variable so
# the real key never has to be saved inside the notebook; the placeholder is
# kept as the fallback for users who edit the value in place.
api_key = os.environ.get('YOUTUBE_API_KEY', 'YOUR_API_KEY')
fetcher = YouTubeCommentsFetcher(api_key)

## Fetching Videos by Keyword

In [None]:
# Search query (Thai for "hygienic face mask") and the region to search in.
keyword = "หน้ากากอนามัย"
country = "TH"

# Keep only the distinct watch URLs of the news-channel videos found.
videos_by_keyword = fetcher.search_videos(keyword, country)
searched_videos_urls = {entry['video_url'] for entry in videos_by_keyword}

## Fetching Videos by Playlist URL

In [None]:
# Playlists whose videos will be scanned for comments.
playlist_urls = ['https://www.youtube.com/playlist?list=PLRS4T4F2sF1q9BL2_p0xNnDrkUlibC3Rd']

# Only the first 100 video ids across the playlists are kept.
all_video_ids = fetcher.get_all_video_ids_from_playlists(playlist_urls)
all_video_ids = all_video_ids[:100]
all_video_urls = [f"https://www.youtube.com/watch?v={vid}" for vid in all_video_ids]

In [None]:
# Load the accumulated comments CSV (creating an empty one on first run),
# fetch mask-related comments for every playlist video not yet present in
# it, and append them after an interactive confirmation.
csv_file = 'mask_comments_th.csv'
if os.path.exists(csv_file):
    news_df = pd.read_csv(csv_file)
    news_urls = news_df['news_url'].tolist()
else:
    # The column names must match the keys of the records returned by
    # get_comments_for_video (which uses 'event_year'); a differently named
    # column here would stay empty forever once real rows are concatenated.
    news_df = pd.DataFrame(columns=['country', 'news_publisher', 'event_year', 'news_url', 'news_title', 'comment_text', 'comment_timestamp'])
    news_df.to_csv(csv_file, index=False)
    news_urls = []

# Set copy of the already-fetched URLs for constant-time membership tests.
fetched_urls = set(news_urls)

number_of_new_comments = 0
new_comments_dfs = []

for video_url in all_video_urls:
    if video_url in fetched_urls:
        print(f'Comments for {video_url} already fetched.')
        continue

    print(f'Fetching comments for {video_url}...')
    try:
        comments = fetcher.get_comments_for_video(video_url, 'TH')
    except Exception as e:
        # Best-effort crawl: report why this video failed, then move on.
        print(f'Comments for {video_url} are not accessible ({e}). Skipping...')
        continue

    if not comments:
        print('No comments found.')
        continue

    new_comments_df = pd.DataFrame(comments)
    # Comment text is truncated to its first 255 characters.
    new_comments_df['comment_text'] = new_comments_df['comment_text'].str.slice(0, 255)
    new_comments_df['comment_timestamp'] = pd.to_datetime(new_comments_df['comment_timestamp'])

    print(f'Number of comments found in {video_url}: {new_comments_df.shape[0]}')

    number_of_new_comments += new_comments_df.shape[0]
    new_comments_dfs.append(new_comments_df)

if new_comments_dfs:
    print("Preview of new comments DataFrame:")
    all_new_comments_df = pd.concat(new_comments_dfs)
    print(all_new_comments_df.shape)
    print(all_new_comments_df.head())

    add_comments = input("Do you want to add these comments to the existing DataFrame? (yes/no): ")
    # Tolerate surrounding whitespace and any letter case in the answer.
    if add_comments.strip().lower() == 'yes':
        news_df = pd.concat([news_df, all_new_comments_df], ignore_index=True)
        news_df.to_csv(csv_file, index=False)
        print(f'Total number of new comments added: {number_of_new_comments}')
    else:
        print("No new comments added.")
else:
    print("No new comments found.")

# Re-read the file, drop exact duplicate rows, and write it back.
thai_comments = pd.read_csv(csv_file)
thai_comments = thai_comments.drop_duplicates()
thai_comments.to_csv(csv_file, index=False)
thai_comments

## Fetching Videos by YouTube Channel

In [None]:
# Resolve the channel handle to a channel id via a channel search.
channel_username = '@ThaiPBS'
channel_id = fetcher.get_channel_id_by_username(channel_username)
if channel_id is None:
    # Stop here: a missing id would otherwise be passed on as channelId=None
    # to the channel video search in the following cells.
    raise ValueError(f"No channel found for {channel_username}")
print(f"Channel ID for {channel_username}: {channel_id}")

In [None]:
# Title filter (Thai for "COVID mask"); matched as one case-insensitive
# substring of each video title.
keyword = 'โควิด หน้ากาก'
search_results = fetcher.search_videos_by_channel_and_keyword(channel_id, keyword)

# Distinct watch URLs of the matching channel videos.
thai_news_urls = {hit['video_url'] for hit in search_results}

In [None]:
# Load the accumulated comments CSV (creating an empty one on first run),
# fetch mask-related comments for every channel video not yet present in
# it, and append them after an interactive confirmation.
csv_file = 'mask_comments_th.csv'
if os.path.exists(csv_file):
    news_df = pd.read_csv(csv_file)
    news_urls = news_df['news_url'].tolist()
else:
    # The column names must match the keys of the records returned by
    # get_comments_for_video (which uses 'event_year'); a differently named
    # column here would stay empty forever once real rows are concatenated.
    news_df = pd.DataFrame(columns=['country', 'news_publisher', 'event_year', 'news_url', 'news_title', 'comment_text', 'comment_timestamp'])
    news_df.to_csv(csv_file, index=False)
    news_urls = []

# Set copy of the already-fetched URLs for constant-time membership tests.
fetched_urls = set(news_urls)

number_of_new_comments = 0
new_comments_dfs = []

for video_url in thai_news_urls:
    if video_url in fetched_urls:
        print(f'Comments for {video_url} already fetched.')
        continue

    print(f'Fetching comments for {video_url}...')
    try:
        comments = fetcher.get_comments_for_video(video_url, 'TH')
    except Exception as e:
        # Best-effort crawl: report why this video failed, then move on.
        print(f'Comments for {video_url} are not accessible ({e}). Skipping...')
        continue

    if not comments:
        print('No comments found.')
        continue

    new_comments_df = pd.DataFrame(comments)
    # Comment text is truncated to its first 255 characters.
    new_comments_df['comment_text'] = new_comments_df['comment_text'].str.slice(0, 255)
    new_comments_df['comment_timestamp'] = pd.to_datetime(new_comments_df['comment_timestamp'])

    print(f'Number of comments found in {video_url}: {new_comments_df.shape[0]}')

    number_of_new_comments += new_comments_df.shape[0]
    new_comments_dfs.append(new_comments_df)

if new_comments_dfs:
    print("Preview of new comments DataFrame:")
    all_new_comments_df = pd.concat(new_comments_dfs)
    print(all_new_comments_df.shape)
    print(all_new_comments_df.head())

    add_comments = input("Do you want to add these comments to the existing DataFrame? (yes/no): ")
    # Tolerate surrounding whitespace and any letter case in the answer.
    if add_comments.strip().lower() == 'yes':
        news_df = pd.concat([news_df, all_new_comments_df], ignore_index=True)
        news_df.to_csv(csv_file, index=False)
        print(f'Total number of new comments added: {number_of_new_comments}')
    else:
        print("No new comments added.")
else:
    print("No new comments found.")

# Re-read the file, drop exact duplicate rows, and write it back.
thai_comments = pd.read_csv(csv_file)
thai_comments = thai_comments.drop_duplicates()
thai_comments.to_csv(csv_file, index=False)
thai_comments