In [2]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from pymongo import MongoClient
from datetime import datetime, timedelta
import re

def service(api_key):
    """Build a YouTube Data API v3 client.

    Args:
        api_key: Developer key passed to ``build`` as ``developerKey``.

    Returns:
        The client object returned by ``googleapiclient.discovery.build``.
    """
    return build('youtube', 'v3', developerKey=api_key)


def get_channel_data(youtube, channel_id):
    """Fetch summary metadata for a single YouTube channel.

    Args:
        youtube: API client (as returned by ``service``); only
            ``youtube.channels().list(...).execute()`` is used.
        channel_id: ID of the channel to look up.

    Returns:
        A dict with the keys ``channel_name``, ``channel_id``,
        ``subscription_count``, ``channel_views``, ``video_count``,
        ``channel_description`` and ``playlist_id`` (the channel's uploads
        playlist), or ``None`` when the request raises ``HttpError`` or the
        response contains no channel.
    """
    try:
        response = youtube.channels().list(
            part='snippet, statistics, contentDetails, brandingSettings',
            id=channel_id
        ).execute()
    except HttpError as e:
        print(f"An error occurred: {e}")
        return None

    # A response without items means the id matched no channel; report it
    # instead of failing with KeyError/IndexError.
    items = response.get('items')
    if not items:
        print(f"No channel found for id: {channel_id}")
        return None

    channel = items[0]
    statistics = channel['statistics']
    branding = channel.get('brandingSettings', {}).get('channel', {})

    return {
        'channel_name': channel['snippet']['title'],
        'channel_id': channel_id,
        # Counters may be omitted from the response (e.g. a channel that
        # hides its subscriber count), so fall back to 0 instead of raising.
        'subscription_count': int(statistics.get('subscriberCount', 0)),
        'channel_views': int(statistics.get('viewCount', 0)),
        'video_count': int(statistics.get('videoCount', 0)),
        'channel_description': branding.get('description',
                                            'No description available'),
        'playlist_id': channel['contentDetails']['relatedPlaylists']['uploads'],
    }

# ISO 8601 duration as found in ``contentDetails.duration``. The optional day
# component covers videos of 24 hours or longer (e.g. ``P1DT2H3M4S``).
# Compiled once at module level instead of once per video.
_ISO_DURATION_RE = re.compile(
    r'P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?'
)

# Accepted timestamp layouts: whole seconds, then fractional seconds.
_TIMESTAMP_FORMATS = ("%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S.%fZ")


def _parse_api_timestamp(value):
    """Parse a ``...Z`` timestamp string into a naive ``datetime``.

    Raises:
        ValueError: if ``value`` matches none of the accepted formats.
    """
    for fmt in _TIMESTAMP_FORMATS:
        try:
            return datetime.strptime(value, fmt)
        except ValueError:
            continue
    raise ValueError(f"Unrecognised timestamp format: {value!r}")


def _format_duration(iso_duration):
    """Convert an ISO 8601 duration to "hr:min:sec" ("00:00:00" if unparseable)."""
    match = _ISO_DURATION_RE.match(iso_duration)
    if not match:
        return "00:00:00"
    days, hours, minutes, seconds = (int(g) if g else 0 for g in match.groups())
    # Fold days into hours so long videos are not reported as zero length.
    hours += days * 24
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"


def _fetch_comments(youtube, video_id):
    """Return the first page (up to 50) of top-level comments, keyed by comment id.

    Returns an empty dict when the error text contains ``commentsDisabled``;
    any other ``HttpError`` is re-raised for the caller to handle.
    """
    try:
        response = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=50
        ).execute()
    except HttpError as e:
        if 'commentsDisabled' in str(e):
            return {}
        raise

    comments = {}
    for thread in response.get('items', []):
        top_level = thread['snippet']['topLevelComment']['snippet']
        comment_id = thread['id']
        comments[comment_id] = {
            'comment_id': comment_id,
            'comment_text': top_level['textDisplay'],
            'comment_author': top_level['authorDisplayName'],
            'comment_published_at': _parse_api_timestamp(top_level['publishedAt']),
        }
    return comments


def _build_video_record(youtube, item):
    """Turn one ``videos().list`` item into the stored video dict."""
    snippet = item['snippet']
    statistics = item['statistics']
    content_details = item['contentDetails']
    video_id = item['id']

    return {
        'video_id': video_id,
        'video_name': snippet['title'],
        'video_description': snippet['description'],
        'tags': snippet.get('tags', []),
        'published_at': _parse_api_timestamp(snippet['publishedAt']),
        # Counters arrive as strings and may be missing; normalise to int so
        # the stored type does not depend on whether the field was present.
        'view_count': int(statistics.get('viewCount', 0)),
        'like_count': int(statistics.get('likeCount', 0)),
        'favorite_count': int(statistics.get('favoriteCount', 0)),
        'comment_count': int(statistics.get('commentCount', 0)),
        'duration': _format_duration(content_details['duration']),
        'thumbnail': snippet['thumbnails']['default']['url'],
        'caption_status': content_details.get('caption', False),
        'comments': _fetch_comments(youtube, video_id),
    }


def vc_data(youtube, playlist_id):
    """Collect video details and comments for every video in a playlist.

    The playlist is read in pages of 50 items. For each page the video
    details are requested in one batch, then the first page of comment
    threads is requested per video.

    Args:
        youtube: API client (as returned by ``service``); uses
            ``playlistItems()``, ``videos()`` and ``commentThreads()``.
        playlist_id: ID of the playlist to walk.

    Returns:
        A dict mapping video id to a dict with the keys ``video_id``,
        ``video_name``, ``video_description``, ``tags``, ``published_at``
        (naive ``datetime``), ``view_count``, ``like_count``,
        ``favorite_count``, ``comment_count`` (ints), ``duration``
        ("hr:min:sec"), ``thumbnail``, ``caption_status`` and ``comments``.
        The dict is empty when the playlist has no items. ``None`` is
        returned when any request raises ``HttpError`` other than the
        comments-disabled case.
    """
    try:
        video_data = {}
        next_page_token = None

        while True:
            response = youtube.playlistItems().list(
                part='contentDetails',
                playlistId=playlist_id,
                maxResults=50,
                pageToken=next_page_token
            ).execute()

            video_ids = [
                entry['contentDetails']['videoId']
                for entry in response.get('items', [])
            ]

            # Skip the details request for an empty page; an empty id list
            # is not a meaningful query.
            if video_ids:
                details = youtube.videos().list(
                    part="snippet, statistics, contentDetails",
                    id=','.join(video_ids),
                    maxResults=50
                ).execute()

                for item in details.get('items', []):
                    video_data[item['id']] = _build_video_record(youtube, item)

            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break

        return video_data

    except HttpError as e:
        print(f"An error occurred: {e}")
        return None


def store_youtube_data(channel_id, channel_data, video_data):
    """Insert a channel and its videos into MongoDB unless already stored.

    Connects to ``mongodb://localhost:27017/`` and writes one document to the
    ``channels`` collection of the ``youtube_data_lake`` database. Nothing is
    written when a document with the same ``channel_id`` already exists.

    Args:
        channel_id: Channel ID used as the existence check and stored on the
            document.
        channel_data: Channel metadata dict; it is copied, not modified.
        video_data: Per-video data stored under the ``videos`` key.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Build a fresh document so the caller's dict is left untouched
    # (insert_one also adds an ``_id`` to the dict it is given).
    document = {**channel_data, 'channel_id': channel_id, 'videos': video_data}

    # The context manager closes the client's connections on every exit
    # path, including the early return and any exception.
    with MongoClient('mongodb://localhost:27017/') as client:
        channels = client['youtube_data_lake'].channels

        if channels.find_one({'channel_id': channel_id}):
            print("Channel data already exists in MongoDB.")
            return

        channels.insert_one(document)

    print("Data stored in MongoDB successfully!")


def main(channel_id, api_key):
    """Fetch one channel's data from the YouTube API and store it in MongoDB.

    Args:
        channel_id: ID of the channel to harvest.
        api_key: Developer key used to build the API client.

    Nothing is stored when the channel lookup or the video lookup fails
    (both signal failure by returning ``None``).
    """
    youtube = service(api_key)

    channel_data = get_channel_data(youtube, channel_id)
    if not channel_data:
        return

    video_data = vc_data(youtube, channel_data['playlist_id'])
    # Only None means the fetch failed; an empty dict is a valid result for a
    # playlist without videos and the channel should still be stored.
    if video_data is None:
        return

    store_youtube_data(channel_id, channel_data, video_data)


if __name__ == '__main__':
    import os
    from getpass import getpass

    # Do not keep the API key in the source: read it from the environment and
    # fall back to a hidden prompt. The key that used to be hard-coded here
    # should be treated as leaked and revoked.
    api_key = os.environ.get('YOUTUBE_API_KEY') or getpass('Enter YouTube API key: ')
    # Strip stray whitespace from pasted ids so the lookup does not fail on it.
    channel_id = input('Enter input: ').strip()
    main(channel_id, api_key)

Enter input: UCAKknX9QPzMG-PPgcyBURXg
Channel data already exists in MongoDB.
