In [None]:
!pip install google-api-python-client

In [None]:
import os

import pandas as pd
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from IPython.display import JSON


In [None]:
# Read the YouTube Data API key from the YOUTUBE_API_KEY environment variable
# so a real key never has to be saved inside the notebook. The placeholder is
# only used when the variable is not set.
api_key = os.environ.get('YOUTUBE_API_KEY', 'Your API Key')

In [None]:
# Channels to analyse, identified by their YouTube channel ID.
channel_ids = [
    "UCk3JZr7eS3pg5AGEvBdEvFg",  # Village cooking channel
    "UCiPmhfdCL06cSVTXKabF0Zg",  # Nakkalites
    "UC5EQWvy59VeHPJz8mDALPxg",  # Micset
    "UCY6KjrDBN_tIRFT_QNqQbRQ",  # Madan gowri
    "UCHGktfcQq2BY_8tGPHwvm7g",  # Madras Samayal
]

In [None]:
# Name and version of the Google API this notebook talks to.
api_service_name, api_version = "youtube", "v3"

# Create the API client, authenticating with the developer key.
youtube = build(api_service_name, api_version, developerKey=api_key)

In [None]:
# Get channel stats using channel_ids
def get_channel_stats(youtube, channel_ids):
    """Fetch basic statistics for a list of YouTube channels.

    Args:
        youtube: API client exposing ``channels().list(...).execute()``.
        channel_ids: Iterable of channel ID strings.

    Returns:
        pandas.DataFrame with one row per channel returned by the API and the
        columns ``channelName``, ``subscribers``, ``views``, ``totalVideos``
        and ``playlistId`` (the channel's "uploads" playlist). A statistic
        that is absent from a response item is stored as a missing value. The
        frame keeps these columns even when no channel is returned.
    """
    columns = ['channelName', 'subscribers', 'views', 'totalVideos', 'playlistId']
    batch_size = 50  # same batch size get_video_details uses for id lists

    channel_ids = list(channel_ids)  # allow any iterable, not only lists
    all_data = []

    for start in range(0, len(channel_ids), batch_size):
        request = youtube.channels().list(
            part="snippet,contentDetails,statistics",
            id=','.join(channel_ids[start:start + batch_size])
        )
        response = request.execute()

        # A response without matches may carry no 'items' key at all.
        for item in response.get('items', []):
            statistics = item.get('statistics', {})
            all_data.append({
                'channelName': item['snippet']['title'],
                # .get(): record a missing statistic instead of raising KeyError
                'subscribers': statistics.get('subscriberCount'),
                'views': statistics.get('viewCount'),
                'totalVideos': statistics.get('videoCount'),
                'playlistId': item['contentDetails']['relatedPlaylists']['uploads'],
            })

    return pd.DataFrame(all_data, columns=columns)


In [None]:
# Fetch the statistics for the selected channels and preview the first rows.
channel_stats = get_channel_stats(youtube, channel_ids)
channel_stats.head()

In [None]:
# get video ids using playlist ids of each channel
def get_video_ids(youtube, playlist_id):
    """Collect the ID of every video in a playlist, following all result pages.

    Args:
        youtube: API client exposing ``playlistItems().list(...).execute()``.
        playlist_id: ID of the playlist to read.

    Returns:
        list of video ID strings in the order the pages return them.
    """
    video_ids = []
    page_token = None

    while True:
        params = {
            'part': 'contentDetails',
            'playlistId': playlist_id,
            'maxResults': 50,
        }
        # Only follow-up requests carry a page token; the first one has none.
        if page_token is not None:
            params['pageToken'] = page_token

        response = youtube.playlistItems().list(**params).execute()
        video_ids.extend(
            entry['contentDetails']['videoId'] for entry in response['items']
        )

        # No 'nextPageToken' in the response means this was the last page.
        page_token = response.get('nextPageToken')
        if page_token is None:
            return video_ids

In [None]:
# Gather the video ids of every channel's uploads playlist into one flat list.
combined_video_ids = []
for playlist_id in channel_stats['playlistId']:
    video_ids = get_video_ids(youtube, playlist_id)
    combined_video_ids += video_ids

In [None]:
# Number of video ids collected across all channels.
len(combined_video_ids)

In [None]:
# get the video details using video ids
def get_video_details(youtube, combined_video_ids):
    """Fetch selected metadata and statistics for a list of videos.

    Args:
        youtube: API client exposing ``videos().list(...).execute()``.
        combined_video_ids: Iterable of video ID strings.

    Returns:
        pandas.DataFrame with one row per video returned by the API. Columns
        are ``video_id`` followed by the snippet fields (``channelTitle``,
        ``title``, ``publishedAt``, ``tags``, ``description``), the statistics
        fields (``viewCount``, ``likeCount``, ``commentCount``) and the
        contentDetails fields (``duration``, ``definition``, ``caption``).
        A field that is absent from a response item is stored as a missing
        value. The frame keeps these columns even when there are no rows.
    """
    # Response part -> fields copied from it (built once, not once per video).
    stats_to_keep = {
        'snippet': ['channelTitle', 'title', 'publishedAt', 'tags', 'description'],
        'statistics': ['viewCount', 'likeCount', 'commentCount'],
        'contentDetails': ['duration', 'definition', 'caption'],
    }
    columns = ['video_id'] + [
        field for fields in stats_to_keep.values() for field in fields
    ]
    batch_size = 50  # ids are requested in chunks of 50

    combined_video_ids = list(combined_video_ids)  # allow any iterable
    all_video_info = []

    for start in range(0, len(combined_video_ids), batch_size):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=','.join(combined_video_ids[start:start + batch_size])
        )
        response = request.execute()

        for video in response.get('items', []):
            video_info = {'video_id': video['id']}
            for part, fields in stats_to_keep.items():
                part_data = video.get(part) or {}
                for field in fields:
                    # Missing field -> None, without a catch-all except that
                    # would also hide unrelated errors.
                    video_info[field] = part_data.get(field)
            all_video_info.append(video_info)

    return pd.DataFrame(all_video_info, columns=columns)

In [None]:
# Build the per-video table for every collected video id.
video_df = get_video_details(youtube,combined_video_ids)

In [None]:
# Preview the first rows of the raw video table.
video_df.head()

In [None]:
def get_comments_in_videos(youtube, combined_video_ids):
    """Collect up to ten top-level comments for each video.

    Args:
        youtube: API client exposing ``commentThreads().list(...).execute()``.
        combined_video_ids: Iterable of video ID strings.

    Returns:
        pandas.DataFrame with the columns ``video_id`` and ``comments`` (a
        list of comment texts). Videos whose comments could not be fetched
        are skipped after a message is printed, so the frame may have fewer
        rows than there are ids. The columns exist even when it is empty.
    """
    all_comments = []

    for video_id in combined_video_ids:
        try:
            request = youtube.commentThreads().list(
                part="snippet,replies",
                videoId=video_id
            )
            response = request.execute()

            # Keep the text of at most the first ten returned comment threads.
            comments_in_video = [
                comment['snippet']['topLevelComment']['snippet']['textOriginal']
                for comment in response['items'][0:10]
            ]
        except Exception as err:
            # Best effort: skip the video (most likely its comments are
            # disabled) but report what went wrong. Catching Exception rather
            # than everything lets KeyboardInterrupt stop this long loop.
            print(f'Could not get comments for video {video_id}: {type(err).__name__}')
            continue

        all_comments.append({'video_id': video_id, 'comments': comments_in_video})

    return pd.DataFrame(all_comments, columns=['video_id', 'comments'])

In [None]:
# Fetch comments for every collected video (one API request per video).
comments_df = get_comments_in_videos(youtube, combined_video_ids)

In [None]:
# Preview the first rows of the comments table.
comments_df.head()

In [None]:
# Look at the video table once more before exporting it.
video_df.head()

In [None]:
 # Save the raw video table as CSV without the index column.
 # NOTE(review): absolute, machine-specific path; the write fails where this folder does not exist.
 video_df.to_csv("C:/Users/Vasu/Desktop/API projects/youtube_api/youtube_Video.csv", index = False)

In [None]:
 # Save the comments table as CSV without the index column.
 # NOTE(review): absolute, machine-specific path; the write fails where this folder does not exist.
 comments_df.to_csv("C:/Users/Vasu/Desktop/API projects/youtube_api/youtube_comments.csv", index = False)

In [None]:
# Which columns contain at least one missing value?
video_df.isna().any()

In [None]:
# Column dtypes before any type conversion.
video_df.dtypes

In [None]:
# Convert the count columns of the video table to numeric dtypes.
cols_to_convert = ['viewCount', 'likeCount', 'commentCount']
video_df[cols_to_convert] = video_df[cols_to_convert].apply(
    lambda column: pd.to_numeric(column)
)

In [None]:
# Column dtypes after the numeric conversion.
video_df.dtypes

In [None]:
# Parse the publishedAt strings into pandas datetimes.
video_df["publishedAt"] = pd.to_datetime(video_df.publishedAt)

In [None]:
# Split the publication timestamp into calendar date, clock time and weekday name.
video_df['date'], video_df['time'], video_df['day'] = (
    video_df['publishedAt'].dt.date,
    video_df['publishedAt'].dt.time,
    video_df['publishedAt'].dt.day_name(),
)

In [None]:
# Check the new date, time and day columns.
video_df.head()

In [None]:
# Remove the raw publishedAt column from the video table.
# errors='ignore' keeps this cell re-runnable: without it a second run raises
# KeyError because the column is already gone.
video_df.drop(columns=['publishedAt'], inplace=True, errors='ignore')

In [None]:
# Confirm that publishedAt is no longer part of the table.
video_df.head()

In [None]:
!pip install isodate

In [None]:
import isodate

In [None]:
# parsing the youtube duration data into minutes and seconds using 'isodate' library
def parse_duration(duration):
    """Convert an ISO 8601 duration string into a "<minutes>m <seconds>s" label.

    Args:
        duration: Duration string understood by ``isodate.parse_duration``
            (e.g. ``"PT4M13S"``), or a missing value (``None`` / NaN).

    Returns:
        str such as ``"4m 13s"``; minutes are the total number of whole
        minutes and are not capped at 59. ``None`` when the input is missing.
    """
    # get_video_details stores None for absent fields; pass missing values
    # through instead of letting the parser raise on them.
    if pd.isna(duration):
        return None

    total_seconds = int(isodate.parse_duration(duration).total_seconds())
    minutes, seconds = divmod(total_seconds, 60)
    return f"{minutes}m {seconds}s"

In [None]:
# Replace each raw duration with its "<minutes>m <seconds>s" label.
video_df['duration'] = video_df['duration'].map(parse_duration)

In [None]:
# Check the reformatted duration column.
video_df.head()

In [None]:
# calculating the number of tags
def count_tags(tags):
    """Return how many tags a video has, treating ``None`` as zero tags."""
    return 0 if tags is None else len(tags)

In [None]:
# Number of tags attached to each video.
video_df['tag_count'] = video_df['tags'].map(count_tags)

In [None]:
# Check the new tag_count column.
video_df.head()

In [None]:
# Length of each video title in characters. The vectorised .str.len() yields
# a missing value for a missing title, where calling len() on it would raise.
video_df['title_length'] = video_df['title'].str.len()

In [None]:
# Check the new title_length column.
video_df.head()

In [None]:
# Show the channel-level table (one row per channel).
channel_stats

In [None]:
# Convert the count columns of the channel table to numeric dtypes.
cols_to_convert = ['subscribers', 'views', 'totalVideos']
channel_stats[cols_to_convert] = channel_stats[cols_to_convert].apply(
    lambda column: pd.to_numeric(column)
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

In [None]:
# Bar chart of subscribers per channel, largest first, with the y axis in thousands.
sns.set(rc={'figure.figsize': (10, 8)})
ax = sns.barplot(
    data=channel_stats.sort_values('subscribers', ascending=False),
    x='channelName',
    y='subscribers',
    palette='viridis',
)
ax.yaxis.set_major_formatter(
    FuncFormatter(lambda value, _pos: f'{int(value/1000)}K')
)
# Assigning the result keeps the list of tick labels out of the cell output.
plot = ax.set_xticklabels(ax.get_xticklabels(), rotation=90)

In [None]:
# Bar chart of total views per channel, largest first, with the y axis in thousands.
ax = sns.barplot(
    data=channel_stats.sort_values('views', ascending=False),
    x='channelName',
    y='views',
    palette='viridis',
)
ax.yaxis.set_major_formatter(
    FuncFormatter(lambda value, _pos: f'{int(value/1000)}K')
)
# Assigning the result keeps the list of tick labels out of the cell output.
plot = ax.set_xticklabels(ax.get_xticklabels(), rotation=90)

In [None]:
# Bar chart of the number of uploaded videos per channel, largest first.
ax = sns.barplot(x = 'channelName',
            y = 'totalVideos',
            data = channel_stats.sort_values('totalVideos', ascending = False),
            palette = 'viridis')
# Show plain counts with thousands separators. The previous int(x/1000) "K"
# format truncated every tick below 1000 to "0K", so ticks repeated the same
# label and the axis could not be read.
ax.yaxis.set_major_formatter(FuncFormatter(lambda x, pos: f'{x:,.0f}'))
# Assigning the result keeps the list of tick labels out of the cell output.
plot = ax.set_xticklabels(ax.get_xticklabels(),rotation = 90)

In [None]:
# video_df.to_csv("C:/Users/Vasu/Desktop/API projects/youtube_api/youtube_Video_processed.csv", index = False)

In [None]:
# channel_stats.to_csv("C:/Users/Vasu/Desktop/API projects/youtube_api/channel_stats.csv", index = False)