### Import Libraries

In [None]:
# Import libraries
from pytube import YouTube
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from pprint import pprint
from tabulate import tabulate

import speech_recognition as sr
from moviepy.editor import VideoFileClip
import matplotlib.pyplot as plt
import seaborn as sns
from docx import Document
from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS
from functools import partial, reduce

import pandas as pd
import json
import isodate
import numpy as np

### Get API Key - Initialize YouTube Library

In [None]:
# Google Cloud API key used to authorize the YouTube Data API calls below.
# Create one in the Google Cloud Console - https://console.cloud.google.com/
API_KEY = ''

# Service object through which every YouTube Data API v3 request is issued
youtube = build(serviceName='youtube', version='v3', developerKey=API_KEY)

### About YouTube Channel

In [None]:
# Channel whose profile and statistics are requested
channelId = 'UCUMZ7gohGI9HcU9VNsr2FJQ'

# Ask for the snippet, statistics and contentDetails parts of that channel
channel_query = {
    "part": "snippet, statistics, contentDetails",
    "id": channelId,
}
about_search_request = youtube.channels().list(**channel_query)
about_search_response = about_search_request.execute()

# Dump the raw response as indented JSON for inspection
print(json.dumps(about_search_response, indent=4))

In [None]:
# One summary record per channel in the response; the three statistics
# counters are cast to int so the resulting columns are numeric.
channels_list = [
    {
        "channel_id": channel['id'],
        "title": channel['snippet']['title'],
        "viewCount": int(channel['statistics']['viewCount']),
        "subscriberCount": int(channel['statistics']['subscriberCount']),
        "videoCount": int(channel['statistics']['videoCount']),
    }
    for channel in about_search_response['items']
]

# Tabular view of the channel summary
channels_list_df = pd.DataFrame(channels_list)
channels_list_df

### Search Video List Using Channel Id

In [None]:
# Look up videos published by a single channel

# Bloomberg Originals
channelId = 'UCUMZ7gohGI9HcU9VNsr2FJQ'

# Snippet part only, restricted to video results, at most 50 per call
search_params = dict(part="snippet", maxResults=50, channelId=channelId, type="video")
search_request = youtube.search().list(**search_params)
search_response = search_request.execute()

# Dump the raw search response as indented JSON for inspection
print(json.dumps(search_response, indent=4))


In [None]:
# Flatten each search result into one record per video
videos_list = []
for result in search_response['items']:
    snippet = result['snippet']
    videos_list.append({
        # .get() leaves video_id as None when the result carries no 'videoId'
        "video_id": result['id'].get('videoId'),
        "title": snippet['title'],
        "channel_title": snippet['channelTitle'],
        "description": snippet['description'],
        "publishedAt": snippet['publishedAt'],
    })

# Tabular view of the searched videos
videos_list_df = pd.DataFrame(videos_list)
videos_list_df



### Extract Video Ids From Searched Videos

In [None]:
# Collect the ids of the searched videos. A search result without a 'videoId'
# is recorded with video_id None; those entries are skipped here because the
# ids are later joined into one comma-separated string for the videos().list
# calls, and str.join() raises TypeError on None.
video_ids = [
    video['video_id']
    for video in videos_list
    if video['video_id'] is not None
]
print(video_ids)

In [None]:
# Request the contentDetails part (duration, dimension, definition,
# projection) for all searched videos in a single call.
# NOTE(review): the videos.list reference describes maxResults as not
# supported together with the id filter - confirm whether it can be dropped.
content_details_params = {
    "part": "contentDetails",
    "maxResults": 50,
    "id": ','.join(video_ids),
}
content_details_request = youtube.videos().list(**content_details_params)
content_details_response = content_details_request.execute()

# Dump the raw response as indented JSON for inspection
print(json.dumps(content_details_response, indent=4))


In [None]:
# Flatten the contentDetails of each video into one record per video
videos_duration_list = []
for item in content_details_response['items']:
    details = item['contentDetails']
    videos_duration_list.append({
        "video_id": item['id'],
        # Duration string parsed by isodate and expressed in seconds
        "duration (sec)": isodate.parse_duration(details['duration']).total_seconds(),
        "dimension": details['dimension'],
        "definition": details['definition'],
        "projection": details['projection'],
    })

# Tabular view of the per-video content details
videos_duration_list_df = pd.DataFrame(videos_duration_list)
videos_duration_list_df



In [None]:
# Request the statistics part (view/like/favorite/comment counters) for all
# searched videos in a single call.
# NOTE(review): the videos.list reference describes maxResults as not
# supported together with the id filter - confirm whether it can be dropped.
stats_params = {
    "part": 'statistics',
    "maxResults": 50,
    "id": ','.join(video_ids),
}
stats_request = youtube.videos().list(**stats_params)
stats_response = stats_request.execute()

# Dump the raw response as indented JSON for inspection
print(json.dumps(stats_response, indent=4))

In [None]:
# Flatten the statistics of each video into one record per video.
# Every counter is read with .get() so that a video whose response omits a
# counter (presumably when the owner hides likes or disables comments - see
# the videos resource reference) yields None instead of raising KeyError.
videos_stats_list = []
for item in stats_response['items']:
    statistics = item['statistics']
    videos_stats_list.append({
        "video_id": item['id'],
        "view_count": statistics.get('viewCount'),
        "like_count": statistics.get('likeCount'),
        "favorite_count": statistics.get('favoriteCount'),
        "comment_count": statistics.get('commentCount'),
    })

# Tabular view of the per-video statistics (values are kept as returned)
videos_stats_list_df = pd.DataFrame(videos_stats_list)
videos_stats_list_df



### Sentiment Analysis - Extracting Comments

In [None]:
# Fetch the comments of one video by its video id

def get_comments(video_id):
    """Return the top-level comments of a video as a list of records.

    A single commentThreads().list request is issued (no page token is
    followed), so only the first page of comment threads is returned.

    Args:
        video_id: id of the video whose comment threads are requested.

    Returns:
        A list of ``{"video_id": ..., "text": ...}`` dicts, one per comment
        thread in the response. When the request raises ``HttpError`` the
        error is printed and a single placeholder record with the text
        ``'Disabled Comments'`` is returned instead.
    """
    # Only the network call sits inside the try block, so errors raised while
    # reading the response are not reported as a failed request.
    try:
        comments_response = youtube.commentThreads().list(
            part='snippet',
            videoId=video_id,
            textFormat='plainText',
        ).execute()
    except HttpError as e:
        print(f'Error fetching comments for Video ID {video_id}: {e}')
        return [{
            "video_id": video_id,
            "text": 'Disabled Comments',
        }]

    videos_comments_list = []
    # A response without an 'items' key is treated as "no comments"
    for thread in comments_response.get('items', []):
        top_level = thread['snippet']['topLevelComment']['snippet']
        videos_comments_list.append({
            # Taken from the comment itself; the parameter is left untouched
            "video_id": top_level['videoId'],
            "text": top_level['textDisplay'],
        })

    return videos_comments_list


In [None]:
# Download the comments of every video, keyed by video id
all_comments = {vid: get_comments(vid) for vid in video_ids}

# Flatten the per-video lists into one row per comment; a video with an
# empty list contributes a single 'No comments' placeholder row.
comments_list = []
for vid, fetched in all_comments.items():
    if not fetched:
        comments_list.append({
            "video_id": vid,
            "comment": 'No comments',
        })
        continue
    comments_list.extend(
        {"video_id": vid, "comment": entry['text']}
        for entry in fetched
    )

# Tabular view of all collected comments
comments_list_df = pd.DataFrame(comments_list)
comments_list_df


In [None]:
# Concatenate every collected comment into one space-separated string
text = ' '.join(comments_list_df['comment'].tolist())
text

In [None]:
# Score the combined comment text with TextBlob
blob = TextBlob(text)
sentiment = blob.sentiment
polarity = sentiment.polarity
subjectivity = sentiment.subjectivity

In [None]:
# Map the polarity score onto a label:
# above zero -> positive, below zero -> negative, otherwise neutral
sentiment_label = (
    'positive' if polarity > 0
    else 'negative' if polarity < 0
    else 'neutral'
)

In [None]:
# Bar chart comparing the polarity and subjectivity scores
plt.figure(figsize=(8, 5))
plt.bar(
    x=['polarity', 'Subjectivity'],
    height=[polarity, subjectivity],
    color=['green', 'blue'],
)
plt.title('Sentiment analysis')
plt.ylabel('Score')
plt.xticks(rotation=45)
# Dashed horizontal guide lines only
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Text summary of the same two scores
print(f"Polarity: {polarity:.2f} (Label: {sentiment_label})")
print(f"Subjectivity: {subjectivity:.2f}")

In [None]:
# Stop words shipped with wordcloud, extended with extra filler words that
# should not appear in the word cloud
stopwords = set(STOPWORDS)
new_words = [
    'comments', 'really', 'love', 'want', 'much', 'will', 'now', 'make',
    'lol', 'und', 'thing', 't', 'even', 'still', 'u', 's',
]
new_stopwords = stopwords | set(new_words)

In [None]:
# Build a word cloud of at most 50 words from the combined comment text,
# ignoring the extended stop-word set
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color="white",
    max_words=50,
    stopwords=new_stopwords,
)
wordcloud = wordcloud.generate(text)

# Render the cloud as an image without axes
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.title("Word Cloud Example")
plt.axis("off")
plt.show()

In [None]:
# url = "https://www.youtube.com/watch?v=Fpn1imb9qZg&ab_channel=Coldplay"
# yt = YouTube(url)
# yt.streams.get_by_resolution('2160p')
# yt.streams.first().download()
# print("Download Complete")

### Concatenating All the Datasets Together

In [None]:
# Left-join the video list with its statistics and content details on
# 'video_id', keeping every row of videos_list_df.
data_frames = [videos_list_df, videos_stats_list_df, videos_duration_list_df]

df_merged = data_frames[0]
for frame in data_frames[1:]:
    df_merged = df_merged.merge(frame, on=['video_id'], how='left')
df_merged

In [None]:
# Rank the videos by view count and keep the five most viewed.
# view_count is still stored exactly as the API returned it, so it is
# converted to a numeric dtype before sorting; sorting the raw text values
# would order them lexicographically (e.g. '9' above '10').
df_merged['view_count'] = pd.to_numeric(df_merged['view_count'])
df_merged = df_merged.sort_values(by=['view_count'], ascending=False)

# Keep the descriptive and engagement columns of the top five and export them
top_5_videos_df = df_merged[['title','description','view_count','like_count','comment_count']].head(5)
top_5_videos_df.to_csv("top_5_videos_df.csv", index=False)
top_5_videos_df

### Visualizing The Data

#### Correlation Analysis

In [None]:
# Correlation heatmap of duration, engagement counters and publication time.
corr_columns = ['duration (sec)', 'view_count', 'like_count', 'comment_count', 'publishedAt']
count_columns = ['view_count', 'like_count', 'comment_count']

# The counters are stored as returned by the API; convert them explicitly so
# corr() treats them as numbers regardless of how the installed pandas
# version handles non-numeric columns (missing counters become NaN).
df_merged[count_columns] = df_merged[count_columns].apply(pd.to_numeric)

# Publication timestamps are turned into integers so they can be correlated
df_merged['publishedAt'] = pd.to_datetime(df_merged['publishedAt'])
df_merged['publishedAt'] = pd.to_numeric(df_merged['publishedAt'])

# Compute the matrix once and hide its upper triangle (including the
# diagonal), which only mirrors the lower one
corr_matrix = df_merged[corr_columns].corr()
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

plt.figure(figsize=(10,5))
sns.heatmap(corr_matrix, annot=True, mask=mask, fmt='.2f', cmap='BrBG')
plt.show()

#### Descriptive Statistics

In [None]:
# Histogram of the video durations (in seconds) of Bloomberg Originals

plt.figure(figsize=(8, 5))
plt.hist(x=df_merged['duration (sec)'], edgecolor="black")

# Labelling and grid
plt.xlabel('Duration', fontsize=10)
plt.ylabel('Count', fontsize=10)
plt.title('Bloomberg Originals Video Duration Distribution', size=14, fontweight="bold")
plt.grid(True)
plt.show()

In [None]:
# Boxplot of the view counts of Bloomberg Originals videos

# Make sure the counter column is numeric before plotting
df_merged = df_merged.assign(view_count=pd.to_numeric(df_merged['view_count']))

plt.figure(figsize=(8, 5))
# whis=(0, 100): whiskers span the full range, so no point is drawn as an outlier
sns.boxplot(x="view_count", data=df_merged, whis=(0, 100))
plt.xlabel('View Count (M)', fontsize=10)
plt.title('Bloomberg Originals View Count Boxplot', size=14, fontweight="bold")
plt.grid(True)
plt.show()

In [None]:
# Boxplot of the like counts of Bloomberg Originals videos

# Make sure the counter column is numeric before plotting
df_merged = df_merged.assign(like_count=pd.to_numeric(df_merged['like_count']))

plt.figure(figsize=(8, 5))
# whis=(0, 100): whiskers span the full range, so no point is drawn as an outlier
sns.boxplot(x="like_count", data=df_merged, whis=(0, 100))
plt.xlabel('Like Count', fontsize=10)
plt.title('Bloomberg Originals Like Count Boxplot', size=14, fontweight="bold")
plt.grid(True)
plt.show()

### Correlation Between Like Count, View Count & Duration of Video

In [None]:
# Scatter plot of video duration against like count

# Make sure the counter column is numeric before plotting
df_merged['like_count'] = pd.to_numeric(df_merged['like_count'])

plt.figure(figsize=(8,5))
# No hue or size variable is mapped, so no palette/sizes/legend options are
# passed: they would have no effect on the plot.
sns.scatterplot(data=df_merged, x="duration (sec)", y="like_count")
plt.title('Duration vs Like Count', size=14, fontweight="bold")
plt.xlabel('Duration (sec)', fontsize=10)
plt.ylabel('Like Count', fontsize=10)
plt.grid(True)
plt.show()

In [None]:
# Scatter plot of video duration against view count

# Make sure the counter column is numeric before plotting
df_merged['view_count'] = pd.to_numeric(df_merged['view_count'])

plt.figure(figsize=(8,5))
# No hue or size variable is mapped, so no palette/sizes/legend options are
# passed: they would have no effect on the plot.
sns.scatterplot(data=df_merged, x="duration (sec)", y="view_count")
plt.title('Duration vs View Count', size=14, fontweight="bold")
plt.xlabel('Duration (sec)', fontsize=10)
plt.ylabel('View Count', fontsize=10)
plt.grid(True)
plt.show()