In [1]:
# Importing packages needed to access and authenticate Youtube Data API

from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

import urllib.request

import urllib.parse as p
import re
import os
import pickle

# OAuth scope(s) handed to InstalledAppFlow inside youtube_authenticate() when the
# interactive authorisation flow has to be run.
SCOPES = ["https://www.googleapis.com/auth/youtube.force-ssl"]

In [2]:
#!pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib==0.4.6

Collecting google-api-python-client
  Downloading google_api_python_client-2.83.0-py2.py3-none-any.whl (11.2 MB)
     -------------------------------------- 11.2/11.2 MB 107.8 kB/s eta 0:00:00
Collecting google-auth-httplib2
  Using cached google_auth_httplib2-0.1.0-py2.py3-none-any.whl (9.3 kB)
Collecting google-auth-oauthlib==0.4.6
  Using cached google_auth_oauthlib-0.4.6-py2.py3-none-any.whl (18 kB)
Collecting google-auth>=1.0.0
  Downloading google_auth-2.17.1-py2.py3-none-any.whl (178 kB)
     ------------------------------------ 178.1/178.1 kB 136.1 kB/s eta 0:00:00
Collecting requests-oauthlib>=0.7.0
  Using cached requests_oauthlib-1.3.1-py2.py3-none-any.whl (23 kB)
Collecting uritemplate<5,>=3.0.1
  Using cached uritemplate-4.1.1-py2.py3-none-any.whl (10 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5
  Using cached google_api_core-2.11.0-py3-none-any.whl (120 kB)
Collecting httplib2<1dev,>=0.15.0
  Downloading httplib2-0.22.0-py3-none-any.whl

In [2]:
# The following function is used to authenticate with the google cloud and gain permission and access
# to the Youtube DATA API

def youtube_authenticate():
    """
    Build and return an authenticated YouTube Data API (v3) service object.

    Credentials are cached in ``token.pickle`` in the current working directory:
      * if the cached credentials are valid they are used as-is;
      * if they are expired but carry a refresh token they are refreshed;
      * otherwise the installed-app OAuth flow is run from
        ``YT_API_Credentials.json`` with the module-level ``SCOPES``.
    Whenever the credentials were refreshed or newly obtained they are written
    back to ``token.pickle``.

    Side effect: sets the ``OAUTHLIB_INSECURE_TRANSPORT`` environment variable
    to ``"1"`` for the whole process.
    """
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
    token_path = "token.pickle"

    # Try the credential cache first.
    # NOTE(review): pickle.load executes arbitrary code from the file - only safe
    # as long as token.pickle is written exclusively by this function.
    credentials = None
    if os.path.exists(token_path):
        with open(token_path, "rb") as cached:
            credentials = pickle.load(cached)

    # Fast path: cached credentials are still good, nothing to refresh or save.
    if credentials and credentials.valid:
        return build("youtube", "v3", credentials=credentials)

    if credentials and credentials.expired and credentials.refresh_token:
        # Expired but refreshable: renew without user interaction.
        credentials.refresh(Request())
    else:
        # No usable credentials: let the user log in through the local-server flow.
        oauth_flow = InstalledAppFlow.from_client_secrets_file("YT_API_Credentials.json", SCOPES)
        credentials = oauth_flow.run_local_server(port=0)

    # Persist the new/refreshed credentials for the next run.
    with open(token_path, "wb") as cached:
        pickle.dump(credentials, cached)

    return build("youtube", "v3", credentials=credentials)

# Authenticate with YouTube API.
# The resulting service object is kept at module level and passed to
# get_video_details() by the thumbnail download cell further down.
youtube = youtube_authenticate()

In [4]:
# The following function will help us extract the youtube video ID from the video URL

def get_video_id_by_url(url):
    """
    Return the Video ID from the video `url`.

    Recognised URL shapes:
      * ``.../watch?v=<id>`` (any host; the ``v`` query parameter wins),
      * ``https://youtu.be/<id>``,
      * ``.../embed/<id>``, ``.../shorts/<id>``, ``.../live/<id>``, ``.../v/<id>``.

    Raises:
        ValueError: if no video ID can be found in `url`. ``ValueError`` is a
            subclass of ``Exception``, so callers that caught the generic
            ``Exception`` previously raised here keep working.
    """
    # split URL parts
    parsed_url = p.urlparse(url)

    # Preferred form: the ID is carried in the `v` query parameter.
    query_ids = p.parse_qs(parsed_url.query).get("v")
    if query_ids:
        return query_ids[0]

    # Otherwise the ID lives in the path; drop empty segments caused by
    # leading/trailing/double slashes.
    host = (parsed_url.hostname or "").lower()
    segments = [segment for segment in parsed_url.path.split("/") if segment]

    # Short links: https://youtu.be/<id>
    if host in ("youtu.be", "www.youtu.be") and segments:
        return segments[0]

    # Path-prefixed links: /embed/<id>, /shorts/<id>, /live/<id>, /v/<id>
    if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
        return segments[1]

    raise ValueError(f"Unable to parse video URL: {url}")

In [5]:
# The function below receives a YouTube service object (returned by the youtube_authenticate() function)
# and keyword arguments accepted by the API. It then returns the API response for a specific video.
def get_video_details(youtube, **kwargs):
    """
    Query the ``videos().list`` endpoint of the given service object.

    The request always asks for the ``snippet``, ``contentDetails`` and
    ``statistics`` parts; every extra keyword argument (for example ``id=...``)
    is forwarded unchanged. Returns whatever ``execute()`` yields.
    """
    requested_parts = "snippet,contentDetails,statistics"
    request = youtube.videos().list(part=requested_parts, **kwargs)
    return request.execute()

In [6]:
# A function that takes a response returned from the above get_video_details() function
# and returns the video title
def get_video_title(video_response):
    """
    Return the title of the first item in `video_response`, sanitised so it can
    be used as part of a file name.

    Sanitisation steps:
      1. ``|`` is replaced with ``-``.
      2. Every character outside the code-point ranges U+0000-U+05C0 and
         U+2100-U+214F is dropped.
      3. Characters that break file creation are dropped: ``\\ / ? " : * < >``
         and the ASCII control characters (newline, tab, NUL, ...).

    Raises:
        TypeError / IndexError: if the response has no ``items`` entry or the
            list is empty.
        KeyError: if the first item lacks ``snippet`` or ``title``.
    """
    snippet = video_response.get("items")[0]["snippet"]

    # Replace '|' with '-' to prevent errors in saving files
    title = snippet["title"].replace("|", "-")

    # Keep only the allowed code-point ranges (one pass instead of one regex call per character).
    title = re.sub(r"[^\u0000-\u05C0\u2100-\u214F]", "", title)

    # Remove everything a file name must not contain; the original list missed
    # backslash, '*', '<', '>' and control characters such as newlines.
    title = re.sub(r'[\\/?":*<>\x00-\x1f]', "", title)
    return title

In [7]:
# A function that takes a response returned from the above get_video_details() function
# and returns the channel title
def get_channel_title(video_response):
    """
    Return ``snippet.channelTitle`` of the first item in `video_response`.

    Raises TypeError/IndexError when the response has no (or an empty)
    ``items`` list and KeyError when the expected keys are absent.
    """
    first_item = video_response.get("items")[0]
    return first_item["snippet"]["channelTitle"]

In [8]:
# A function that takes a response returned from the above get_video_details() function
# and returns the video description
def get_description(video_response):
    """
    Return ``snippet.description`` of the first item in `video_response`.

    Raises TypeError/IndexError when the response has no (or an empty)
    ``items`` list and KeyError when the expected keys are absent.
    """
    first_item = video_response.get("items")[0]
    return first_item["snippet"]["description"]

In [9]:
# A function that takes a response returned from the above get_video_details() function
# and returns the number of comments on the video
def get_comment_count(video_response):
    """
    Return ``statistics.commentCount`` of the first item in `video_response`.

    The value is handed back exactly as stored in the response (no conversion).
    Raises TypeError/IndexError when the response has no (or an empty)
    ``items`` list and KeyError when ``statistics`` or ``commentCount`` is absent.
    """
    first_item = video_response.get("items")[0]
    return first_item["statistics"]["commentCount"]

In [10]:
# A function that takes a response returned from the above get_video_details() function
# and returns the number of views the video has
def get_view_count(video_response):
    """
    Return ``statistics.viewCount`` of the first item in `video_response`.

    The value is handed back exactly as stored in the response (no conversion).
    Raises TypeError/IndexError when the response has no (or an empty)
    ``items`` list and KeyError when ``statistics`` or ``viewCount`` is absent.
    """
    first_item = video_response.get("items")[0]
    return first_item["statistics"]["viewCount"]

In [11]:
# A function that takes a response returned from the above get_video_details() function
# and returns the duration of the video
def get_duration(video_response):
    """
    Return the duration of the first item in `video_response` as a clock string.

    The API reports the duration in a form such as ``'PT5H50M15S'``; this is
    converted to ``'H:MM:SS'`` (``'5:50:15'``) or, when shorter than one hour,
    ``'M:SS'`` (``'4:03'``, ``'0:15'``).

    Compared with a plain "join the components that are present" approach this
    - accepts durations without a seconds part (``'PT5M'``, ``'PT1H'``),
    - accepts a leading day part (``'P1DT2H'``, ``'P0D'``), folded into hours,
    - zero-pads, so ``'PT1H5S'`` becomes ``'1:00:05'`` rather than ``'1:5'``.

    Raises:
        ValueError: if the duration string does not have the expected shape.
        TypeError / IndexError / KeyError: if the response lacks ``items``,
            ``contentDetails`` or ``duration``.
    """
    content_details = video_response.get("items")[0]["contentDetails"]
    duration = content_details["duration"]

    # Every component is optional; fullmatch rejects trailing garbage.
    parsed = re.fullmatch(
        r"P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?",
        duration,
    )
    if parsed is None:
        raise ValueError(f"Unrecognised video duration: {duration!r}")

    days, hours, minutes, seconds = (int(part) if part else 0 for part in parsed.groups())

    # Normalise through total seconds so overflowing parts (e.g. 90M) still render correctly.
    total_seconds = ((days * 24 + hours) * 60 + minutes) * 60 + seconds
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)

    if hours:
        return f"{hours}:{minutes:02d}:{seconds:02d}"
    return f"{minutes}:{seconds:02d}"

In [12]:
# Scrape the video URLs from the YouTube trending page.
# Depending on the Google Vision API, either all trending videos shall be extracted together
# or the trending videos shall be extracted for each category separately (to be added here).
# The scraped URLs are used by the code blocks below to get video details and thumbnails.

web_page_url = 'https://www.youtube.com/feed/trending'

# Context manager guarantees the HTTP response is closed once the page has been read
with urllib.request.urlopen(web_page_url) as fid:
    webpage = fid.read().decode('utf-8')

# This shall contain all urls for trending videos
url_df = []

# The page source is split on double quotes and a fragment is kept only if it contains
# '/watch?v' and is exactly 20 characters long - presumably len('/watch?v=') == 9 plus an
# 11-character video ID, which filters out links carrying extra parameters (TODO confirm).
for line in webpage.split('"'):
    if '/watch?v' in line and len(line) == 20:
        url_df.append("https://www.youtube.com" + line)

In [13]:
# Code segment to download video thumbnail

import requests # Used to request image from the website
import shutil # Used to save image locally
import os

# Create the output folder; exist_ok makes the cell safe to re-run
os.makedirs("thumbnail_data", exist_ok=True)

# Set the https format for youtube thumbnails
thumb_address = "https://img.youtube.com/vi/"
thumb_format = "/0.jpg"

# Next step - get website links from url_df and loop the following code. Maybe make use of multi threading for downloads
# jj is the running index used as file-name prefix; it is advanced for every URL,
# whether or not the download succeeded.
jj = 0
for i in url_df:
    video_url = i
    # parse video ID from URL
    video_id = get_video_id_by_url(video_url)
    # make API call to get video info
    response = get_video_details(youtube, id=video_id)

    thumbnail_url = thumb_address + video_id + thumb_format

    video_title = get_video_title(response)
    file_name = 'thumbnail_data/{}-{}.jpg'.format(jj, video_title)

    # The context manager releases the streamed connection even if writing the file fails
    with requests.get(thumbnail_url, stream=True) as res:
        if res.status_code == 200:
            # Let urllib3 undo any gzip/deflate transfer encoding; otherwise the raw
            # (still compressed) bytes would be written to disk.
            res.raw.decode_content = True
            with open(file_name, 'wb') as f:
                shutil.copyfileobj(res.raw, f)
            # Print the name directly: calling .format() on it would raise for
            # titles that contain '{' or '}'.
            print('Image sucessfully Downloaded: ', file_name)
        else:
            print('Image could not be Downloaded')

    jj = jj + 1

Image sucessfully Downloaded:  thumbnail_data/0-JR PASS GUIDE & UPDATES! March 2023.jpg
Image sucessfully Downloaded:  thumbnail_data/1-TOP NEW HIDDEN PLACES IN TOKYO 2023! (part 2).jpg
Image sucessfully Downloaded:  thumbnail_data/2-JAPAN TRAVEL UPDATE 2023 MARCH & New places in TOKYO.jpg
Image sucessfully Downloaded:  thumbnail_data/3-I Got Attacked By A Heavyweight Champion.jpg
Image sucessfully Downloaded:  thumbnail_data/4-SORRY NOT SORRY.jpg
Image sucessfully Downloaded:  thumbnail_data/5-The Legend of Zelda: Tears of the Kingdom  Mr. Aonuma Gameplay Demonstration.jpg
Image sucessfully Downloaded:  thumbnail_data/6-Ponniyin Selvan Part-2 Trailer | Tamil | Mani Ratnam | AR Rahman |Subaskaran |Madras Talkies |Lyca.jpg
Image sucessfully Downloaded:  thumbnail_data/7-F1 Car vs Outback | Daniel Ricciardo's Great Aussie Road Trip.jpg
Image sucessfully Downloaded:  thumbnail_data/8-JISOO - (FLOWER) MV TEASER.jpg
Image sucessfully Downloaded:  thumbnail_data/9-Russia is getting hammered:

In [4]:
# Scratch data: the integers 0..95, used by the cells below to try out a counter.
list1 = list(range(96))
len(list1)

96

In [5]:
# Scratch check: count the entries of list1 by incrementing jj1 once per iteration,
# then print the total. Relies on list1 from the previous cell being in the kernel.
jj1=0
for i in list1:
    jj1+=1
print(jj1)

96


In [9]:
# Express the counter as a percentage of the 96 scratch items.
percent_done = jj1 * 100 / 96
print(percent_done)

100.0
