# Assignment 21 - Feb 22' 23 - YouTube Video Data Scraping

## Go to this given URL and solve the following questions.
> URL: https://www.youtube.com/@PW-Foundation/videos

In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import logging
import os
from pandas import DataFrame
import json
from json import loads

In [2]:
url = "https://www.youtube.com/@PW-Foundation/videos"

# Download the page at `url`. A timeout is passed because requests otherwise
# waits indefinitely on an unresponsive server, and raise_for_status() reports
# an HTTP error here rather than as a confusing parsing failure in a later cell.
response = requests.get(url, timeout=30)
response.raise_for_status()
response    # use - response.text - to see the html extracted by get method

<Response [200]>

In [3]:
# Parse the downloaded HTML text using the "html.parser" parser
soup = BeautifulSoup(markup=response.text, features="html.parser")  # evaluate `soup` to see the parsed html

In [4]:
# Collect every <script> element of the page; find_all is the current name
# of the legacy findAll alias.
all_script_tags = soup.find_all("script")

In [5]:
def script_tag_to_json(tags: list) -> dict:
    """Extract the ``ytInitialData`` JSON object from a list of script tags.

    The tags are scanned from last to first, and the first one whose text
    contains ``ytInitialData = {"responseContext"`` is decoded.

    Args:
        tags: Objects exposing the script source through a ``text``
            attribute.

    Returns:
        The decoded ``ytInitialData`` object.

    Raises:
        ValueError: If no tag contains the marker, or (as
            ``json.JSONDecodeError``) if the payload is not valid JSON.
    """
    assignment = 'ytInitialData = '
    marker = assignment + '{"responseContext"'

    for tag in reversed(tags):
        text: str = tag.text
        position = text.find(marker)
        if position == -1:
            continue
        # Decode from the opening brace instead of slicing fixed offsets:
        # raw_decode() stops at the end of the object, so whatever precedes
        # the assignment and whatever follows the closing brace (";",
        # whitespace, further statements) no longer breaks the parse.
        start = position + len(assignment)
        payload, _ = json.JSONDecoder().raw_decode(text[start:])
        return payload

    raise ValueError("Required script tag not found in the given tags.")

In [6]:
# Decode the ytInitialData payload found in the page's script tags
data = script_tag_to_json(all_script_tags)

In [7]:
def get_contents_dict(data):
    """Return the grid items nested inside the ``ytInitialData`` payload.

    Follows a fixed key path through ``data`` down to the
    ``richGridRenderer`` contents of the tab at index 1 and returns the
    object stored there.
    """
    path = (
        'contents',
        'twoColumnBrowseResultsRenderer',
        'tabs',
        1,
        'tabRenderer',
        'content',
        'richGridRenderer',
        'contents',
    )
    node = data
    # Descend one level per step; a missing key or index raises exactly as
    # a chained subscript expression would.
    for step in path:
        node = node[step]
    return node

### Q1. Write a Python program to extract the video URL of the first five videos.

#### Get Video ID

In [8]:
def get_videoUrl(data: dict, n: int = 5):
    """Build the watch-page URLs of the first ``n`` grid items.

    Args:
        data: Parsed ``ytInitialData`` payload, as accepted by
            ``get_contents_dict``.
        n: Number of leading grid items to read; at most 30.

    Returns:
        A list of ``https://www.youtube.com/watch?v=<videoId>`` strings.

    Raises:
        ValueError: If ``n`` is greater than 30.
    """
    grid_items = get_contents_dict(data)

    if n > 30:
        raise ValueError("Max limit is 30.")

    return [
        "https://www.youtube.com/watch?v="
        + grid_items[idx]['richItemRenderer']['content']['videoRenderer']['videoId']
        for idx in range(n)
    ]

# Show the video URLs for the default of n=5 videos
get_videoUrl(data)

['https://www.youtube.com/watch?v=ZNHlCezYN1I',
 'https://www.youtube.com/watch?v=nNmV-fuGW5c',
 'https://www.youtube.com/watch?v=6Mht7UigC_w',
 'https://www.youtube.com/watch?v=WObRrneLVRY',
 'https://www.youtube.com/watch?v=nX5ONgCdLcc']

### Q2. Write a Python program to extract the URL of the video thumbnails of the first five videos.

#### Get video thumbnails

In [9]:
def get_thumbnails(data: dict, n: int = 5):
    """Return one thumbnail URL for each of the first ``n`` grid items.

    For every item, the URL of the last entry in its ``thumbnails`` list
    is used.

    Args:
        data: Parsed ``ytInitialData`` payload, as accepted by
            ``get_contents_dict``.
        n: Number of leading grid items to read; at most 30.

    Returns:
        A list of thumbnail URL strings.

    Raises:
        ValueError: If ``n`` is greater than 30.
    """
    grid_items = get_contents_dict(data)

    if n > 30:
        raise ValueError("Max limit is 30.")

    # Lazily yield each item's videoRenderer so the lookups happen item by
    # item, in order.
    renderers = (
        grid_items[idx]['richItemRenderer']['content']['videoRenderer']
        for idx in range(n)
    )
    return [renderer['thumbnail']['thumbnails'][-1]['url'] for renderer in renderers]

# Show the thumbnail URLs for the default of n=5 videos
get_thumbnails(data)

['https://i.ytimg.com/vi/ZNHlCezYN1I/hqdefault.jpg?sqp=-oaymwEjCNACELwBSFryq4qpAxUIARUAAAAAGAElAADIQj0AgKJDeAE=&rs=AOn4CLCeC-GzAbJasTeW24Z9mSnL8PUV0w',
 'https://i.ytimg.com/vi/nNmV-fuGW5c/hqdefault.jpg?sqp=-oaymwEjCNACELwBSFryq4qpAxUIARUAAAAAGAElAADIQj0AgKJDeAE=&rs=AOn4CLDrH1kaO1h12A7THP8j1zoeswW85w',
 'https://i.ytimg.com/vi/6Mht7UigC_w/hqdefault.jpg?sqp=-oaymwEjCNACELwBSFryq4qpAxUIARUAAAAAGAElAADIQj0AgKJDeAE=&rs=AOn4CLDT7e58gn2v84CHXh5IwWB6rXof8A',
 'https://i.ytimg.com/vi/WObRrneLVRY/hqdefault.jpg?sqp=-oaymwEjCNACELwBSFryq4qpAxUIARUAAAAAGAElAADIQj0AgKJDeAE=&rs=AOn4CLB22wdxfGJ8qHomPM8TcAMwI8FDRA',
 'https://i.ytimg.com/vi/nX5ONgCdLcc/hqdefault.jpg?sqp=-oaymwEjCNACELwBSFryq4qpAxUIARUAAAAAGAElAADIQj0AgKJDeAE=&rs=AOn4CLA-CsQcSMRiaxzuJVMn37GAUFS6XQ']

### Q3. Write a Python program to extract the title of the first five videos.

#### Get video title

In [10]:
def get_title(data: dict, n: int = 5):
    """Return the titles of the first ``n`` grid items.

    For every item, the text of the last entry in its title ``runs`` list
    is used.

    Args:
        data: Parsed ``ytInitialData`` payload, as accepted by
            ``get_contents_dict``.
        n: Number of leading grid items to read; at most 30.

    Returns:
        A list of title strings, one per item.

    Raises:
        ValueError: If ``n`` is greater than 30.
    """
    contents = get_contents_dict(data)

    if n > 30:
        raise ValueError('Max Limit is 30.')

    # Iterate over n items (previously hard-coded to 5, which ignored the
    # argument and returned fewer titles than requested for n > 5).
    return [
        contents[i]['richItemRenderer']['content']['videoRenderer']['title']['runs'][-1]['text']
        for i in range(n)
    ]

# Show the video titles using the default n=5
get_title(data)

['Revise through PYQs || Electricity || #science #physics',
 'Revise through PYQs || Light || #science #physics',
 'NEW Batches for Class 10 & 9 - Session 2023-24 || NEEV and UDAAN Batch Launch 🚀',
 'आपको Commerce में क्या सीखने को मिलता है ?? Complete Information',
 'How to Attempt English Board Exam ????']

### Q4. Write a Python program to extract the number of views of the first five videos.

#### Get video views

In [11]:
def get_viwes(data: dict, n: int = 5):
    """Return the view counts of the first ``n`` grid items as strings.

    The count is taken from each item's ``viewCountText`` (for example
    ``"4,835 views"``) with the thousands separators removed.

    Args:
        data: Parsed ``ytInitialData`` payload, as accepted by
            ``get_contents_dict``.
        n: Number of leading grid items to read; at most 30.

    Returns:
        A list of strings such as ``"4835"``, one per item.

    Raises:
        ValueError: If ``n`` is greater than 30.
    """
    contents = get_contents_dict(data)

    if n > 30:
        raise ValueError('Max Limit is 30.')

    result = []
    for i in range(n):
        count_text = contents[i]['richItemRenderer']['content']['videoRenderer']['viewCountText']['simpleText']
        # Keep the token before the first space rather than chopping a fixed
        # six characters: "[:-6]" assumed the suffix " views" and turned the
        # singular "1 view" into an empty string.
        video_views = count_text.partition(" ")[0].replace(",", "")
        result.append(video_views)

    return result

# Show the view counts for the default of n=5 videos
get_viwes(data)

['4835', '26778', '34138', '11201', '184902']

### Q5. Write a Python program to extract the time of posting of video for the first five videos.

#### Get time of posting of video

In [12]:
def get_time_of_posting(data: dict, n: int = 5):
    """Return the published-time text of the first ``n`` grid items.

    Args:
        data: Parsed ``ytInitialData`` payload, as accepted by
            ``get_contents_dict``.
        n: Number of leading grid items to read; at most 30.

    Returns:
        A list with each item's ``publishedTimeText`` string
        (for example ``"6 hours ago"``).

    Raises:
        ValueError: If ``n`` is greater than 30.
    """
    grid_items = get_contents_dict(data)

    if n > 30:
        raise ValueError('Max Limit is 30.')

    posted = []
    for idx in range(n):
        renderer = grid_items[idx]['richItemRenderer']['content']['videoRenderer']
        posted.append(renderer['publishedTimeText']['simpleText'])
    return posted

# Show the posting times for the default of n=5 videos
get_time_of_posting(data)

['1 hour ago', '6 hours ago', '8 hours ago', '2 days ago', '6 days ago']

## Note: Save all the data scraped in the above questions in a CSV file.

### Save data in CSV format.

In [13]:
def get_channel_video_details(data: dict, n: int) -> DataFrame:
    """Gather the scraped details of ``n`` videos into one DataFrame.

    Args:
        data: Parsed ``ytInitialData`` payload passed on to the individual
            ``get_*`` extractors.
        n: Number of videos requested from each extractor.

    Returns:
        A DataFrame with one row per video and the columns ``title``,
        ``time_of_posting``, ``views``, ``video_urls`` and
        ``thumbnail_url``.

    Raises:
        ValueError: Propagated from the extractors when ``n`` exceeds
            their limit.
    """
    titles = get_title(data, n)
    time_of_posting = get_time_of_posting(data, n)
    views = get_viwes(data, n)
    video_urls = get_videoUrl(data, n)
    thumbnails = get_thumbnails(data, n)

    # One tuple per video; zip stops at the shortest of the five lists.
    rows = list(zip(titles, time_of_posting, views, video_urls, thumbnails))

    # Name the columns in the constructor instead of routing a list through
    # DataFrame.from_dict and renaming afterwards; this also keeps the
    # header when there are no rows.
    return DataFrame(
        rows,
        columns=[
            'title',
            'time_of_posting',
            'views',
            'video_urls',
            'thumbnail_url',
        ],
    )

In [14]:
# Request the details of 10 videos as a DataFrame
channel_data = get_channel_video_details(data, 10)

In [15]:
# Write the collected details to a CSV file without the index column
channel_data.to_csv('PW-Foundation-Video-Details.csv', index=False)