# Web Scraping

URL: https://www.youtube.com/@PW-Foundation/videos

In [1]:
import requests
from bs4 import BeautifulSoup
from pandas import DataFrame
from json import loads

In [2]:
url = 'https://www.youtube.com/@PW-Foundation/videos'

# Fetch the channel's "Videos" page with an HTTP GET.
# The timeout keeps the cell from hanging forever on a stalled connection,
# and raise_for_status() reports an HTTP error here rather than as a
# confusing "script tag not found" failure further down.
r = requests.get(url, timeout=30)
r.raise_for_status()
r


<Response [200]>

In [3]:
# Parse the response body into a BeautifulSoup tree using the 'html.parser'
# backend, so the page's <script> tags can be searched.
soup = BeautifulSoup(r.text, 'html.parser')

In [5]:
# Collect every <script> tag on the page; one of them embeds the page data.
# find_all() is the current spelling of the deprecated findAll() alias.
all_script_tags = soup.find_all('script')

In [6]:
def script_tag_to_json(tags: list) -> dict:
    """Extract the ``ytInitialData`` JSON object from a list of script tags.

    The tags are scanned from last to first, and the first one whose text
    contains the ``ytInitialData = {"responseContext"`` assignment is decoded.

    Args:
        tags: Objects exposing the script source as a ``.text`` string.

    Returns:
        The decoded JSON object assigned to ``ytInitialData``.

    Raises:
        ValueError: If no tag contains the assignment. A payload that is not
            valid JSON raises ``json.JSONDecodeError``, a ``ValueError``
            subclass.
    """
    marker = 'ytInitialData = '
    needle = marker + '{"responseContext"'

    for tag in reversed(tags):
        text: str = tag.text
        start = text.find(needle)
        if start == -1:
            continue

        # Locate the payload from the marker instead of a fixed character
        # offset, so a different prefix (or none) before the assignment and
        # trailing whitespace after the closing ';' do not break decoding.
        payload = text[start + len(marker):].strip().rstrip(';').rstrip()
        return loads(payload)

    raise ValueError('Required script tag not found in the given tags.')

In [7]:
# Decode the embedded ytInitialData JSON from the collected script tags.
data = script_tag_to_json(all_script_tags)

In [8]:
def get_contents_dict(data):
    """Return the ``contents`` of the rich grid in the second browse tab.

    Walks ``data`` down to the ``richGridRenderer`` held by the tab at
    index 1 and returns its ``contents`` value, which callers index by
    position.
    """
    browse_tabs = data['contents']['twoColumnBrowseResultsRenderer']['tabs']
    second_tab = browse_tabs[1]['tabRenderer']
    rich_grid = second_tab['content']['richGridRenderer']
    return rich_grid['contents']

In [11]:
# Video URL (built from each entry's video ID)
def get_videoUrl(data:dict, n: int = 5):
    """Build watch-page URLs for the first ``n`` entries of the video grid.

    Args:
        data: Decoded page data, as accepted by ``get_contents_dict``.
        n: Number of leading grid entries to read; at most 30.

    Returns:
        A list of ``n`` URLs of the form
        ``https://www.youtube.com/watch?v=<videoId>``.

    Raises:
        ValueError: If ``n`` is greater than 30.
    """
    grid_entries = get_contents_dict(data)

    if n > 30:
        raise ValueError('Max Limit is 30.')

    return [
        'https://www.youtube.com/watch?v=' +
        grid_entries[idx]['richItemRenderer']['content']['videoRenderer']['videoId']
        for idx in range(n)
    ]

get_videoUrl(data)

['https://www.youtube.com/watch?v=46CNHP9wBAs',
 'https://www.youtube.com/watch?v=1Wk5gJtZ2sQ',
 'https://www.youtube.com/watch?v=iM_hVnElC-Q',
 'https://www.youtube.com/watch?v=Prly9d7LoAQ',
 'https://www.youtube.com/watch?v=AvwBDiCWSEM']

In [12]:
def get_thumbnails(data: dict, n: int = 5):
    """Collect one thumbnail URL for each of the first ``n`` grid entries.

    Args:
        data: Decoded page data, as accepted by ``get_contents_dict``.
        n: Number of leading grid entries to read; at most 30.

    Returns:
        A list of ``n`` thumbnail URLs.

    Raises:
        ValueError: If ``n`` is greater than 30.
    """
    grid_entries = get_contents_dict(data)

    if n > 30:
        raise ValueError('Max Limit is 30.')

    urls = []
    for position in range(n):
        video = grid_entries[position]['richItemRenderer']['content']['videoRenderer']
        # Each video lists several thumbnail variants; the last one is used.
        variants = video['thumbnail']['thumbnails']
        urls.append(variants[-1]['url'])

    return urls

get_thumbnails(data)

['https://i.ytimg.com/vi/46CNHP9wBAs/hqdefault.jpg?sqp=-oaymwEjCNACELwBSFryq4qpAxUIARUAAAAAGAElAADIQj0AgKJDeAE=&rs=AOn4CLBnDxav8ANiJSwQ02S72qCEqH-arw',
 'https://i.ytimg.com/vi/1Wk5gJtZ2sQ/hqdefault.jpg?sqp=-oaymwEjCNACELwBSFryq4qpAxUIARUAAAAAGAElAADIQj0AgKJDeAE=&rs=AOn4CLANLaii89RBWim09CxflQRysxX4Iw',
 'https://i.ytimg.com/vi/iM_hVnElC-Q/hqdefault.jpg?sqp=-oaymwEjCNACELwBSFryq4qpAxUIARUAAAAAGAElAADIQj0AgKJDeAE=&rs=AOn4CLAu89YhDS7Fsb5eXR3GKf6OsrEHxQ',
 'https://i.ytimg.com/vi/Prly9d7LoAQ/hqdefault.jpg?sqp=-oaymwEjCNACELwBSFryq4qpAxUIARUAAAAAGAElAADIQj0AgKJDeAE=&rs=AOn4CLAxqCd_PIbtlIui7PB1_yHivrAjLQ',
 'https://i.ytimg.com/vi/AvwBDiCWSEM/hqdefault.jpg?sqp=-oaymwEjCNACELwBSFryq4qpAxUIARUAAAAAGAElAADIQj0AgKJDeAE=&rs=AOn4CLCtUJpeO6evl8lSnyA1tu3aZYUAkA']

In [13]:
def get_title(data: dict, n:int = 5):
    """Collect the titles of the first ``n`` entries of the video grid.

    Args:
        data: Decoded page data, as accepted by ``get_contents_dict``.
        n: Number of leading grid entries to read; at most 30.

    Returns:
        A list of ``n`` title strings, each taken from the last text run of
        the entry's title.

    Raises:
        ValueError: If ``n`` is greater than 30.
    """
    grid_entries = get_contents_dict(data)

    if n > 30:
        raise ValueError('Max Limit is 30.')

    return [
        grid_entries[idx]['richItemRenderer']['content']['videoRenderer']['title']['runs'][-1]['text']
        for idx in range(n)
    ]

get_title(data)

['Complete 𝗔𝗖𝗜𝗗, 𝗕𝗔𝗦𝗘𝗦 𝗔𝗡𝗗 𝗦𝗔𝗟𝗧 in 110 Minutes | Class 10th Board Exam',
 'Complete 𝗖𝗛𝗘𝗠𝗜𝗖𝗔𝗟 𝗥𝗘𝗔𝗖𝗧𝗜𝗢𝗡  in 90 Minutes | Class 10th Board Exam',
 'Complete 𝐇𝐄𝐑𝐄𝐃𝐈𝐓𝐘 𝐀𝐍𝐃 𝐄𝐕𝐎𝐋𝐔𝐓𝐈𝐎𝐍 in 2 Hours|| Class 10th Board Exam',
 'Complete 𝗛𝗢𝗪 𝗗𝗢 𝗢𝗥𝗚𝗔𝗡𝗜𝗦𝗠𝗦 𝗥𝗘𝗣𝗥𝗢𝗗𝗨𝗖𝗘 in 1 Hours 50 Minutes|| Class 10th Board Exam',
 'Complete 𝗢𝗨𝗥 𝗘𝗡𝗩𝗜𝗥𝗢𝗡𝗠𝗘𝗡𝗧 in 1 Hour 30 Minutes | Class 10th Board Exam']

In [14]:
def get_viwes(data: dict, n: int = 5):
    """Return the view counts of the first ``n`` entries of the video grid.

    Args:
        data: Decoded page data, as accepted by ``get_contents_dict``.
        n: Number of leading grid entries to read; at most 30.

    Returns:
        A list of ``n`` ints parsed from each entry's view-count label
        (e.g. ``'7,547 views'`` gives ``7547``).

    Raises:
        ValueError: If ``n`` is greater than 30, or if a label does not
            start with a number.
    """
    contents = get_contents_dict(data)

    if n > 30:
        raise ValueError('Max Limit is 30.')

    result = []
    for i in range(n):
        label = contents[i]['richItemRenderer']['content']['videoRenderer']['viewCountText']['simpleText']

        # Parse the leading token rather than chopping a fixed six-character
        # " views" suffix: that slice turned "1 view" into an empty string
        # and made int() raise.
        parts = label.split(None, 1)
        token = parts[0].replace(',', '') if parts else ''

        # NOTE(review): a zero count is expected to be rendered as
        # "No views" -- confirm against live data.
        result.append(0 if token.lower() == 'no' else int(token))

    return result

get_viwes(data)

[7547, 17304, 42166, 19145, 34110]

In [15]:
def get_time_of_posting(data: dict, n: int = 5):
    """Collect the published-time labels of the first ``n`` grid entries.

    Args:
        data: Decoded page data, as accepted by ``get_contents_dict``.
        n: Number of leading grid entries to read; at most 30.

    Returns:
        A list of ``n`` label strings such as ``'5 hours ago'``.

    Raises:
        ValueError: If ``n`` is greater than 30.
    """
    grid_entries = get_contents_dict(data)

    if n > 30:
        raise ValueError('Max Limit is 30.')

    labels = (
        grid_entries[idx]['richItemRenderer']['content']['videoRenderer']['publishedTimeText']['simpleText']
        for idx in range(n)
    )
    return list(labels)

get_time_of_posting(data)

['5 hours ago', '7 hours ago', '19 hours ago', '22 hours ago', '23 hours ago']

In [19]:
# Collect the per-video details into a DataFrame (written to CSV further down).

def get_channel_video_details(data: dict, n: int = 5):
    """Assemble URL, title, thumbnail and posting time for ``n`` videos.

    Args:
        data: Decoded page data, passed through to the ``get_*`` helpers.
        n: Number of leading grid entries to include; the helpers reject
            values above 30. Defaults to 5, matching the helpers.

    Returns:
        A DataFrame with one row per video and the columns ``video_urls``,
        ``title``, ``thumbnail_url`` and ``time_of_posting``.

    Raises:
        ValueError: If ``n`` is greater than 30 (raised by the helpers).
    """
    thumbnails = get_thumbnails(data, n)
    time_of_posting = get_time_of_posting(data, n)
    titles = get_title(data, n)
    video_urls = get_videoUrl(data, n)

    rows = list(zip(video_urls, titles, thumbnails, time_of_posting))

    # Name the columns in the constructor instead of calling
    # DataFrame.from_dict on a list and renaming positional labels
    # afterwards: the columns then exist even when there are no rows.
    return DataFrame(
        rows,
        columns=['video_urls', 'title', 'thumbnail_url', 'time_of_posting'],
    )

In [17]:
# Collect details for 30 videos, the maximum the per-field helpers accept.
channel_data = get_channel_video_details(data, 30)

In [18]:
# Write the table to 'PW-Foundation.csv', leaving out the DataFrame index.
channel_data.to_csv('PW-Foundation.csv', index=False)