### Import Libraries

In [18]:
import requests
from datetime import datetime
from bs4 import BeautifulSoup
from selenium import webdriver
import chromedriver_binary
import pandas as pd
import numpy as np
import time
from tqdm import tqdm

### Setting up Selenium

In [19]:
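# One-time setup: uncomment and run to install the browser-automation dependencies
# into the kernel's environment.
# %pip install selenium
# %pip install chromedriver_binary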


### Connecting Web Driver

In [20]:
# Address of the channel's "Videos" tab that the rest of the notebook scrapes.
channel_videos_url = "https://www.youtube.com/c/GeeksforGeeksVideos/videos"

# Launch an Edge session and navigate it to the channel page.
browser = webdriver.Edge()
browser.get(channel_videos_url)

In [21]:
# Snapshot of the page as first rendered, before any scrolling. The scroll cell
# below reassigns `soup` on every pass, so this initial parse is not what the
# extraction step ends up reading.
soup = BeautifulSoup(browser.page_source, "html.parser")

In [22]:
# Give the initial page a moment to render before scrolling.
time.sleep(3)

# Seconds to wait after each scroll so lazily loaded tiles can be appended.
SCROLL_PAUSE_SECONDS = 1
# Consecutive scrolls that may add nothing before the end of the list is assumed.
# Tolerating a few empty passes keeps one slow network round-trip from ending
# the scrape early.
MAX_STALLED_SCROLLS = 3

# Scroll until no new videos are loaded
last_count = 0
stalled_scrolls = 0

# The context manager closes the progress bar even if a browser call raises.
with tqdm(desc = "Processing") as pbar:
    while stalled_scrolls < MAX_STALLED_SCROLLS:
        # Scroll down to the bottom of the page
        browser.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
        time.sleep(SCROLL_PAUSE_SECONDS)  # Wait for new content to load

        # Count the video tiles in the live DOM. Re-parsing the whole page source
        # with BeautifulSoup on every pass gets slower as the page grows, so the
        # full parse is deferred until scrolling has finished.
        new_count = browser.execute_script(
            "return document.querySelectorAll('ytd-rich-grid-media').length;"
        )

        if new_count == last_count:
            stalled_scrolls += 1
            continue

        # New tiles appeared: reset the stall counter and record progress.
        stalled_scrolls = 0
        last_count = new_count
        pbar.update(1)

# Parse the fully loaded page once; later cells read `soup`.
soup = BeautifulSoup(browser.page_source, "html.parser")
video_elements = soup.find_all("ytd-rich-grid-media")

Processing: 21it [04:49, 13.77s/it]


### Creating Data Frame

In [23]:
# Every row is stamped with the scrape date as YYYY-MM-DD.
curr_date = datetime.now().date().isoformat()
data = []

# CSS class strings that identify the elements of interest inside one video tile.
TITLE_LINK_CLASS = "yt-simple-endpoint focus-on-expand style-scope ytd-rich-grid-media"
DURATION_CLASS = "style-scope ytd-thumbnail-overlay-time-status-renderer"
METADATA_CLASS = "inline-metadata-item style-scope ytd-video-meta-block"

# Build one row per video tile. A field whose element is absent from the tile is
# recorded as NaN through an explicit check, so unrelated errors (and Ctrl-C) are
# no longer swallowed by a blanket `except:`.
for sp in tqdm(soup.find_all("ytd-rich-grid-media"), desc = "Processing"):
    # The same anchor carries both the title text and the relative watch URL,
    # so it is looked up once.
    title_anchor = sp.find("a", class_ = TITLE_LINK_CLASS)
    title = title_anchor.text if title_anchor is not None else np.nan
    href = title_anchor.get("href") if title_anchor is not None else None
    video_link = "https://www.youtube.com" + href if href else np.nan

    # Thumbnail URL with its query string removed; NaN when the tile has no
    # <img> or the <img> has no `src` attribute.
    img = sp.find('img')
    src = img.get("src") if img is not None else None
    thumbnail_link = src.split("?")[0] if src is not None else np.nan

    duration_span = sp.find('span', class_ = DURATION_CLASS)
    duration = duration_span.text.strip() if duration_span is not None else np.nan

    # The first metadata span is read as the view count and the second as the
    # upload age, matching the positions the page renders them in.
    metadata_spans = sp.find_all('span', class_ = METADATA_CLASS)
    views = metadata_spans[0].text if len(metadata_spans) > 0 else np.nan
    date_time = metadata_spans[1].text if len(metadata_spans) > 1 else np.nan

    data.append([title, video_link, thumbnail_link, duration, views, date_time, curr_date])

print(len(data))

Processing: 100%|█████████████████████████████████████████████████████████████████| 1906/1906 [00:02<00:00, 883.66it/s]

1906





In [24]:
# Assemble the scraped rows into a table; the labels follow the order in which
# each row's fields were appended.
df = pd.DataFrame(
    data,
    columns=[
        "title",
        "video_link",
        "image_link",
        "duration",
        "views",
        "upload_time",
        "date_of_extraction",
    ],
)

In [25]:
# Number of missing values in each column.
df.isna().sum()

title                   0
video_link              0
image_link            426
duration                0
views                   0
upload_time             0
date_of_extraction      0
dtype: int64

In [27]:
# Summary statistics per column (count / unique / top / freq in the output below).
df.describe()

Unnamed: 0,title,video_link,image_link,duration,views,upload_time,date_of_extraction
count,1906,1906,1480,1906,1906,1906,1906
unique,1897,1906,1480,838,385,26,1
top,Sort the given string using character search |...,https://www.youtube.com/watch?v=puDKP5_0Ol4,https://i.ytimg.com/vi/puDKP5_0Ol4/hqdefault.jpg,3:39,12K views,6 years ago,2024-11-16
freq,2,1,1,11,37,403,1906


### Saving Dataset in CSV Format

In [28]:
# Write the table to the working directory. With pandas' default `index=True`,
# the row index is written as a leading, unlabeled column.
df.to_csv("youtube_data.csv")

### Closing Browser Session

In [29]:
# End the WebDriver session and close the browser window it opened.
browser.quit()

Unnamed: 0,title,video_link,image_link,duration,views,upload_time,date_of_extraction
0,GfG 160 | 160 Days Daily DSA Problem Solving |...,https://www.youtube.com/watch?v=puDKP5_0Ol4,https://i.ytimg.com/vi/puDKP5_0Ol4/hqdefault.jpg,1:06,669 views,7 hours ago,2024-11-16
1,From Tier 3 College to Qualcomm as a Software ...,https://www.youtube.com/watch?v=1TnVdzwbzJo,https://i.ytimg.com/vi/1TnVdzwbzJo/hqdefault.jpg,19:15,678 views,1 day ago,2024-11-16
2,Why you will not get placed? | Save yourself Now,https://www.youtube.com/watch?v=b-6rmk233DU,https://i.ytimg.com/vi/b-6rmk233DU/hqdefault.jpg,7:00,1.7K views,4 days ago,2024-11-16
3,Y.O.G.I. | Your Own GeeksforGeeks Intelligence...,https://www.youtube.com/watch?v=rfyhmQYEJKE,https://i.ytimg.com/vi/rfyhmQYEJKE/hqdefault.jpg,0:42,856 views,4 days ago,2024-11-16
4,GeeksforGeeks Offline Classes- Visit for Free ...,https://www.youtube.com/watch?v=hDfjzq9m5vI,https://i.ytimg.com/vi/hDfjzq9m5vI/hqdefault.jpg,0:57,49K views,9 days ago,2024-11-16
...,...,...,...,...,...,...,...
1901,Length of shortest chain to reach a target wor...,https://www.youtube.com/watch?v=6pIC20wCm20,https://i.ytimg.com/vi/6pIC20wCm20/hqdefault.jpg,12:25,43K views,8 years ago,2024-11-16
1902,Binary Search | GeeksQuiz,https://www.youtube.com/watch?v=T2sFYY-fT5o,https://i.ytimg.com/vi/T2sFYY-fT5o/hqdefault.jpg,7:33,191K views,8 years ago,2024-11-16
1903,Number of Triangles in an Undirected Graph | G...,https://www.youtube.com/watch?v=ChdNz1Ui1uc,https://i.ytimg.com/vi/ChdNz1Ui1uc/hqdefault.jpg,8:44,19K views,8 years ago,2024-11-16
1904,Write a program to print all permutations of a...,https://www.youtube.com/watch?v=AfxHGNRtFac,https://i.ytimg.com/vi/AfxHGNRtFac/hqdefault.jpg,11:52,491K views,8 years ago,2024-11-16
