### YouTube Channel Name & Link Scraper

This script pulls channel names and links from a YouTube search and checks them against an existing list in `Influencers.xlsx`. It skips any repeats, and only the new entries are saved to a new Excel file.

**Edit `search_url` as required.**

---

### Requirements

- `selenium`
- `pandas`
- `openpyxl`
- ChromeDriver (must match your Chrome version)


In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import pandas as pd
import time
from openpyxl import load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import Font
from openpyxl.worksheet.hyperlink import Hyperlink

# --- Configuration: edit as needed ---
search_url = "https://www.youtube.com/results?search_query=baby+routine+vlog+india"
output_excel = "channels_with_links.xlsx"

# Channel names already tracked (column A of Influencers.xlsx); anything in
# this set is filtered out of the scraped results below.
df = pd.read_excel("Influencers.xlsx", usecols="A")
known_channels = set(df.iloc[:, 0].dropna().astype(str).str.strip())

# Set up headless Chrome
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
driver = webdriver.Chrome(options=options)

channels = []
try:
    driver.get(search_url)
    time.sleep(5)  # give the first batch of results time to render

    # Keep scrolling to the bottom until the page height stops growing,
    # i.e. YouTube has no more results to lazy-load.
    last_height = driver.execute_script("return document.documentElement.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
        time.sleep(3)
        current_height = driver.execute_script("return document.documentElement.scrollHeight")
        if current_height == last_height:
            break
        last_height = current_height

    # Extract channel names and links
    channel_elements = driver.find_elements(By.CSS_SELECTOR, "ytd-channel-name a.yt-simple-endpoint")
    for elem in channel_elements:
        name = elem.text.strip()
        link = elem.get_attribute("href")
        if name:
            channels.append({"Channel Name": name, "Channel Link": link})
finally:
    # Always release the browser, even when scraping fails part-way through.
    driver.quit()

# Drop channels repeated within the search results and channels that are
# already present in Influencers.xlsx.
# Explicit columns keep the frame well-formed when nothing was scraped;
# without them drop_duplicates() raises KeyError on an empty result.
df_channels = pd.DataFrame(channels, columns=["Channel Name", "Channel Link"])
df_channels["Channel Name"] = df_channels["Channel Name"].astype(str).str.strip()
df_channels = df_channels.drop_duplicates(subset=["Channel Name"])
new_channels = df_channels[~df_channels["Channel Name"].isin(known_channels)].reset_index(drop=True)
new_channels.to_excel(output_excel, index=False)

print(f"Total scraped: {len(df_channels)}")
print(f"Removed known: {len(df_channels) - len(new_channels)}")
print(f"New unique channels: {len(new_channels)}")

# Fix excel hyperlinks not working: pandas writes the URLs as plain text, so
# attach a real hyperlink (and link styling) to each cell in the link column.
wb = load_workbook(output_excel)
ws = wb.active

# Headers are in the first row: "Channel Name" in A, "Channel Link" in B
for row in range(2, ws.max_row + 1):
    link = ws[f"B{row}"].value
    if link:
        ws[f"B{row}"].hyperlink = link
        ws[f"B{row}"].font = Font(color="0000EE", underline="single")

wb.save(output_excel)
new_channels.head()

Total scraped: 407
Removed known: 7
New unique channels: 400


Unnamed: 0,Channel Name,Channel Link
0,Anu Choudhary Vlogs,https://www.youtube.com/@anu_choudhary_vlogs
1,Sindhu Family,https://www.youtube.com/@SindhuFamily
2,Simran & Gaurav Vlogs,https://www.youtube.com/@simransingh20029
3,Mom Com India,https://www.youtube.com/@MomComIndia
4,Annabell Newman,https://www.youtube.com/@AnnabellNewman



## YouTube Channel Scraper with Periodic User-Agent Rotation and Anti-Detection Measures

This script scrapes subscriber and video counts from YouTube channel pages listed 
in an input Excel file. It uses undetected_chromedriver with randomized user-agents 
to help avoid detection by anti-bot systems.

- Runs Chrome in incognito mode
- Avoids headless mode by default to reduce blocking risk.
- Implements randomized delays between requests to mimic human behavior.
- Retries loading channel pages up to 3 times if elements are not found.

### Usage:
- Set `input_excel` and `output_excel` at the top of the script, then run it (the script does not prompt for filenames).
- The input Excel file should contain two columns: 'Channel Name' and 'Channel Link'.
- Subscriber and video counts are added as new columns and saved to the output Excel file.

### Note:
- Running headless mode can increase the chance of being blocked.
- Adjust random wait times and agent rotation interval as needed.

### Dependencies:
- pandas
- openpyxl
- selenium
- undetected_chromedriver
- fake_useragent


In [3]:
import time
import random
import pandas as pd
from fake_useragent import UserAgent
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import undetected_chromedriver as uc

# Replace paths as required.
# input_excel is the workbook passed to main() to be read; output_excel is
# the workbook main() writes the enriched data to.
input_excel = "channels_with_links.xlsx"
output_excel = "youtube_channel_data_enriched.xlsx"

# Function to setup ChromeDriver with fake useragent and options
def setup_driver(*, headless=False):
    """Create an undetected_chromedriver Chrome instance with a random user-agent.

    Args:
        headless: When True, pass ``--headless`` to Chrome. Defaults to False
            because running headless can increase the risk of being blocked
            by anti-bot systems; keep it disabled when scraping more than
            about 100 channels.

    Returns:
        A ``uc.Chrome`` driver in incognito mode with a 1200x800 window.
    """
    options = uc.ChromeOptions()

    # Random Chrome user-agent string supplied by fake_useragent.
    user_agent = UserAgent().chrome
    options.add_argument(f'user-agent={user_agent}')
    options.add_argument('--incognito')

    if headless:
        options.add_argument('--headless')

    # Additional options for avoiding detection
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-infobars')
    options.add_argument('--disable-dev-shm-usage')

    driver = uc.Chrome(options=options)
    driver.set_window_size(1200, 800)
    return driver

def random_wait(min_sec=3, max_sec=7):
    """Pause for a random duration drawn uniformly between min_sec and max_sec seconds."""
    pause_seconds = random.uniform(min_sec, max_sec)
    time.sleep(pause_seconds)

def get_subscriber_and_video_counts(driver, url, max_retries=3):
    """Load a channel page and return its subscriber and video count strings.

    Args:
        driver: Selenium-compatible Chrome driver used to load the page.
        url: Channel page URL.
        max_retries: Maximum number of load attempts before giving up.

    Returns:
        A ``(subscribers, videos)`` tuple of strings. Each entry is the text
        of the matching metadata span (e.g. ``"210K subscribers"``), a
        ``"... not found"`` placeholder when the page loaded but no matching
        span was present, or an ``"Error: ..."`` message when every attempt
        failed.
    """
    row_selector = "div.yt-content-metadata-view-model-wiz__metadata-row"

    # Present a fresh Chrome user-agent for this request.
    new_ua = UserAgent().chrome
    driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": new_ua})

    for attempt in range(1, max_retries + 1):
        try:
            # Navigate inside the try block so a failed page load (timeout,
            # invalid URL, ...) is retried instead of aborting the whole run.
            driver.get(url)
            print(f"Waiting for elements, attempt {attempt} on URL: {url}")
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, row_selector))
            )

            subs_text = "Subscriber count not found"
            videos_text = "Video count not found"
            for elem in driver.find_elements(By.CSS_SELECTOR, f"{row_selector} span"):
                text = elem.text.strip()
                lowered = text.lower()
                # Match the singular stem so "1 subscriber" / "1 video" are
                # recognised as well as the plural forms.
                if 'subscriber' in lowered:
                    subs_text = text
                elif 'video' in lowered:
                    videos_text = text

            return subs_text, videos_text

        except Exception as exc:
            # Deliberately broad: one bad channel must not stop the batch,
            # but report what went wrong instead of hiding it.
            if attempt < max_retries:
                print(f"Attempt {attempt} failed for {url} ({type(exc).__name__}). Reloading page and retrying...")
                time.sleep(5)
            else:
                print(f"Attempt {attempt} failed for {url} ({type(exc).__name__}). Giving up.")

    return "Error: Subscriber count not found after retries", "Error: Video count not found after retries"

def main(input_excel_path, output_excel_path):
    """Add subscriber and video counts to a channel list and save it to Excel.

    Reads ``input_excel_path``, visits every 'Channel Link', appends
    'Subscriber Count' and 'Video Count' columns, writes the result to
    ``output_excel_path`` and turns the link column into clickable hyperlinks.

    Args:
        input_excel_path: Excel file with 'Channel Name' and 'Channel Link'
            columns.
        output_excel_path: Destination Excel file.

    Raises:
        KeyError: If a required column is missing from the input file.
    """
    # Imported here so this cell does not rely on imports made by another cell.
    from openpyxl import load_workbook
    from openpyxl.styles import Font

    df = pd.read_excel(input_excel_path)

    # Fail fast, before any browser is launched.
    missing = [col for col in ('Channel Name', 'Channel Link') if col not in df.columns]
    if missing:
        raise KeyError(f"Input file is missing required column(s): {missing}")

    subscriber_counts = []
    video_counts = []

    driver = None
    # Position at which the browser is (re)started with a new user-agent.
    # Starts at 0 so the first browser is created lazily inside the loop;
    # afterwards it advances by a random 20-25 rows (helps in avoiding detection).
    next_rotation = 0

    try:
        for position, (_, row) in enumerate(df.iterrows()):
            if position == next_rotation:
                if driver is not None:
                    driver.quit()
                    driver = None
                driver = setup_driver()
                user_agent = driver.execute_script("return navigator.userAgent")
                print(f"Changing user-agent at iteration {position}: {user_agent}")
                next_rotation = position + random.randint(20, 25)

            channel_name = row['Channel Name']
            channel_link = row['Channel Link']

            print(f"Processing: {channel_name} - {channel_link}")
            subs, videos = get_subscriber_and_video_counts(driver, channel_link)
            print(f"Subscribers: {subs}, Videos: {videos}")
            subscriber_counts.append(subs)
            video_counts.append(videos)

            # Random wait to avoid detection
            random_wait(3, 7)
    finally:
        # Always release the browser, even if a channel raised unexpectedly.
        if driver is not None:
            driver.quit()

    df['Subscriber Count'] = subscriber_counts
    df['Video Count'] = video_counts

    df.to_excel(output_excel_path, index=False)

    # Fixing excel links: pandas writes URLs as plain text, so attach real
    # hyperlinks. The link column is located by header rather than assumed to
    # be column B; openpyxl column numbers are 1-based.
    link_col = list(df.columns).index('Channel Link') + 1
    link_font = Font(color="0000EE", underline="single")
    wb = load_workbook(output_excel_path)
    ws = wb.active
    for row_num in range(2, ws.max_row + 1):
        cell = ws.cell(row=row_num, column=link_col)
        link = cell.value
        if isinstance(link, str) and link:
            cell.hyperlink = link
            cell.font = link_font

    wb.save(output_excel_path)
    print(f"Saved updated data to {output_excel_path}")
    
    
# Run the enrichment with the configured input/output paths when executed
# directly.
if __name__ == "__main__":
    main(input_excel, output_excel)


Changing user-agent at iteration 0: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36
Processing: Anu Choudhary Vlogs - https://www.youtube.com/@anu_choudhary_vlogs
Waiting for elements, attempt 1 on URL: https://www.youtube.com/@anu_choudhary_vlogs
Subscribers: 210K subscribers, Videos: 294 videos
Processing: Sindhu Family - https://www.youtube.com/@SindhuFamily
Waiting for elements, attempt 1 on URL: https://www.youtube.com/@SindhuFamily
Subscribers: 2.47K subscribers, Videos: 633 videos
Processing: Simran & Gaurav Vlogs - https://www.youtube.com/@simransingh20029
Waiting for elements, attempt 1 on URL: https://www.youtube.com/@simransingh20029
Subscribers: 14.3K subscribers, Videos: 227 videos
Processing: Mom Com India - https://www.youtube.com/@MomComIndia
Waiting for elements, attempt 1 on URL: https://www.youtube.com/@MomComIndia
Subscribers: 3.3M subscribers, Videos: 1.9K videos
Processing: Annabell Newman - https://w