# YouTube Channel Video Title Scraper

This notebook uses `Selenium` with `undetected_chromedriver` to extract recent video titles from a list of YouTube channels.

## Input
- `channels.xlsx` — Excel file with two columns:
  - `Channel Name`
  - `Channel Link`

## How It Works
- Opens each channel in a browser
- Scrolls to load up to 50 video titles
- Cleans text using regex
- Randomly waits between actions to avoid detection
- Restarts the browser with a new user-agent at a random interval of 20–25 channels

## Output
- `youtube_channel_titles.xlsx` — Excel file with cleaned video titles

## Requirements

Install these before running the notebook:

```bash
pip install selenium undetected-chromedriver pandas openpyxl fake-useragent
```

In [1]:
import time
import random
import pandas as pd
from fake_useragent import UserAgent
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import undetected_chromedriver as uc
import re

#Replace paths as required
# Workbook locations passed to main(); adjust them to match your environment.
output_excel = "youtube_channel_titles.xlsx"  # destination for the cleaned titles
input_excel = "channels.xlsx"  # must provide 'Channel Name' and 'Channel Link' columns

# Set up ChromeDriver with a fake user agent and anti-detection options
def setup_driver():
    """Build and return an undetected-chromedriver Chrome instance.

    The browser is started in incognito mode with a user agent string taken
    from ``fake_useragent`` and a fixed 1200x800 window.

    Returns:
        The started ``uc.Chrome`` driver; the caller is responsible for
        calling ``quit()`` on it.
    """
    agent_source = UserAgent()
    chrome_options = uc.ChromeOptions()
    chrome_options.add_argument(f'user-agent={agent_source.chrome}')
    chrome_options.add_argument('--incognito')

    '''
    Running in headless mode can increase the risk of being blocked by anti-bot systems.  
    If you plan to scrape more than 100 channels, it’s best to keep headless mode disabled.  
    To enable headless mode, simply remove the comment (#) from the line below.
    '''
    # chrome_options.add_argument('--headless')

    # Extra switches intended to make the automated browser harder to detect.
    for switch in (
        '--disable-blink-features=AutomationControlled',
        '--no-sandbox',
        '--disable-infobars',
        '--disable-dev-shm-usage',
    ):
        chrome_options.add_argument(switch)

    browser = uc.Chrome(options=chrome_options)
    browser.set_window_size(1200, 800)
    return browser

def clean_title(title):
    """Return *title* with every character outside the allowed set removed.

    Allowed characters are regex word characters, whitespace, and the
    punctuation ``. , : ; ! ? ' - " ( ) [ ] { } < > |``. Everything else
    (emoji, ``#``, ``&``, typographic quotes, ...) is dropped without
    inserting a replacement, so surrounding spaces are left as they were.
    """
    disallowed = re.compile(r"[^\w\s.,:;!?'\-\"()\[\]{}<>|]", flags=re.UNICODE)
    return disallowed.sub("", title)

def random_wait(min_sec=3, max_sec=7):
    """Pause for a random duration drawn uniformly from [min_sec, max_sec] seconds."""
    pause_seconds = random.uniform(min_sec, max_sec)
    time.sleep(pause_seconds)


def get_video_titles(channel_url, driver=None, max_titles=50):
    """Collect up to ``max_titles`` distinct video titles from a channel page.

    The page is scrolled to the bottom repeatedly; after each scroll every
    element with id ``video-title`` is read. Scrolling stops once enough
    titles were gathered or the document height no longer grows.

    Args:
        channel_url: URL of the channel's video listing to open.
        driver: An already running Selenium driver to reuse. When ``None``,
            a temporary headless Chrome driver is started and shut down
            again before returning.
        max_titles: Upper bound on the number of titles returned.

    Returns:
        A list of unique, stripped title strings in the order they were
        first encountered, at most ``max_titles`` long.
    """
    created_driver = driver is None
    if created_driver:
        # Imported here because the module-level imports do not provide
        # ``webdriver`` / ``Options``; without this the fallback path
        # failed with NameError.
        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options

        options = Options()
        options.add_argument("--headless")
        driver = webdriver.Chrome(options=options)

    try:
        driver.get(channel_url)
        time.sleep(3)  # give the initial page content time to render

        # A dict serves as an insertion-ordered set: duplicates are dropped
        # while first-seen order is kept, so truncating to max_titles is
        # deterministic (a plain set would yield an arbitrary subset).
        titles = {}
        last_height = driver.execute_script("return document.documentElement.scrollHeight")

        while len(titles) < max_titles:
            # Scroll to the bottom to trigger loading of further videos.
            driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
            time.sleep(2)

            for elem in driver.find_elements(By.ID, "video-title"):
                text = elem.text.strip()
                if text:
                    titles[text] = None
                if len(titles) >= max_titles:
                    break

            # An unchanged document height means the scroll loaded nothing new.
            new_height = driver.execute_script("return document.documentElement.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
    finally:
        # Only shut down a driver this function started, and do so even
        # when navigation or scraping raised.
        if created_driver:
            driver.quit()

    return list(titles)[:max_titles]

def main(input_excel_path, output_excel_path):
    """Scrape video titles for every channel in a workbook and save them.

    Reads the channel list from ``input_excel_path`` (columns
    ``Channel Name`` and ``Channel Link``), collects titles for each channel
    with ``get_video_titles``, cleans them with ``clean_title`` and writes a
    single ``titles`` column to ``output_excel_path``.

    The browser is restarted through ``setup_driver`` (and therefore gets a
    new user agent) on the first row and then again after a random 20-25
    rows, so any option configured in ``setup_driver`` applies to every
    browser instance used here.

    Args:
        input_excel_path: Path of the Excel file listing the channels.
        output_excel_path: Path of the Excel file to write the titles to.
    """
    df = pd.read_excel(input_excel_path)

    all_titles = []
    driver = None
    # Row position at which the next browser restart is due; 0 starts the
    # first browser on the first row.
    next_rotation = 0

    try:
        # Count rows with enumerate rather than relying on the DataFrame
        # index labels, which need not be 0..n-1.
        for position, (_, row) in enumerate(df.iterrows()):
            if position >= next_rotation:
                if driver is not None:
                    driver.quit()
                    driver = None  # avoid a second quit() if the restart fails
                driver = setup_driver()
                # Report the user agent the new browser actually presents.
                user_agent = driver.execute_script("return navigator.userAgent")
                print(f"Changing user-agent at iteration {position}: {user_agent}")
                next_rotation = position + random.randint(20, 25)

            channel_name = row['Channel Name']
            channel_link = row['Channel Link']

            print(f"Processing: {channel_name} - {channel_link}")
            all_titles.extend(get_video_titles(channel_link, driver))

            # Random wait to avoid detection
            random_wait(3, 7)
    finally:
        # Always close the browser, including when a channel fails.
        if driver is not None:
            driver.quit()

    cleaned_titles = [clean_title(t) for t in all_titles]
    print(cleaned_titles)
    final_title_df = pd.DataFrame(cleaned_titles, columns=['titles'])
    # Write to the path given by the caller rather than the module-level default.
    final_title_df.to_excel(output_excel_path, index=False)
    
    
# Run the scraper with the module-level input/output paths, but only when
# this file is executed directly (not when it is imported).
if __name__ == "__main__":
    main(input_excel, output_excel)


Changing user-agent at iteration 0: Mozilla/5.0 (iPhone; CPU iPhone OS 18_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/135.0.7049.53 Mobile/15E148 Safari/604.1
Processing: Emma Hubbard - https://www.youtube.com/@EmmaHubbard/videos
Processing: Healthy Child - https://www.youtube.com/@HealthyChild/videos
Processing: c4cooking channel - https://www.youtube.com/@c4cookingchannel/videos
Processing: Ishinna B. Sadana - https://www.youtube.com/@DrIshinnaSadana/videos
['3 Things to Avoid That Can Impact Development', 'Child Not Listening? Stop Yelling. Do This Instead', 'How This Innocent Mistake Stops Your Toddler From Playing Alone (And 3 Fixes!)', '3 Common Mistakes That Ruin Baby Sleep', 'This Mistake Teaches Your Toddler to Whine', '5 Parenting Lessons to Reduce Unnecessary Stress', 'STOP Talking When Your Child Melts Down. Do THIS Instead', 'Best Toys For Newborn Babies (From a Pediatric OT)', 'New Parents - Youll Wish You Knew This Sooner...', 'This Is Why Your Baby