In [None]:
import pandas as pd
import numpy as np
import requests
import base64
from datetime import datetime, timedelta
from dateutil import tz
import sys
import time
import json
import os
from dotenv import load_dotenv
from pandas import json_normalize
from glob import glob
from pathlib import Path


## Scraping

### Automatically load API keys from the key list (.env)

In [14]:
def get_api_key_auto(env='.env'):
    """Collect the consecutively numbered YouTube API keys from a dotenv file.

    The dotenv file is loaded into the process environment, then
    ``YOUTUBE_API_KEY_1``, ``YOUTUBE_API_KEY_2``, ... are read in order until
    the first missing number.

    Args:
        env: Path of the dotenv file to load.

    Returns:
        The keys found, in numeric order (empty if ``YOUTUBE_API_KEY_1`` is
        not set).
    """
    load_dotenv(env)

    collected = []
    index = 1
    # Stop at the first gap in the numbering.
    while (value := os.getenv(f"YOUTUBE_API_KEY_{index}")) is not None:
        collected.append(value)
        index += 1

    print(f"Using {len(collected)} API Keys")
    return collected


# Load the numbered YOUTUBE_API_KEY_<n> entries once; later cells reuse this list.
API_KEYS = get_api_key_auto()

Using 1 API Keys


### Calculate the time remaining until midnight Pacific Time, when the YouTube API quota normally resets

In [None]:
class YoutubeScaper:
    """Helpers for loading YouTube API keys and timing the daily quota reset."""

    def __init__(self,
                 env_path: str = '.env',

                 ):
        # Path of the dotenv file holding the YOUTUBE_API_KEY_<n> entries.
        self.env_path: str = env_path

    def get_api_key_auto(self) -> list[str]:
        """Load ``self.env_path`` and return the numbered API keys it defines.

        Reads ``YOUTUBE_API_KEY_1``, ``YOUTUBE_API_KEY_2``, ... from the
        environment until the first missing number.
        """
        load_dotenv(self.env_path)
        api_keys: list[str] = []
        i = 1

        while True:
            key = os.getenv(f"YOUTUBE_API_KEY_{i}")
            if key is None:
                break
            api_keys.append(key)
            i += 1

        print(f"Using {len(api_keys)} API Keys")
        return api_keys

    @staticmethod
    def cal_next_reset(current: "datetime | None" = None) -> int:  # (1) Due to YouTube API request limits being reset at midnight Pacific Time
        """Return the number of seconds from ``current`` until the next quota reset.

        Args:
            current: Reference instant. Defaults to the present time. An aware
                datetime is converted to Pacific Time; a naive datetime is
                interpreted by ``datetime.astimezone`` as system local time.

        Returns:
            Whole seconds until 00:00:01 Pacific Time on the following day,
            plus one extra second of safety margin.

        Raises:
            RuntimeError: If the Pacific time zone cannot be resolved.
        """
        pacific = tz.gettz("America/Los_Angeles")
        if pacific is None:
            # Falling back to local time would silently give a wrong wait.
            raise RuntimeError("Time zone 'America/Los_Angeles' is not available")

        if current is None:
            now_pacific = datetime.now(pacific)
        else:
            now_pacific = current.astimezone(pacific)

        next_reset = (now_pacific + timedelta(days=1)).replace(
            hour=0, minute=0, second=1, microsecond=0
        )  # second=1 to make sure systems of Youtube is on same day

        # Subtract in UTC: two datetimes sharing one tzinfo are subtracted as
        # wall-clock values, which is off by an hour across a DST change.
        utc = tz.tzutc()
        reset_remaining = next_reset.astimezone(utc) - now_pacific.astimezone(utc)

        # total_seconds() keeps the day component that .seconds would drop.
        return int(reset_remaining.total_seconds()) + 1


# Correctly spelled alias; the original class name is kept for existing callers.
YoutubeScraper = YoutubeScaper

sleep_time = (
    YoutubeScaper.cal_next_reset()
)  # (1) System will stop making requests once a 403 error is returned and calculate the remaining time until the reset.

### Automatically switch to the next key when the current key runs out of quota

In [None]:
class APIKeyManager:
    """Rotate through YouTube API keys, skipping keys marked as exhausted.

    Keys come either from an explicit list or from the numbered
    ``YOUTUBE_API_KEY_<n>`` entries of a dotenv file. When every key has been
    marked as failed, the manager sleeps until the next quota reset and then
    starts over with the first key.
    """

    def __init__(self, env_path: str = '.env', api_keys=None):
        """Create a manager.

        Args:
            env_path: Path of the dotenv file to read keys from. For
                compatibility with ``APIKeyManager(API_KEYS)``, a list or
                tuple passed here is treated as ``api_keys``.
            api_keys: Optional explicit list of keys; when given, the dotenv
                file is not read.

        Raises:
            FileNotFoundError: If keys must be loaded and ``env_path`` does
                not exist.
        """
        if api_keys is None and isinstance(env_path, (list, tuple)):
            api_keys, env_path = env_path, '.env'

        self.env_path = Path(env_path)
        self.current_key_index = 0
        self.failed_keys = set()

        if api_keys is not None:
            self.api_keys = list(api_keys)
        else:
            if not self.env_path.exists():
                raise FileNotFoundError(f"{self.env_path.absolute()} not found")
            self.api_keys = self.get_api_key_auto()

    def get_api_key_auto(self):
        """Load ``self.env_path`` and return the numbered API keys it defines."""
        load_dotenv(self.env_path)
        api_keys = []
        i = 1

        while True:
            key = os.getenv(f"YOUTUBE_API_KEY_{i}")
            if key is None:
                break
            api_keys.append(key)
            i += 1

        print(f"Using {len(api_keys)} API Keys")
        return api_keys

    @staticmethod
    def _seconds_until_quota_reset():
        """Return whole seconds until 00:00:01 Pacific Time tomorrow, plus one."""
        pacific = tz.gettz("America/Los_Angeles")
        now_pacific = datetime.now(pacific)
        next_reset = (now_pacific + timedelta(days=1)).replace(
            hour=0, minute=0, second=1, microsecond=0
        )
        # Subtract in UTC so a DST change between now and the reset is counted.
        utc = tz.tzutc()
        remaining = next_reset.astimezone(utc) - now_pacific.astimezone(utc)
        return int(remaining.total_seconds()) + 1

    def get_next_active_key(self, api_keys=None):
        """Return ``(key, index)`` for the next key not marked as failed.

        Args:
            api_keys: Key list to rotate over; defaults to the manager's own
                keys.

        Returns:
            A tuple of the key string and its zero-based index in the list.

        Raises:
            ValueError: If there are no keys to choose from.
        """
        keys = self.api_keys if api_keys is None else api_keys
        if not keys:
            # Without this guard an empty list would sleep until the reset
            # and only then fail with an IndexError.
            raise ValueError("No API keys available")

        # Keep the cursor valid if a shorter list is supplied than last time.
        self.current_key_index %= len(keys)

        for _ in range(len(keys)):
            key = keys[self.current_key_index]
            if key not in self.failed_keys:
                return key, self.current_key_index

            self.current_key_index = (
                self.current_key_index + 1
            ) % len(keys)

        # Computed here, not once at import time, so the wait is never stale.
        sleep_time = self._seconds_until_quota_reset()
        print(
            f"Reached Quota Limit of All Keys, reset after {sleep_time} seconds"
        )
        time.sleep(sleep_time)
        self.failed_keys.clear()
        self.current_key_index = 0
        return keys[0], 0

    def mark_key_as_failed(self, key):
        """Record ``key`` as exhausted so rotation skips it until the next reset."""
        self.failed_keys.add(key)


# Shared key manager for the scraping cells, built from the API_KEYS list loaded earlier.
key_manager = APIKeyManager(API_KEYS)

In [5]:
def scrape_with_auto_keys():  # Scrape auto load and change API Keys
    """Unfinished stub: sets up empty local accumulators and returns None."""
    global response
    data = []
    page_count = 0

### Build the query parameters for the YouTube Search API

In [6]:
def get_params(key, keyword,
               type = 'video',
               max_result = None):
    """Build the query parameters for a YouTube ``search.list`` request.

    Args:
        key: API key sent as the ``key`` parameter.
        keyword: Search term sent as ``q``.
        type: Resource type to search for: 'video', 'playlist' or 'channel'.
        max_result: Optional page size (0-50) sent as ``maxResults``. When
            omitted, the parameter is not sent and the API default applies.

    Returns:
        A dict of query parameters.

    Raises:
        TypeError: If ``type`` is not a string or ``max_result`` is not an int.
        ValueError: If ``type`` is not an accepted value or ``max_result`` is
            outside 0-50.
    """
    if not isinstance(type, str):
        raise TypeError('Input video|playlist|channel')
    if type not in ('video', 'playlist', 'channel'):
        raise ValueError("Input video|playlist|channel")

    params = {
        'part': 'snippet',
        'q': keyword,
        'type': type,
        'key': key}

    if max_result is not None:
        # bool is a subclass of int; reject it so True is not sent as 1.
        if isinstance(max_result, bool) or not isinstance(max_result, int):
            raise TypeError('Input number 0 - 50 only')
        if not 0 <= max_result <= 50:
            raise ValueError('Input number 0 - 50 only')
        params['maxResults'] = max_result

    return params

### Scrape each results page and save it to a JSON file

In [None]:
def youtube_search(query, api_manager, max_results=None):
    """Page through YouTube search results for ``query`` and save each page as JSON.

    Every page is written to ``data/search-<query>-<timestamp>/``. The first
    page is named ``<query>.json``; later pages are named
    ``<query>-<pageToken>.json``.

    Args:
        query: Search term.
        api_manager: Object providing ``get_next_active_key()`` returning
            ``(key, index)`` and ``mark_key_as_failed(key)``.
        max_results: Optional page size sent as ``maxResults``. When omitted,
            the parameter is not sent. Each page is a separate request, so
            larger pages mean fewer requests.

    Returns:
        Path of the folder the page files were written to.

    Raises:
        requests.exceptions.RequestException: If a request fails or times out.
    """
    BASE_URL = "https://www.googleapis.com"
    ENDPOINT_AUTH = "/youtube/v3/search"
    REQUEST_TIMEOUT = 30  # seconds; without it a stalled connection hangs forever
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

    # Keep path separators and other file-name-hostile characters out of paths.
    safe_query = "".join("_" if ch in '\\/:*?"<>|' else ch for ch in query)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    folder_name = os.path.join("data", f'search-{safe_query}-{timestamp}')
    os.makedirs(folder_name, exist_ok=True)  # also creates "data" if missing

    # Only updated after a successful page, so a 403 retry re-requests the same page.
    nextpage = None

    while True:
        current_key, key_idx = api_manager.get_next_active_key()
        params = get_params(keyword=query, key=current_key, type='video')
        if max_results is not None:
            params["maxResults"] = max_results
        print(f"Getting page {nextpage}")

        if nextpage is None:
            filename = f'{safe_query}.json'
        else:
            params["pageToken"] = nextpage
            filename = f'{safe_query}-{nextpage}.json'

        r = requests.get(
            f"{BASE_URL}{ENDPOINT_AUTH}",
            params=params,
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        )

        if r.status_code == 200:
            print(f"Code 200 - Using Quota for key {key_idx+1}")
            response = r.json()

            file_path = os.path.join(folder_name, filename)
            with open(file_path, "w", encoding="utf-8") as file:
                json.dump(response, file, ensure_ascii=False, sort_keys=True, indent=2)

            print(f"Saved Json file page {filename}")
            nextpage = response.get("nextPageToken", None)
            if nextpage is None:
                print("Collected All Data")
                break

        elif r.status_code == 403:
            # NOTE(review): every 403 is treated as an exhausted quota; other
            # causes of a 403 are not told apart here.
            print(
                f"Key {key_idx+1} reached limit quota, change to next key"
            )
            # Use the manager that was passed in, not a module-level global.
            api_manager.mark_key_as_failed(current_key)
            continue

        else:
            print(f"code {r.status_code}")
            break
    return folder_name


# youtube_search returns only the output folder, so bind a single name.
folder = youtube_search('Badminton AU', key_manager)

Getting page None
Code 200 - Using Quota for key 1
Saved Json file to page before page CAUQAA
Getting page CAUQAA
Code 200 - Using Quota for key 1
Saved Json file to page before page CAoQAA
Getting page CAoQAA
Code 200 - Using Quota for key 1
Saved Json file to page before page CA8QAA
Getting page CA8QAA
Code 200 - Using Quota for key 1
Saved Json file to page before page CBQQAA
Getting page CBQQAA
Code 200 - Using Quota for key 1
Saved Json file to page before page CBkQAA
Getting page CBkQAA
Code 200 - Using Quota for key 1
Saved Json file to page before page CB4QAA
Getting page CB4QAA
Code 200 - Using Quota for key 1
Saved Json file to page before page CCMQAA
Getting page CCMQAA
Code 200 - Using Quota for key 1
Saved Json file to page before page CCgQAA
Getting page CCgQAA
Code 200 - Using Quota for key 1
Saved Json file to page before page CC0QAA
Getting page CC0QAA
Code 200 - Using Quota for key 1
Saved Json file to page before page CDIQAA
Getting page CDIQAA
Code 200 - Using Quota

: 

: 

## Wrangling

### Convert all result JSON files into a DataFrame

In [None]:
def process_jsonl (data):
    """Flatten a JSON Lines file of search responses into a DataFrame.

    Each non-blank line is parsed as one JSON document whose ``items`` list is
    flattened to one row per item. Lines and items that cannot be processed
    are reported with ``print`` and skipped.

    Args:
        data: Path of the ``.jsonl`` file to read.

    Returns:
        A DataFrame with columns ``channel_id``, ``channel_title``,
        ``video_id``, ``title`` and ``publish_time`` (empty if nothing was
        usable).
    """
    columns = ['channel_id', 'channel_title', 'video_id', 'title', 'publish_time']
    all_data = []

    with open(data, 'r', encoding='utf-8') as file:
        for line_num, line in enumerate(file, 1):
            line = line.strip()
            if not line:
                # Blank lines are not records; don't report them as bad JSON.
                continue

            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f'Check {line_num}: {e}')
                continue

            if not (isinstance(record, dict) and isinstance(record.get('items'), list)):
                print(f"check line {line_num}")
                continue

            for item in record['items']:
                try:
                    snippet = item['snippet']
                    flat_record = {
                        'channel_id': snippet.get('channelId'),
                        'channel_title': snippet.get('channelTitle'),
                        'video_id': item['id']['videoId'],
                        'title': snippet.get('title'),
                        'publish_time': snippet.get('publishTime'),
                    }
                except (KeyError, TypeError, AttributeError):
                    # Item lacks the expected snippet / id.videoId structure.
                    print (f'check {item}')
                    continue
                all_data.append(flat_record)

    # Explicit columns keep the schema stable when no rows were collected.
    return pd.DataFrame(all_data, columns=columns)

# Flatten the JSON Lines export into a DataFrame and display it.
# NOTE(review): no visible cell in this notebook writes 'data.jsonl' — presumably a legacy input; confirm the file exists before Run All.
df = process_jsonl('data.jsonl')
df

In [11]:
def process_json (path, root='data'):
    """Flatten saved search-response JSON files into a DataFrame.

    Each file is parsed as one JSON document whose ``items`` list is flattened
    to one row per item. Files and items that cannot be processed are
    reported with ``print`` and skipped.

    Args:
        path: Iterable of file paths, or a single path.
        root: Unused; kept so existing calls keep working.

    Returns:
        A DataFrame with columns ``channel_id``, ``channel_title``,
        ``video_id``, ``title`` and ``publish_time`` (empty if nothing was
        usable).
    """
    columns = ['channel_id', 'channel_title', 'video_id', 'title', 'publish_time']

    # A lone path would otherwise be iterated character by character.
    if isinstance(path, (str, os.PathLike)):
        path = [path]

    all_data = []

    for p in path:
        try:
            with open(p, 'r', encoding='utf-8') as file:
                record = json.load(file)
        except json.JSONDecodeError as e:
            print(f'Check {p}: {e}')
            # Skip the file; falling through would reuse the previous record.
            continue
        except Exception as e:
            print(f'check {p}: {e}')
            continue

        if not (isinstance(record, dict) and isinstance(record.get('items'), list)):
            print(f"check file (no 'items'): {p}")
            continue

        for item in record['items']:
            try:
                snippet = item['snippet']
                flat_record = {
                    'channel_id': snippet.get('channelId'),
                    'channel_title': snippet.get('channelTitle'),
                    'video_id': item['id']['videoId'],
                    'title': snippet.get('title'),
                    'publish_time': snippet.get('publishTime'),
                }
            except (KeyError, TypeError, AttributeError):
                # Item lacks the expected snippet / id.videoId structure.
                print (f'check {item}')
                continue
            all_data.append(flat_record)

    # Explicit columns keep the schema stable when no rows were collected.
    return pd.DataFrame(all_data, columns=columns)

In [13]:
# Flatten every saved result page found under data/search-Badminton AU-*/ and display the frame.
# NOTE(review): the glob matches all timestamped run folders, so repeated runs presumably yield duplicate rows — confirm whether de-duplication is wanted.
df = process_json (glob(str(Path('data') / "search-Badminton AU-*" / "*.json")), 'data')
df

Unnamed: 0,channel_id,channel_title,video_id,title,publish_time
0,UC3f7HyZbkVVhGOkJpVCx41w,Howard Shu,0KrINd9YDsQ,Smashing with the Yonex Nanoflare 1000Z 💥 #rac...,2023-06-11T03:40:49Z
1,UCkmf_8nuJgEY61EhXud0GKw,SportsScreen,UHaAHp9bQEU,Australia vs India Men&#39;s Singles | LIN VS...,2022-11-10T13:20:37Z
2,UCw-ek__q-Uah6GyHevx9BOQ,Sports Insights,METJbTlKRcw,Australian 2024 Corporate Games Badminton Mens...,2024-11-18T18:46:28Z
3,UCnVfEREu3C1_CV_lg3iRAog,Aylex Badminton Academy,Ie9sSTMbPRA,This is why we play badminton #aylex #badminto...,2025-01-24T19:29:31Z
4,UCgzigI1M0de6aKrTE-u9FWA,Badminton Victoria,9wkhZn5t_W0,Shuttle Time Coaching Courses - Badminton Vict...,2019-06-25T05:44:01Z
...,...,...,...,...,...
495,UCa_ENgTDV__Qja6ZdLRaHng,An Badminton,OV8ag8lOomU,How we actually play badminton 🏸,2025-06-24T03:44:53Z
496,UCWZS4Nevk6PXM6EXdHdRjOg,Badminton Click,NVfgCSCBYJE,"Badminton Click | About Us (Melbourne, Australia)",2024-02-07T07:50:05Z
497,UCHAGOPM0V7_j5ZMlSyFKj-w,Kamal’s sports Masterclass,e5tcyTuompc,Quick Badminton Warm-Up | Boost Your Game &amp...,2025-09-11T16:08:17Z
498,UCWOsv2AE1pfd9xHiEiJvLFQ,AFM Badminton,bxnbLp-Q8iU,Victor P9200 (Hang) Badminton Shoes {Unboxing},2024-08-21T10:54:43Z
