In [1]:
import pandas as pd
import numpy as np
import requests
import base64
from datetime import datetime, timedelta
from dateutil import tz
import sys
import time
import json
import os
from dotenv import load_dotenv
from pandas import json_normalize
from glob import glob
from pathlib import Path

In [55]:
class YoutubeScraper:
    BASE_URL="https://www.googleapis.com"

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}


    def search(self,
               query: str,
               type_: str = 'video',
               max_result: int= 50,
               ):
        
        if not isinstance(max_result,int):
            raise TypeError('Input number 1 - 50 only')
        if max_result > 50 or max_result < 1:
            raise ValueError('Input number 1 - 50 only')
    
        if not isinstance(type_,str):
            raise TypeError('Input video|playlist|channel')
        if type_ != 'video' and type_ !='playlist' and type_ !='channel':
            raise ValueError("Input video|playlist|channel")
        
        ENDPOINT_AUTH = "/youtube/v3/search"
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        folder_name = os.path.join("data", f"search-{query}-{timestamp}")
        os.makedirs(folder_name, exist_ok=True)
        response = {}

        while True:
            current_key = self.api_key
            params = {'part':'snippet',
                      'q':query,
                      'type':type_,
                      "maxResults": max_result,
                      'key' : current_key
                      }
            nextpage = response.get("nextPageToken", None)
            print(f"Getting page {nextpage}")

            if nextpage is None:
                r = requests.get(
                    f"{self.BASE_URL}{ENDPOINT_AUTH}",
                    params=params,
                    headers=self.header,
                )
                filename = f'{query}.json'
            else:
                params["pageToken"] = nextpage
                r = requests.get(
                    f"{self.BASE_URL}{ENDPOINT_AUTH}",
                    params=params,
                    headers=self.header,
                )
                filename=f'{query}-{nextpage}.json'
            if r.status_code == 200:
                print(f"Code 200 - Using Quota")
                response = r.json()
                nextpage = response.get("nextPageToken", None)
                file_path = os.path.join(folder_name, filename)

                with open(file_path, "w", encoding="utf-8") as file:
                    json.dump(response, file, ensure_ascii=False, sort_keys=True, indent=2)

                print(f"Saved Json file page {filename}")
                if nextpage is None:
                    print("Collected All Search")
                    break

            elif r.status_code == 403:
                print(
                    f"Key reached limit quota"
                )
                continue

            else:
                print(f"code {r.status_code}")
                break
        return folder_name
    

    def video(self, id): #dataframe: pd.DataFrame = vy):
        ENDPOINT_AUTH = '/youtube/v3/videos'
        folder_name = os.path.join("data", f"video")
        id = 'XDhmlW6wfmY' #dataframe['video_id'].tolist()
        response = {}

        while True:
            current_key = self.api_key
            params = {
                'part':'statistics',
                'id':id,
                'maxResults':1,
                'key':current_key,
                'chart':'mostPopular',
                'regionCode':'AU'
                }
            nextpage = response.get("nextPageToken", None)
            print(f"Getting page video ID {nextpage} information")

            if nextpage is None:
                r = requests.get(
                    f"{self.BASE_URL}{ENDPOINT_AUTH}",
                    params=params,
                    headers=self.header,
                )
                filename = f'{id}.json'
            else:
                params["pageToken"] = nextpage
                r = requests.get(
                    f"{self.BASE_URL}{ENDPOINT_AUTH}",
                    params=params,
                    headers=self.header,
                )
                filename=f'{id}-{nextpage}.json'
            if r.status_code == 200:
                print(f"Code 200 - Using Quota")
                response = r.json()
                nextpage = response.get("nextPageToken", None)
                file_path = os.path.join(folder_name, filename)

                with open(file_path, "w", encoding="utf-8") as file:
                    json.dump(response, file, ensure_ascii=False, sort_keys=True, indent=2)

                print(f"Saved Json file page {filename}")
                if nextpage is None:
                    print("Collected Video ID")
                    break

            elif r.status_code == 403:
                print(
                    f"Key reached limit quota"
                )
                continue

            else:
                print(f"code {r.status_code}")
                break
        return folder_name   

In [None]:
YoutubeScraper('AIzaSyBMIKqHzQlu8MId-9ocqSZO-MN0NVedSuM').search(query="Visa 500",type_='video',max_result=50)

TypeError: YoutubeScraper.__init__() missing 1 required positional argument: 'path'

In [None]:
def process_json (path: Path):
    all_data =[]

    for p in path:
        try:
            with open(p, 'r', encoding='utf-8') as file:
                record = json.load(file)
        except json.JSONDecodeError as e: 
            print(f'Check {p}: {e}')
        except Exception as e:
            print(f'check {p}: {e}')
            continue

        try:
            if 'items' in record and isinstance(record['items'], list):
                for item in record['items']:
                    try:
                        snippet = item['snippet']
                        video_id = item['id']['videoId']
                        
                        flat_record = {
                            'channel_id': snippet.get('channelId'),
                            'channel_title': snippet.get('channelTitle'),
                            'video_id': video_id,
                            'title': snippet.get('title'),
                            'publish_time': snippet.get('publishTime'),
                        }
                        all_data.append(flat_record)
                    except:
                        print (f'check {item}') 
            else:
                print(f"check file (no 'items'): {p}")
        except:
            print(f'check {p}: {e}')

    return pd.DataFrame(all_data)

In [54]:
vy = df = process_json (glob(str(Path('data') / "search-Visa 500-*" / "*.json")))
vy


Unnamed: 0,channel_id,channel_title,video_id,title,publish_time
0,UCJDTuY6qTMY1SoRd6gliPPQ,VISA Connection,XDhmlW6wfmY,How to Get Scholarships to Study in Australia ...,2025-06-25T08:29:09Z
1,UCIrXO3aq2TP-6XJbrSyLU1w,Think Higher Consultants,vAFQSL5DF04,Client Reviews | Subclass 500 Student Visa | 4...,2022-06-09T11:42:23Z
2,UCIyM8_Th3NJEUQWNY8Paa-w,Migration Centre of Australia,cDAi-esDKew,8 Step Instructions for a Student Dependent Vi...,2018-07-16T06:25:16Z
3,UCYY4WKKIo12sPuXqDirtKKA,Varsha's Australian Diary,2UwufXjXs_8,Student visa requirements for Australia | Subc...,2021-07-28T07:17:39Z
4,UCIyM8_Th3NJEUQWNY8Paa-w,Migration Centre of Australia,RxvDN4fUwQ0,🇦🇺 STUDENT VISA WORK RULES EXPLAINED (2025) | ...,2025-02-28T12:15:03Z
...,...,...,...,...,...
598,UCAHrTblEqBQ20V2arcrRp2w,Law and Visas,599uZDotSqw,How to Apply for an Australian Student Visa in...,2025-06-11T09:39:42Z
599,UCbL-XsxlqpuKzXaVBFeSDdA,Australia Visa Immigration Discussion,d6pU7ev5fGo,Australian Student Visa Subclass 500 - How to ...,2022-09-10T12:40:34Z
600,UCN2Gg6C_8azK0pjxzAhINSg,Boring videos,vK0HtKYZ2hY,What to Do After Your Student Visa 500 Australia,2024-07-15T10:03:52Z
601,UCIrXO3aq2TP-6XJbrSyLU1w,Think Higher Consultants,--JBtLL0WGs,Australia Study Visa Documents Checklist | Stu...,2024-01-22T09:36:39Z


In [59]:
YoutubeScraper('AIzaSyBMIKqHzQlu8MId-9ocqSZO-MN0NVedSuM').video('XDhmlW6wfmY')


Getting page video ID None information
code 400


'data\\video'