In [2]:
from googleapiclient.discovery import build
import json

# Read the credentials file and parse it as JSON
with open('client_secrets.json', 'r') as secrets_file:
    secrets = json.loads(secrets_file.read())

# Pull out the API key and build the YouTube Data API v3 client with it
api_key = secrets['api_key']
youtube = build(serviceName='youtube', version='v3', developerKey=api_key)

In [3]:
import pandas as pd

def get_video_ids(query, max_results=50):
    """Search YouTube for Shorts matching ``query`` and collect their video IDs.

    The search string sent to the API is ``query`` with " #shorts" appended.
    Result pages are followed through ``nextPageToken`` until ``max_results``
    IDs have been collected or the API reports no further page.

    Uses the module-level ``youtube`` API client.

    Args:
        query: Search term. Stored unmodified in the ``Query`` column.
        max_results: Upper bound on the number of video IDs to collect
            (default 50). For values <= 0 no request is made.

    Returns:
        A pandas DataFrame with exactly one row: ``Query`` holds the original
        search term and ``Video_IDs`` holds the list of collected IDs.
    """
    page_size_limit = 50  # the search endpoint returns at most 50 items per request

    video_ids = []
    next_page_token = None

    # Add a keyword to the query to search specifically for Shorts
    modified_query = query + " #shorts"

    while len(video_ids) < max_results:
        # Only ask for as many results as are still needed on this page
        remaining = max_results - len(video_ids)
        request = youtube.search().list(
            part="id",
            q=modified_query,
            type="video",
            maxResults=min(page_size_limit, remaining),
            pageToken=next_page_token,
        )
        response = request.execute()

        # Extract video IDs; skip any item that carries no videoId instead of
        # aborting the whole search with a KeyError
        for item in response.get('items', []):
            video_id = item.get('id', {}).get('videoId')
            if not video_id:
                continue
            video_ids.append(video_id)
            if len(video_ids) >= max_results:
                break

        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break

    # One row per query; the whole ID list lives in a single cell
    df = pd.DataFrame({'Query': [query], 'Video_IDs': [video_ids]})
    return df


#### Use for Single Keyword

In [28]:
# Example use
#video_ids = get_video_ids("car")  # Replace with your desired search query and optional max results (standard is 50)
#print(video_ids)

#### Read CSV of search Terms

##### Keep in mind to exclude the ones we already have

Example Query for GPT: *Provide 50 different search terms for obtaining YouTube Shorts via a Python script that uses the YouTube search API. Make sure to cover a diverse list of topics and exclude {Terms already in the video_ids.csv}. Output a CSV.* 

In [9]:
# Search terms to process (the loop below reads the 'Search Terms' column)
search_terms_df = pd.read_csv(filepath_or_buffer='youtube_shorts_search_terms.csv')

# Start from an empty frame; per-term results are appended to it later
combined_results_df = pd.DataFrame()

##### Loop over the Search Terms and get Video IDs for every Term

In [10]:
from tqdm import tqdm

# Collect one result frame per search term and concatenate once at the end,
# rather than re-copying the growing DataFrame on every iteration.
result_frames = []
for search_term in tqdm(search_terms_df['Search Terms'], desc="Processing Search Terms"):
    result_df = get_video_ids(search_term)
    result_frames.append(result_df)

# Keep whatever combined_results_df already holds as the first element so the
# outcome matches the previous incremental concatenation.
combined_results_df = pd.concat([combined_results_df, *result_frames], ignore_index=True)

Processing Search Terms: 100%|██████████| 40/40 [00:12<00:00,  3.09it/s]


In [11]:
# Preview the first five collected rows
combined_results_df.head(n=5)

Unnamed: 0,Query,Video_IDs
0,Quick Recipe,"[9-8LUW5tqBg, WobUAaSEiB0, hRZHgR8cU3k, d2jlcF..."
1,Fitness Tips,"[_YjNSC7Lq00, 5rdn9C9cnlE, Kj00H4-5b2E, JV1ZY4..."
2,DIY Crafts,"[sBYiuaBCSkA, 9GDsH1EdQqw, dvHMHCRGudM, oOzxMQ..."
3,Travel Guide,"[EIRSomyJHyo, i91Sg2YbYY8, g8YVCUB5klI, e9O4yK..."
4,Fashion Haul,"[PC7OGIiI5u4, lMDrA8v8m3s, 5NO3_uOeb1I, L1zBjS..."


##### Merge with Existing Video IDs to keep track of which Search Terms we already obtained 

In [12]:
# Load the queries recorded by earlier runs and append this run's results
curr_video_ids = pd.read_csv('used_search_queries.csv')
df_merged = (
    pd.concat([curr_video_ids, combined_results_df], ignore_index=True)
    # Re-running this cell and the save cell would otherwise append the same
    # queries again; keep only the most recent row for each search term.
    .drop_duplicates(subset='Query', keep='last', ignore_index=True)
)
df_merged.head()

Unnamed: 0,Query,Video_IDs
0,funny,"['P_1-Cpo1P0A', 'FRdgTgKxnx4', '7dUY12c4Xwo', ..."
1,car,"['5PH0jXruu5E', 'HVhF_ty_ZT0', '6tZn6Jgha3w', ..."
2,Quick Recipe,"[9-8LUW5tqBg, WobUAaSEiB0, hRZHgR8cU3k, d2jlcF..."
3,Fitness Tips,"[_YjNSC7Lq00, 5rdn9C9cnlE, Kj00H4-5b2E, JV1ZY4..."
4,DIY Crafts,"[sBYiuaBCSkA, 9GDsH1EdQqw, dvHMHCRGudM, oOzxMQ..."


In [13]:
# Write the merged query history back to its CSV file, without the index column
df_merged.to_csv(path_or_buf='used_search_queries.csv', index=False)

##### Save new Results separately to use in YoutubeShortsExtractor and then HuggingChat.ipynb

In [14]:
# Write only this run's results to their own CSV file, without the index column
combined_results_df.to_csv(path_or_buf='youtube_shorts_video_ids_new.csv', index=False)