In [1]:
from googleapiclient.discovery import build
from pytube import YouTube
import pandas as pd
import re
import os

# YouTube API key
API_KEY = "AIzaSyDO5vUUjJCBX1q1VKsqGBOYoCzlgARgYxs"

# Initialize the YouTube API client
youtube = build('youtube', 'v3', developerKey=API_KEY)

# Mapping of YouTube category IDs to category names
CATEGORY_MAPPING = {
    "1": "Film & Animation",
    "2": "Autos & Vehicles",
    "10": "Music",
    "15": "Pets & Animals",
    "17": "Sports",
    "19": "Travel & Events",
    "20": "Gaming",
    "21": "People & Blogs",
    "22": "Comedy",
    "23": "Entertainment",
    "24": "News & Politics",
    "25": "Howto & Style",
    "26": "Education",
    "27": "Science & Technology",
    "28": "Nonprofits & Activism",
    "29": "Movies",
    "30": "Shows",
    "31": "Trailers"
}

def get_videos(genre, max_results=500):
    """Fetch top videos for a specific genre."""
    video_ids = []
    next_page_token = None

    while len(video_ids) < max_results:
        request = youtube.search().list(
            part="id",
            q=genre,
            type="video",
            maxResults=min(50, max_results - len(video_ids)),
            pageToken=next_page_token
        )
        response = request.execute()

        # Collect video IDs
        for item in response.get('items', []):
            video_ids.append(item['id']['videoId'])

        # Check for more pages
        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break

    return video_ids

def get_video_details(video_ids):
    """Fetch video details using video IDs."""
    details = []
    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics,topicDetails,recordingDetails",
            id=",".join(video_ids[i:i+50])
        )
        response = request.execute()

        for item in response.get('items', []):
            # Get the category name from the mapping
            category_name = CATEGORY_MAPPING.get(item['snippet']['categoryId'], "Unknown")
            
            details.append({
                "Video URL": f"https://www.youtube.com/watch?v={item['id']}",
                "Title": item['snippet']['title'],
                "Description": item['snippet'].get('description', ""),
                "Channel Title": item['snippet']['channelTitle'],
                "Keyword Tags": ", ".join(item['snippet'].get('tags', [])),
                "YouTube Video Category": category_name,
                "Topic Details": item.get('topicDetails', {}).get('topicCategories', []),
                "Video Published at": item['snippet']['publishedAt'],
                "Video Duration": item['contentDetails']['duration'],
                "View Count": item['statistics'].get('viewCount', 0),
                "Comment Count": item['statistics'].get('commentCount', 0),
                "Captions Available": item['contentDetails'].get('caption', "false") == "true",
                "Location of Recording": item.get('recordingDetails', {}).get('location', {})
            })
    return details

def download_captions(video_url, save_path):
    """Download captions for a video."""
    try:
        yt = YouTube(video_url)
        caption = yt.captions.get_by_language_code('en')
        if caption:
            caption_text = caption.generate_srt_captions()
            # Save the captions to a file
            video_id = re.search(r"v=(.+)", video_url).group(1)
            file_path = os.path.join(save_path, f"{video_id}_captions.srt")
            with open(file_path, "w", encoding="utf-8") as file:
                file.write(caption_text)
            return caption_text
    except Exception as e:
        print(f"Error downloading captions for {video_url}: {e}")
        return None

def main():
    # Dynamic input for genre
    genre = input("Enter the genre to search for (e.g., 'Data Analyst'): ")
    max_results = 500

    print("Fetching video IDs...")
    video_ids = get_videos(genre, max_results)

    print("Fetching video details...")
    video_details = get_video_details(video_ids)

    # Prepare folder for captions
    captions_folder = "captions"
    if not os.path.exists(captions_folder):
        os.makedirs(captions_folder)

    print("Downloading captions...")
    for video in video_details:
        if video["Captions Available"]:
            video["Caption Text"] = download_captions(video["Video URL"], captions_folder)
        else:
            video["Caption Text"] = None

    print("Saving data to CSV...")
    df = pd.DataFrame(video_details)
    df.to_csv(f"{genre}_videos.csv", index=False, encoding='utf-8')

    print(f"Data saved to '{genre}_videos.csv'. Captions saved in '{captions_folder}'.")

if __name__ == "__main__":
    main()


Enter the genre to search for (e.g., 'Data Analyst'):  gym


Fetching video IDs...
Fetching video details...
Downloading captions...


  caption = yt.captions.get_by_language_code('en')
  caption = yt.captions.get_by_language_code('en')
  caption = yt.captions.get_by_language_code('en')
  caption = yt.captions.get_by_language_code('en')
  caption = yt.captions.get_by_language_code('en')
  caption = yt.captions.get_by_language_code('en')
  caption = yt.captions.get_by_language_code('en')
  caption = yt.captions.get_by_language_code('en')
  caption = yt.captions.get_by_language_code('en')
  caption = yt.captions.get_by_language_code('en')
  caption = yt.captions.get_by_language_code('en')
  caption = yt.captions.get_by_language_code('en')
  caption = yt.captions.get_by_language_code('en')
  caption = yt.captions.get_by_language_code('en')
  caption = yt.captions.get_by_language_code('en')
  caption = yt.captions.get_by_language_code('en')
  caption = yt.captions.get_by_language_code('en')
  caption = yt.captions.get_by_language_code('en')
  caption = yt.captions.get_by_language_code('en')


Saving data to CSV...
Data saved to 'gym_videos.csv'. Captions saved in 'captions'.


  caption = yt.captions.get_by_language_code('en')


In [4]:
from googleapiclient.discovery import build
from pytube import YouTube
import pandas as pd
import re
import os

# YouTube API key
API_KEY = "AIzaSyBC5VXY0Nh5-ncCJmzsfYVG5jF9fXT3muI"

# Initialize the YouTube API client
youtube = build('youtube', 'v3', developerKey=API_KEY)

# Mapping of YouTube category IDs to category names
CATEGORY_MAPPING = {
    "1": "Film & Animation",
    "2": "Autos & Vehicles",
    "10": "Music",
    "15": "Pets & Animals",
    "17": "Sports",
    "19": "Travel & Events",
    "20": "Gaming",
    "21": "People & Blogs",
    "22": "Comedy",
    "23": "Entertainment",
    "24": "News & Politics",
    "25": "Howto & Style",
    "26": "Education",
    "27": "Science & Technology",
    "28": "Nonprofits & Activism",
    "29": "Movies",
    "30": "Shows",
    "31": "Trailers"
}

def get_videos(genre, max_results=500):
    """Fetch top videos for a specific genre."""
    video_ids = []
    next_page_token = None

    while len(video_ids) < max_results:
        request = youtube.search().list(
            part="id",
            q=genre,
            type="video",
            maxResults=min(50, max_results - len(video_ids)),
            pageToken=next_page_token
        )
        response = request.execute()

        # Collect video IDs
        for item in response.get('items', []):
            video_ids.append(item['id']['videoId'])

        # Check for more pages
        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break

    return video_ids

def get_video_details(video_ids):
    """Fetch video details using video IDs."""
    details = []
    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics,topicDetails,recordingDetails",
            id=",".join(video_ids[i:i+50])
        )
        response = request.execute()

        for item in response.get('items', []):
            # Get the category name from the mapping
            category_name = CATEGORY_MAPPING.get(item['snippet']['categoryId'], "Unknown")
            
            # Extract keywords from the description using regex
            description = item['snippet'].get('description', "")
            keywords = re.findall(r'\b\w+\b', description)  # Extract words as keywords
            
            # Extract caption text
            caption_text = description  # Assuming the caption text is stored in the description field
            
            # Determine if captions are available based on Caption Text
            captions_available = "true" if caption_text else "false"
            
            details.append({
                "Video URL": f"https://www.youtube.com/watch?v={item['id']}",
                "Title": item['snippet']['title'],
                "Description": description,
                "Channel Title": item['snippet']['channelTitle'],
                "Keyword Tags": ", ".join(keywords),
                "YouTube Video Category": category_name,
                "Topic Details": item.get('topicDetails', {}).get('topicCategories', []),
                "Video Published at": item['snippet']['publishedAt'],
                "Video Duration": item['contentDetails']['duration'],
                "View Count": item['statistics'].get('viewCount', 0),
                "Comment Count": item['statistics'].get('commentCount', 0),
                "Captions Available": captions_available == "true",  # Updated Captions Available logic
                "Caption Text": caption_text,  # Caption Text now based on description
                "Location of Recording": item.get('recordingDetails', {}).get('location', {})
            })
    return details

def main():
    # Dynamic input for genre
    genre = input("Enter the genre to search for (e.g., 'Data Analyst'): ")
    max_results = 500

    print("Fetching video IDs...")
    video_ids = get_videos(genre, max_results)

    print("Fetching video details...")
    video_details = get_video_details(video_ids)

    print("Saving data to CSV...")
    df = pd.DataFrame(video_details)
    df.to_csv(f"{genre}_videos.csv", index=False, encoding='utf-8')

    print(f"Data saved to '{genre}_videos.csv'.")

if __name__ == "__main__":
    main()


Enter the genre to search for (e.g., 'Data Analyst'):  data analytics


Fetching video IDs...
Fetching video details...
Saving data to CSV...
Data saved to 'data analytics_videos.csv'.
