In [1]:
pip install pytube youtube-transcript-api

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install google-api-python-client




### V1: Scraping the YouTuber, video URL, and transcript

In [1]:
from pytube import YouTube
from youtube_transcript_api import YouTubeTranscriptApi

def get_video_transcript(url):
    """
    Look up a YouTube video and return its author together with its transcript.

    The video is resolved with pytube's ``YouTube`` object, the caption
    entries are requested through ``YouTubeTranscriptApi.get_transcript``,
    and the ``'text'`` field of every entry is joined with single spaces.

    Args:
        url (str): The URL of the YouTube video.

    Returns:
        dict: Keys ``"YouTuber"`` (``video.author``), ``"Video URL"`` (the
              ``url`` argument, unchanged) and ``"Transcript"`` (joined text).
              ``None`` when any step raises; the error is printed first.
    """
    try:
        yt_video = YouTube(url)
        segments = YouTubeTranscriptApi.get_transcript(yt_video.video_id)

        # Flatten the caption segments into one space-separated string.
        transcript_text = ' '.join(segment['text'] for segment in segments)

        # The author lookup stays inside the try so its failures are reported
        # the same way as transcript failures.
        scraped = {}
        scraped["YouTuber"] = yt_video.author
        scraped["Video URL"] = url
        scraped["Transcript"] = transcript_text
    except Exception as err:
        print(f"An error occurred: {err}")
        return None

    return scraped

# Ask which video should be processed.
video_url = input("Enter the YouTube video URL: ")

# Scrape it; a falsy result means get_video_transcript reported an error.
video_data = get_video_transcript(video_url)

# Report each scraped field on its own line, in a fixed order.
if video_data:
  for heading, field in (
      ("YouTuber:", 'YouTuber'),
      ("Video URL:", 'Video URL'),
      ("Transcript:", 'Transcript'),
  ):
    print(heading, video_data[field])

KeyboardInterrupt: Interrupted by user

### V2: Scraping the YouTuber, channel name, video URL, transcript, and comments

In [None]:
from pytube import YouTube
from youtube_transcript_api import YouTubeTranscriptApi
from googleapiclient.discovery import build
import os


# YouTube Data API v3 key.
# SECURITY: the key must not be committed in the notebook. It is read from the
# YOUTUBE_API_KEY environment variable, falling back to a hidden prompt so the
# notebook stays usable interactively.
# NOTE(review): the key that used to be hard-coded on this line is exposed in
# the notebook's history and should be revoked.
from getpass import getpass

api_key = os.environ.get("YOUTUBE_API_KEY") or getpass("YouTube Data API v3 key: ")
if not api_key:
    raise RuntimeError("A YouTube Data API v3 key is required to build the client.")

# Data API client; get_video_info() reads this module-level name.
youtube = build('youtube', 'v3', developerKey=api_key)

def get_video_info(url):
    """
    Gather the transcript, channel title and top-level comments of one video.

    Uses pytube's ``YouTube`` object for the video id, channel id and author,
    ``YouTubeTranscriptApi`` for the captions, and the module-level
    ``youtube`` Data API client for the channel title and a single page of
    comment threads (``maxResults=100``).

    Args:
        url (str): The URL of the YouTube video.

    Returns:
        dict: Keys ``"YouTuber"``, ``"Channel Name"``, ``"Video URL"``,
              ``"Transcript"`` and ``"Comments"`` (a list of strings).
              ``None`` when any step raises; the error is printed first.
    """
    try:
        yt_video = YouTube(url)
        vid = yt_video.video_id

        # Caption segments -> one space-separated string.
        segments = YouTubeTranscriptApi.get_transcript(vid)
        transcript_text = ' '.join(segment['text'] for segment in segments)

        # Channel title comes from the first item of the channels.list reply.
        channel_request = youtube.channels().list(
            part='snippet',
            id=yt_video.channel_id,
        )
        channel_items = channel_request.execute()['items']
        channel_title = channel_items[0]['snippet']['title']

        # One commentThreads page only; no pagination is attempted.
        threads_request = youtube.commentThreads().list(
            part='snippet',
            videoId=vid,
            textFormat='plainText',
            maxResults=100,
        )
        threads_page = threads_request.execute()
        comment_texts = [
            thread['snippet']['topLevelComment']['snippet']['textDisplay']
            for thread in threads_page['items']
        ]

        return {
            "YouTuber": yt_video.author,
            "Channel Name": channel_title,
            "Video URL": url,
            "Transcript": transcript_text,
            "Comments": comment_texts,
        }
    except Exception as exc:
        print(f"An error occurred: {exc}")
        return None

# Ask which video should be processed.
video_url = input("Enter the YouTube video URL: ")

# Scrape it; a falsy result means get_video_info reported an error.
video_data = get_video_info(video_url)

if video_data:
    # Scalar fields first, one per line, in a fixed order.
    for heading, field in (
        ("YouTuber:", 'YouTuber'),
        ("Channel Name:", 'Channel Name'),
        ("Video URL:", 'Video URL'),
        ("Transcript:", 'Transcript'),
    ):
        print(heading, video_data[field])

    # Then the comments as a dash-prefixed list.
    print("Comments:")
    for comment in video_data['Comments']:
        print("- ", comment)

In [None]:
from pytube import YouTube
from youtube_transcript_api import YouTubeTranscriptApi
from googleapiclient.discovery import build
import pandas as pd
# This cell calls os.* but its import list omits `os`; importing it here keeps
# the cell runnable on a fresh kernel without relying on an earlier cell.
import os
from getpass import getpass

# Show where the kernel started, then switch to the directory that will
# receive the CSV written at the end of this cell. The location can be
# overridden with the YOUTUBE_OUTPUT_DIR environment variable; the default is
# the path this notebook originally used.
print(os.getcwd())
desired_directory = os.environ.get("YOUTUBE_OUTPUT_DIR", r"C:\slmwork")
os.chdir(desired_directory)

# YouTube Data API v3 key.
# SECURITY: the key must not be committed in the notebook. It is read from the
# YOUTUBE_API_KEY environment variable, falling back to a hidden prompt.
# NOTE(review): the key that used to be hard-coded on this line is exposed in
# the notebook's history and should be revoked.
api_key = os.environ.get("YOUTUBE_API_KEY") or getpass("YouTube Data API v3 key: ")
if not api_key:
    raise RuntimeError("A YouTube Data API v3 key is required to build the client.")

# Data API client; get_video_info() reads this module-level name.
youtube = build('youtube', 'v3', developerKey=api_key)

def get_video_info(url):
    """
    Scrape one YouTube video: transcript, channel title and comments.

    The video is resolved with pytube's ``YouTube`` object, captions come
    from ``YouTubeTranscriptApi.get_transcript``, and the channel title and
    comment threads are requested through the module-level ``youtube``
    Data API client. Only one page of comment threads is requested
    (``maxResults=100``).

    Args:
        url (str): The URL of the YouTube video.

    Returns:
        dict: ``"YouTuber"``, ``"Channel Name"``, ``"Video URL"``,
              ``"Transcript"`` and ``"Comments"`` (list of comment strings),
              or ``None`` if any step raised (the error is printed).
    """
    try:
        clip = YouTube(url)
        clip_id = clip.video_id

        # Collect the text of every caption entry, then join with spaces.
        caption_entries = YouTubeTranscriptApi.get_transcript(clip_id)
        caption_texts = []
        for entry in caption_entries:
            caption_texts.append(entry['text'])
        transcript_text = ' '.join(caption_texts)

        # Channel title: first item of the channels.list response.
        channel_lookup = youtube.channels().list(part='snippet', id=clip.channel_id).execute()
        first_channel = channel_lookup['items'][0]
        channel_title = first_channel['snippet']['title']

        # Top-level comment text from a single commentThreads page.
        thread_page = youtube.commentThreads().list(
            part='snippet', videoId=clip_id, textFormat='plainText', maxResults=100
        ).execute()
        comment_texts = [
            thread['snippet']['topLevelComment']['snippet']['textDisplay']
            for thread in thread_page['items']
        ]

        # Built inside the try so a failing author lookup is handled too.
        info = {
            "YouTuber": clip.author,
            "Channel Name": channel_title,
            "Video URL": url,
            "Transcript": transcript_text,
            "Comments": comment_texts,
        }
    except Exception as error:
        print(f"An error occurred: {error}")
        info = None

    return info


# --- Main Execution ---

# Get YouTube video URLs from the user. Input is stripped so that " done "
# still ends the loop, and blank lines are ignored instead of being treated
# as URLs.
video_urls = []
while True:
    url = input("Enter a YouTube video URL (or type 'done' to finish): ").strip()
    if url.lower() == 'done':
        break
    if url:
        video_urls.append(url)

# Fetch every video first and keep only the successful results.
# get_video_info returns None (after printing the error) when scraping fails.
video_records = []
for video_url in video_urls:
    video_data = get_video_info(video_url)
    if video_data:
        video_records.append(video_data)

# Build the DataFrame once from all records rather than concatenating inside
# the loop, which re-copies the frame on every iteration. Passing `columns`
# keeps the header row present even when no video could be scraped.
all_video_data = pd.DataFrame(
    video_records,
    columns=["YouTuber", "Channel Name", "Video URL", "Transcript", "Comments"],
)

# Save the DataFrame to a CSV file (index=False omits the index column).
csv_filename = "youtube_video_data.csv"
all_video_data.to_csv(csv_filename, index=False)

print(f"Data saved to '{csv_filename}' successfully!")


# Further work: the results could be categorized as nano, micro, macro, etc.

In [None]:

from pytube import YouTube
from youtube_transcript_api import YouTubeTranscriptApi
from googleapiclient.discovery import build
import pandas as pd
import os

# YouTube Data API v3 key.
# SECURITY: the key must not be committed in the notebook. It is read from the
# YOUTUBE_API_KEY environment variable, falling back to a hidden prompt so the
# notebook stays usable interactively.
# NOTE(review): the key that used to be hard-coded on this line is exposed in
# the notebook's history and should be revoked.
from getpass import getpass

api_key = os.environ.get("YOUTUBE_API_KEY") or getpass("YouTube Data API v3 key: ")
if not api_key:
    raise RuntimeError("A YouTube Data API v3 key is required to build the client.")

# Data API client; get_video_info() reads this module-level name.
youtube = build('youtube', 'v3', developerKey=api_key)

def get_video_info(url):
    """
    Return transcript, channel title and comments for a YouTube video.

    pytube's ``YouTube`` object supplies the video id, channel id and author;
    ``YouTubeTranscriptApi.get_transcript`` supplies the caption entries; the
    module-level ``youtube`` Data API client supplies the channel title and
    one page of comment threads (``maxResults=100``).

    Args:
        url (str): The URL of the YouTube video.

    Returns:
        dict: Keys ``"YouTuber"``, ``"Channel Name"``, ``"Video URL"``,
              ``"Transcript"`` and ``"Comments"`` (list of strings).
              ``None`` if any step raises; the error is printed first.
    """

    def _top_level_text(thread):
        # Display text of the top-level comment of one comment thread.
        return thread['snippet']['topLevelComment']['snippet']['textDisplay']

    try:
        source_video = YouTube(url)
        source_id = source_video.video_id

        # Join the 'text' of each caption entry with single spaces.
        captions = YouTubeTranscriptApi.get_transcript(source_id)
        joined_captions = ' '.join(map(lambda entry: entry['text'], captions))

        # Channel title from the first channels.list item.
        channels_reply = youtube.channels().list(
            part='snippet',
            id=source_video.channel_id
        ).execute()
        title = channels_reply['items'][0]['snippet']['title']

        # One page of comment threads; only top-level comments are kept.
        threads_reply = youtube.commentThreads().list(
            part='snippet',
            videoId=source_id,
            textFormat='plainText',
            maxResults=100
        ).execute()
        top_level_comments = list(map(_top_level_text, threads_reply['items']))

        return dict([
            ("YouTuber", source_video.author),
            ("Channel Name", title),
            ("Video URL", url),
            ("Transcript", joined_captions),
            ("Comments", top_level_comments),
        ])

    except Exception as problem:
        print(f"An error occurred: {problem}")
        return None


# --- Main Execution ---

# CSV filename
csv_filename = "youtube_video_data.csv"

# Start from the rows saved by a previous run, if any. A zero-byte file is
# treated like a missing one because pd.read_csv raises on empty input.
if os.path.exists(csv_filename) and os.path.getsize(csv_filename) > 0:
    all_video_data = pd.read_csv(csv_filename)
else:
    all_video_data = pd.DataFrame(columns=["YouTuber", "Channel Name", "Video URL", "Transcript", "Comments"])

# Get YouTube video URLs from the user. Input is stripped so that " done "
# still ends the loop, and blank lines are ignored instead of being treated
# as URLs.
video_urls = []
while True:
    url = input("Enter a YouTube video URL (or type 'done' to finish): ").strip()
    if url.lower() == 'done':
        break
    if url:
        video_urls.append(url)

# Fetch every video first and keep only the successful results.
# get_video_info returns None (after printing the error) when scraping fails.
new_records = []
for video_url in video_urls:
    video_data = get_video_info(video_url)
    if video_data:
        new_records.append(video_data)

# Append all new rows with a single concat instead of one concat per video,
# which re-copies the accumulated frame on every iteration.
if new_records:
    all_video_data = pd.concat(
        [all_video_data, pd.DataFrame(new_records)],
        ignore_index=True,
    )

# Save the DataFrame to a CSV file (index=False omits the index column).
all_video_data.to_csv(csv_filename, index=False)

print(f"Data saved to '{csv_filename}' successfully!")