In [11]:
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd
import re
from urllib.parse import parse_qs, urlparse
import requests
import os
from dotenv import load_dotenv

load_dotenv()

True

In [12]:
def get_playlist_video_ids(playlist_url):
    """Extract all video IDs from a YouTube playlist"""
    # Extract playlist ID
    parsed_url = urlparse(playlist_url)
    query_params = parse_qs(parsed_url.query)
    playlist_id = query_params.get('list', [None])[0]
    
    if not playlist_id:
        raise ValueError("Invalid playlist URL")
    
    # Get API key from environment variable
    API_KEY = os.getenv('YOUTUBE_API_KEY')
    if not API_KEY:
        raise ValueError("YOUTUBE_API_KEY environment variable not found")
    
    video_ids = []
    next_page_token = None
    
    while True:
        url = f"https://www.googleapis.com/youtube/v3/playlistItems"
        params = {
            'part': 'contentDetails',
            'playlistId': playlist_id,
            'maxResults': 50,
            'key': API_KEY
        }
        
        if next_page_token:
            params['pageToken'] = next_page_token
        
        try:
            response = requests.get(url, params=params).json()
            
            if 'error' in response:
                raise ValueError(f"YouTube API error: {response['error']['message']}")
                
            # Extract video IDs
            for item in response.get('items', []):
                video_ids.append(item['contentDetails']['videoId'])
            
            # Check if there are more pages
            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break
                
        except Exception as e:
            print(f"Error fetching playlist data: {e}")
            break
    
    return video_ids

In [13]:
def get_video_id(url):
    """Extract video ID from YouTube URL"""
    video_id = url.split('v=')[-1]
    ampersand_pos = video_id.find('&')
    if ampersand_pos != -1:
        video_id = video_id[:ampersand_pos]
    return video_id


def get_transcript_chunks(url, max_chars=100):
    """Get transcript and break it into chunks"""
    video_id = get_video_id(url)

    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as e:
        print(f"Error getting transcript: {e}")
        return None

    # Combine transcript text
    full_text = ' '.join(item['text'] for item in transcript)

    # Split into sentences
    sentences = re.split('[.!?]+', full_text)
    sentences = [s.strip() for s in sentences if s.strip()]

    # Create chunks of sentences
    chunks = []
    current_chunk = sentences[0]

    for sentence in sentences[1:]:
        if len(current_chunk + '. ' + sentence) <= max_chars:
            current_chunk += '. ' + sentence
        else:
            chunks.append(current_chunk)
            current_chunk = sentence

    chunks.append(current_chunk)  # Add the last chunk

    # Create DataFrame
    df = pd.DataFrame({
        'person': ['Bob Ross'] * len(chunks),
        'text': chunks
    })

    return df

In [3]:
url = "https://www.youtube.com/watch?v=DFSIQNjKRfk&list=PLAEQD0ULngi69x_7JbQvSMprLRK_KSVLu&index=5&ab_channel=BobRoss"
df = get_transcript_chunks(url)
if df is not None:
    print(df)

       person                                               text
0    Bob Ross  - Well welcome back, glad\nto see you again today
1    Bob Ross  I thought today we'd do\nsomething a little di...
2    Bob Ross  And to that I'm gonna add\na little Prussian B...
3    Bob Ross  And we just start at\nthe bottom and work up s...
4    Bob Ross                                        There we go
..        ...                                                ...
198  Bob Ross  Fantastic painting. Okay. We'll just sign this...
199  Bob Ross              And we'll call that painting finished
200  Bob Ross  Once again, I hope you've\npainted along with ...
201  Bob Ross  And we hope to see you again next time, and we...
202  Bob Ross  Until then, from all of\nus here, happy painti...

[203 rows x 2 columns]


In [4]:
def clean_text(text):
    """
    Clean text by removing newlines, extra spaces, and special characters
    Args:
        text (str): Input text to clean
    Returns:
        str: Cleaned text
    """
    # Remove newlines
    text = text.replace('\n', ' ')
    
    # Remove extra spaces
    text = ' '.join(text.split())
    
    # Remove special characters but keep basic punctuation
    text = re.sub(r'[^\w\s.,!?-]', '', text)
    
    # Remove multiple periods/ellipsis
    text = re.sub(r'\.{2,}', '.', text)
    
    # Remove extra spaces around punctuation
    text = re.sub(r'\s+([.,!?])', r'\1', text)
    
    return text.strip()

In [5]:
# Apply the cleaning function to the 'text' column
df['text'] = df['text'].apply(clean_text)

# Display the first few rows to verify the cleaning
print(df.head())

     person                                               text
0  Bob Ross   - Well welcome back, glad to see you again today
1  Bob Ross  I thought today wed do something a little diff...
2  Bob Ross  And to that Im gonna add a little Prussian Blu...
3  Bob Ross  And we just start at the bottom and work up so...
4  Bob Ross                                        There we go


In [10]:
df.iloc[5]['text']

'I thought today wed make a happy little stream, its just sort of running through the woods here'