In [18]:
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd
import re
from urllib.parse import parse_qs, urlparse
import requests
import os
from dotenv import load_dotenv

load_dotenv()

True

In [29]:
def get_playlist_video_ids(playlist_url):
    """Extract all video IDs from a YouTube playlist"""
    # Extract playlist ID
    parsed_url = urlparse(playlist_url)
    query_params = parse_qs(parsed_url.query)
    playlist_id = query_params.get('list', [None])[0]
    
    if not playlist_id:
        raise ValueError("Invalid playlist URL")
    
    # Get API key from environment variable
    API_KEY = os.getenv('YOUTUBE_API_KEY')
    if not API_KEY:
        raise ValueError("YOUTUBE_API_KEY environment variable not found")
    
    video_ids = []
    next_page_token = None
    
    while True:
        url = f"https://www.googleapis.com/youtube/v3/playlistItems"
        params = {
            'part': 'contentDetails',
            'playlistId': playlist_id,
            'maxResults': 50,
            'key': API_KEY
        }
        
        if next_page_token:
            params['pageToken'] = next_page_token
        
        try:
            response = requests.get(url, params=params).json()
            
            if 'error' in response:
                raise ValueError(f"YouTube API error: {response['error']['message']}")
                
            # Extract video IDs
            for item in response.get('items', []):
                video_ids.append(item['contentDetails']['videoId'])
            
            # Check if there are more pages
            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break
                
        except Exception as e:
            print(f"Error fetching playlist data: {e}")
            break
    
    return video_ids

def get_transcript_chunks(video_id, max_chars=200):  # Changed default to 200
    """
    Get transcript and break it into chunks of maximum 200 characters
    """
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as e:
        print(f"Error getting transcript for video {video_id}: {e}")
        return None

    # Combine transcript text
    full_text = ' '.join(item['text'] for item in transcript)
    
    # Split into sentences
    sentences = re.split('[.!?]+', full_text)
    sentences = [s.strip() for s in sentences if s.strip()]
    
    # Create chunks of sentences
    chunks = []
    current_chunk = ""
    
    for sentence in sentences:
        # If current sentence alone is longer than max_chars, split it into words
        if len(sentence) > max_chars:
            words = sentence.split()
            temp_chunk = ""
            for word in words:
                if len(temp_chunk + " " + word) <= max_chars:
                    temp_chunk += " " + word if temp_chunk else word
                else:
                    chunks.append(temp_chunk.strip())
                    temp_chunk = word
            if temp_chunk:
                current_chunk = temp_chunk
            continue

        # Try to add the sentence to current chunk
        test_chunk = current_chunk + ". " + sentence if current_chunk else sentence
        
        if len(test_chunk) <= max_chars:
            current_chunk = test_chunk
        else:
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = sentence
    
    # Add the last chunk if it exists
    if current_chunk:
        chunks.append(current_chunk)
    
    return chunks

def clean_text(text):
    """Clean text by removing newlines and special characters"""
    text = text.replace('\n', ' ')
    text = ' '.join(text.split())
    text = re.sub(r'[^\w\s.,!?-]', '', text)
    text = re.sub(r'\.{2,}', '.', text)
    text = re.sub(r'\s+([.,!?])', r'\1', text)
    return text.strip()

def process_playlist(playlist_url):
    """Process all videos in a playlist and create a DataFrame"""
    all_chunks = []
    video_ids = get_playlist_video_ids(playlist_url)
    
    print(f"Found {len(video_ids)} videos in playlist")
    
    for i, video_id in enumerate(video_ids, 1):
        print(f"Processing video {i}/{len(video_ids)} (ID: {video_id})")
        chunks = get_transcript_chunks(video_id)
        if chunks:
            all_chunks.extend(chunks)
        else:
            print(f"No transcript available for video {video_id}")
    
    print(f"Total chunks extracted: {len(all_chunks)}")
    
    # Create DataFrame
    df = pd.DataFrame({
        'person': ['Bob Ross'] * len(all_chunks),
        'text': all_chunks
    })
    
    # Clean the text
    df['text'] = df['text'].apply(clean_text)
    
    return df

def save_to_csv(df, output_file="transcript_chunks.csv"):
    """Save the DataFrame to a CSV file"""
    df.to_csv(output_file, index=False)
    print(f"Data saved to {output_file}")

In [50]:
season_one_playlist_url = "https://www.youtube.com/watch?v=DFSIQNjKRfk&list=PLAEQD0ULngi69x_7JbQvSMprLRK_KSVLu&index=5&ab_channel=BobRoss"
season_8_playlist_url = "https://www.youtube.com/watch?v=cC5ozePVKGI&list=PLAEQD0ULngi7_Td-kv4YRaDwJUpUuz0WR&ab_channel=BobRoss"
season_14_playlist_url = "https://www.youtube.com/watch?v=GpA9UM7QGag&list=PLAEQD0ULngi4tDLpPnT7XV0hzcIKMZLlP&ab_channel=BobRoss"
season_18_playlist_url = "https://www.youtube.com/watch?v=uY3fIry2tOE&list=PLAEQD0ULngi79FbgDR5HQURtzgXlRUfYa&ab_channel=BobRoss"
season_20_playlist_url = "https://www.youtube.com/watch?v=VlucWfTUo1A&list=PLAEQD0ULngi7-jK4pdhsSiu5CC0ojRqmM&ab_channel=BobRoss"
season_22_playlist_url = "https://www.youtube.com/watch?v=HMx34Am6RFg&list=PLAEQD0ULngi5b8jcMLQ003OV5C2qUNeFE&ab_channel=BobRoss"
season_27_playlist_url = "https://www.youtube.com/watch?v=0mJqzzeWyXs&list=PLAEQD0ULngi6J8P64GAMRZSzucIru0rMG&ab_channel=BobRoss"
season_31_playlist_url = "https://www.youtube.com/watch?v=kJFB6rH3z2A&list=PLAEQD0ULngi5PAjhOL-GfvbcQDn2Hujoj&ab_channel=BobRoss"

In [51]:
df = process_playlist(season_18_playlist_url)
df

Found 13 videos in playlist
Processing video 1/13 (ID: uY3fIry2tOE)
Processing video 2/13 (ID: PGPVpil2UmE)
Processing video 3/13 (ID: EBZKuVbRY54)
Processing video 4/13 (ID: R7Y3izMFPbM)
Processing video 5/13 (ID: 6afHY2d9Lv8)
Processing video 6/13 (ID: lilbzLCNnDo)
Processing video 7/13 (ID: rCHXqj4DHlM)
Processing video 8/13 (ID: WJF_qoQRPck)
Processing video 9/13 (ID: sBBBilrDuSw)
Processing video 10/13 (ID: rRjnHdr9DmU)
Processing video 11/13 (ID: ikR7UT9mVBw)
Processing video 12/13 (ID: EVfPPJ5FUmA)
Processing video 13/13 (ID: XvnJBynSiT0)
Total chunks extracted: 1695


Unnamed: 0,person,text
0,Bob Ross,"And if this is your first time with us, allow ..."
1,Bob Ross,"masterpieces on canvas. Tell you what, Im gonn..."
2,Bob Ross,"And while theyre doing that, let me tell you w..."
3,Bob Ross,Today I have my standard old 18 by 24 inch dou...
4,Bob Ross,And weve covered the entire canvas with a thin...
...,...,...
1690,Bob Ross,And I thank you from the bottom of my heart fo...
1691,Bob Ross,"So if youll keep watching us, well keep right ..."
1692,Bob Ross,"And until the next series, Id like to wish you..."
1693,Bob Ross,smooth jazz music - Voiceover This program is ...


In [52]:
save_to_csv(df, output_file="data/bob_ross/bob_ross_season_18.csv")

Data saved to data/bob_ross_season_18.csv
