In [10]:
pip install google-generativeai

Note: you may need to restart the kernel to use updated packages.


In [14]:
import os
import json
from googleapiclient.discovery import build
import isodate
import google.generativeai as genai

# Credentials are read from the environment so they never get committed with
# the notebook. Export YOUTUBE_API_KEY and GEMINI_API_KEY before starting the
# kernel; an unset variable falls back to the empty string.
API_KEY = os.environ.get('YOUTUBE_API_KEY', '')
GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY', '')

# YouTube Data API v3 client used for the search and video-detail requests.
youtube = build('youtube', 'v3', developerKey=API_KEY)

# Gemini model used to classify video descriptions by industry.
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel('gemini-pro')

In [18]:
from isodate import parse_duration


In [24]:
_DEFAULT_INDUSTRIES = (
    "Telecom", "Automotive", "Government", "Financial Services",
    "Tourism & Aviation", "Healthcare", "Technology", "Real Estate",
    "Retail", "Transportation",
)


def _match_industry(answer, industries):
    """Map the model's free-text answer onto one of ``industries``.

    Returns the canonical industry name, or ``None`` when the answer is empty,
    is some spelling of "None", or does not mention any known industry.
    """
    # Drop surrounding whitespace and the punctuation/markdown a chat model
    # commonly wraps a one-word answer in, then compare case-insensitively.
    cleaned = answer.strip().strip('.\'"*`').strip().casefold()
    if not cleaned or cleaned.startswith('none'):
        return None

    canonical = {name.casefold(): name for name in industries}
    if cleaned in canonical:
        return canonical[cleaned]

    # Tolerate verbose answers such as "Industry: Telecom".
    for key, name in canonical.items():
        if key in cleaned:
            return name
    return None


def search_videos(language_code, query, max_duration=72000, industries=None):
    """Collect captioned YouTube videos whose description matches an industry.

    Pages through YouTube search results for ``query`` and asks the Gemini
    model to classify each video's description. A video is kept when the model
    names one of ``industries``, its channel has not already contributed a
    video, and adding it keeps the running total within ``max_duration``.

    Relies on the module-level ``youtube`` client, ``model`` and
    ``parse_duration``.

    Args:
        language_code: Value passed as ``relevanceLanguage`` to the search.
        query: Search query string.
        max_duration: Upper bound, in seconds, on the summed duration of the
            collected videos.
        industries: Optional iterable of industry names to classify against.
            Defaults to the built-in list of ten industries.

    Returns:
        A list of ``{'url': ..., 'industry': ...}`` dicts, where ``industry``
        is the canonical name taken from ``industries``.
    """
    industries = list(_DEFAULT_INDUSTRIES if industries is None else industries)
    industry_list = ', '.join(industries)  # loop-invariant part of the prompt

    video_links = []
    total_duration = 0
    next_page_token = None
    collected_channels = set()  # at most one video is kept per channel

    while total_duration < max_duration:
        search_response = youtube.search().list(
            q=query,
            part='id,snippet',
            type='video',
            videoDuration='medium',
            videoCaption='closedCaption',
            relevanceLanguage=language_code,
            maxResults=50,
            pageToken=next_page_token
        ).execute()

        for search_result in search_response.get('items', []):
            video_id = search_result['id']['videoId']
            channel_id = search_result['snippet']['channelId']
            description = search_result['snippet']['description']

            if channel_id in collected_channels:
                continue

            prompt = f"""
            Analyze the following YouTube video description and determine if it's related to any of these industries: {industry_list}. 
            If it's an interview or podcast in these industries, that's also acceptable.
            Respond with only the industry name if it matches, or 'None' if it doesn't match any.

            Description: {description}
            """

            try:
                response = model.generate_content(prompt)
                if not response.parts:
                    print(f"No content generated for video {video_id}. Skipping.")
                    continue
                answer = response.text
            except Exception as e:
                # Best effort: one failed classification must not abort the run.
                print(f"Error processing video {video_id}: {str(e)}")
                continue

            # The model returns free text, so normalise it instead of comparing
            # against the literal string 'None'.
            industry = _match_industry(answer, industries)
            if industry is None:
                continue

            video_details = youtube.videos().list(
                part='contentDetails',
                id=video_id
            ).execute()

            # A video can become unavailable between the search and the detail
            # lookup, in which case no items come back.
            detail_items = video_details.get('items', [])
            if not detail_items:
                print(f"No details returned for video {video_id}. Skipping.")
                continue

            duration = detail_items[0]['contentDetails']['duration']
            duration_seconds = parse_duration(duration).total_seconds()

            # Only accept the video if it fits in the remaining budget; a
            # shorter video later in the results may still fit.
            if total_duration + duration_seconds <= max_duration:
                video_links.append({
                    'url': f"https://www.youtube.com/watch?v={video_id}",
                    'industry': industry
                })
                total_duration += duration_seconds
                collected_channels.add(channel_id)

            if total_duration >= max_duration:
                break

        next_page_token = search_response.get('nextPageToken')
        if not next_page_token:
            break

    return video_links

In [25]:
# Target languages mapped to the language code handed to search_videos.
languages = dict(italian='it', french='fr', german='de')

# Search phrase for each target language, written in that language.
queries = dict(
    italian='intervista o podcast o vlog in italiano',
    french='interview ou podcast ou vlog en français',
    german='Interview oder Podcast oder Vlog auf Deutsch',
)

# Run one search per language and save the resulting links as JSON.
for language in languages:
    lang_code = languages[language]
    links = search_videos(lang_code, queries[language])
    output_path = f'{language}_industry_videos.json'
    with open(output_path, 'w') as file:
        file.write(json.dumps(links))
    print(f"Collected {len(links)} links for {language}.")

No content generated for video m354NWfktzk. Skipping.
No content generated for video x3ZnWL-57xA. Skipping.
No content generated for video CQ3UevU6UrI. Skipping.
No content generated for video BWhjp7E94Qw. Skipping.
No content generated for video ooozTD216g8. Skipping.
No content generated for video 7NDDCpowS7U. Skipping.
Collected 45 links for italian.
No content generated for video TMwSMHAXXv4. Skipping.
Collected 123 links for french.
No content generated for video Blg2lGK-uUY. Skipping.
No content generated for video 5cm9ELLR8ds. Skipping.
No content generated for video UcrR4EtEiJQ. Skipping.
No content generated for video uDjsbwbYapE. Skipping.
No content generated for video ccUMwBkenuk. Skipping.
Collected 61 links for german.
