forked from i-am-alice/2nd-devs
-
Notifications
You must be signed in to change notification settings - Fork 3
/
30.py
76 lines (63 loc) · 2.72 KB
/
30.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import asyncio
import aiohttp
import json
import xmltodict
from youtube_transcript_api import YouTubeTranscriptApi
# --------------------------------------------------------------
# Fetch the XML feed of each channel and convert it into a JSON-like dict
# --------------------------------------------------------------
# IDs of the YouTube channels whose video feeds are fetched by this script.
channels = ["UC_MIaHmSkt9JHNZfQ_gUmrg", "UCTTZqMWBvLsUYqYwKTdjvkw", "UCRHXKLPXE-hYh0biKr2DGIg"]
async def get_videos_data_from_channel(session, channel_id):
    """Fetch a channel's YouTube XML feed and return its videos as dicts.

    Args:
        session: An open ``aiohttp.ClientSession`` used to issue the request.
        channel_id: The YouTube channel ID whose feed is requested.

    Returns:
        The list produced by ``create_list_of_videos_from_json_data`` for
        the parsed feed.

    Raises:
        aiohttp.ClientResponseError: If the feed request returns an HTTP
            error status (4xx/5xx).
    """
    feed_url = f'https://www.youtube.com/feeds/videos.xml?channel_id={channel_id}'
    async with session.get(feed_url) as response:
        # Surface HTTP failures directly instead of handing an error page
        # to the XML parser, which would fail with a misleading message.
        response.raise_for_status()
        xml = await response.text()

    # The body is fully read at this point, so parsing can happen after the
    # response has been released.
    feed_data = xmltodict.parse(xml)
    return create_list_of_videos_from_json_data(feed_data, channel_id)
# --------------------------------------------------------------
# Extract data from json_data to list of videos
# --------------------------------------------------------------
def create_list_of_videos_from_json_data(data, channel_id):
    """Build a list of video dicts from a parsed channel feed.

    Args:
        data: The parsed feed as a mapping with a top-level ``'feed'`` key
            (the result of ``xmltodict.parse`` on the channel's XML feed).
        channel_id: The ID of the channel the feed belongs to; stored on
            every video and used to build the channel URL.

    Returns:
        A list with one dict per feed entry, holding the keys ``id``,
        ``title``, ``thumbnail``, ``description``, ``url``, ``channelId``
        and ``channel``. The list is empty when the feed has no entries.

    Raises:
        KeyError: If ``data`` has no ``'feed'`` key or an entry lacks one
            of the expected fields.
    """
    # A feed without any <entry> element has no 'entry' key at all.
    entries = data['feed'].get('entry') or []
    # xmltodict yields a single mapping instead of a list when an element
    # occurs exactly once, so normalise that case to a one-item list.
    if isinstance(entries, dict):
        entries = [entries]

    channel_url = f'https://www.youtube.com/channel/{channel_id}'
    videos = []
    for entry in entries:
        media_group = entry['media:group']
        videos.append({
            'id': entry['yt:videoId'],
            'title': entry['title'],
            'thumbnail': media_group['media:thumbnail']['@url'],
            'description': media_group['media:description'],
            'url': entry['link']['@href'],
            'channelId': channel_id,
            'channel': channel_url,
        })
    return videos
# --------------------------------------------------------------
# Get the video's Polish transcript if one exists
# --------------------------------------------------------------
async def get_video_transcription(video):
    """Attach the Polish transcript of a video to its dict.

    The transcript lookup is a blocking call, so it is run in the event
    loop's default executor; this lets several lookups started with
    ``asyncio.gather`` overlap instead of running one after another.

    Args:
        video: A dict with at least an ``'id'`` key holding the YouTube
            video ID. It is modified in place.

    Returns:
        The same ``video`` dict with a ``'transcription'`` key added: the
        fetched transcript on success, or ``''`` if it could not be
        obtained for any reason.
    """
    def fetch_polish_transcript():
        transcript_list = YouTubeTranscriptApi.list_transcripts(video['id'])
        return transcript_list.find_transcript(['pl']).fetch()

    loop = asyncio.get_running_loop()
    try:
        video['transcription'] = await loop.run_in_executor(None, fetch_polish_transcript)
    except Exception:
        # Best effort: a missing or unavailable transcript must not abort
        # the whole run. Catching Exception (not a bare except) still lets
        # cancellation and KeyboardInterrupt propagate.
        video['transcription'] = ''
    return video
# --------------------------------------------------------------
# Execute the code
# --------------------------------------------------------------
async def main():
    """Fetch recent videos for every channel, add transcripts, and save them.

    Downloads the feed of each channel in ``channels`` concurrently, keeps
    the first three videos of each, attaches a transcript to every kept
    video, prints the result and writes it to ``videos.json``.
    """
    async with aiohttp.ClientSession() as session:
        video_lists = await asyncio.gather(
            *(get_videos_data_from_channel(session, channel_id) for channel_id in channels)
        )

    # Select only the first 3 videos from each channel
    videos = []
    for channel_videos in video_lists:
        videos.extend(channel_videos[:3])

    # get_video_transcription updates each video dict in place, so the
    # gathered return values do not need to be kept.
    await asyncio.gather(*(get_video_transcription(video) for video in videos))

    print(videos)
    with open('videos.json', 'w') as f:
        json.dump(videos, f, indent=4)


# Top-level ``await`` / ``async with`` is a syntax error in a regular module,
# so the coroutine is driven explicitly when the file is run as a script.
if __name__ == '__main__':
    asyncio.run(main())