In [1]:
import requests
import datetime
import xml.etree.ElementTree as ET
import os

In [2]:
# Define some useful variables/constants
DATE_STRING = '%a, %d %b %Y %H:%M:%S %z'
TODAY_DAY_OF_YEAR = int(datetime.datetime.now().strftime('%j'))
SEASON_SUBTRACT = 1951
VIDEO_DIRECTORY = '/home/xander/gdrive/Tagesschau'

In [3]:
def current_season(time):
    return time.year - SEASON_SUBTRACT

In [4]:
# Retrieve the current feed
url = 'https://www.tagesschau.de/export/video-podcast/webxl/tagesschau_https/'
feed = requests.get(url)
open('feed.xml', 'wb').write(feed.content)

36589

In [5]:
# Parse the XML into a tree
tree = ET.parse('feed.xml')
root = tree.getroot()

In [6]:
# Based on https://stackoverflow.com/a/68082847/2378368
# Parses a given ElementTree into a dictionary
def etree_to_dict(t):
    if type(t) is ET.ElementTree: return etree_to_dict(t.getroot())
    return {
        **t.attrib,
        'text': t.text,
        **{e.tag: etree_to_dict(e) for e in t}
    }

In [7]:
def get_season_dir(season, root_dir=VIDEO_DIRECTORY):
    season_dir_name = 'Season ' + str(season)
    season_dir_path = os.path.join(root_dir, season_dir_name)
    if season_dir_name not in os.listdir(root_dir):
        # The directory for this season does not exist yet; create it
        os.mkdir(season_dir_path)
    return season_dir_path

In [8]:
def episode_exists(episode_number, season_number, season_dir):
    filename_start = 'Tagesschau - s' + str(season_number) + 'e' + str(episode_number)
    # Go over all files
    for file in os.listdir(season_dir):
        if filename_start in file:
            return True
    # If the current episode is not found, it does not exist yet
    return False

In [9]:
# Processes a single episode
def process_episode(episode):
    time = datetime.datetime.strptime(episode['pubDate']['text'], DATE_STRING)
    # Compute the episode number (a.k.a. the day of the year)
    day_of_year = int(time.strftime('%j'))
    season = current_season(time)
    current_season_dir = get_season_dir(season, VIDEO_DIRECTORY)
    if not episode_exists(day_of_year, season, current_season_dir):
        # Download this episode and store it
        episode_url = episode['enclosure']['url']
        file = open(os.path.join(current_season_dir, 'Tagesschau - s' + str(season) + 'e' + str(day_of_year) + '.mp4'), 'wb')
        file.write(requests.get(episode_url).content)
        file.close()

In [10]:
# Parse the XML tree and retrieve the epsiodes for further processing
for channel in root:
    for child in channel:
        if child.tag == 'item':
            # This is an episode/item
            process_episode(etree_to_dict(child))

In [None]:
# Use rclone to move files to Google Drive. Requires setup of sudo such that no password will be requested.

os.system('sudo /usr/bin/rclone move /mnt/gdrive-local gdrive: --config /home/xander/.config/rclone/rclone.conf --log-file /opt/rclone/logs/upload.log --log-level INFO --delete-empty-src-dirs --fast-list --min-age 10s')