# Stažení všech tabulatur do JSON

## Importy

In [None]:
import waybackpy
import re
from bs4 import BeautifulSoup
import time
from IPython.display import clear_output
import os
import json
from waybackpy.exceptions import WaybackError

## Nastavení

**MAX_YEAR**: nejvyšší rok snapshotu

**MIN_YEAR**: nejnižší rok snapshotu

**API_ERROR_DELAY**: počet vteřin, po které se čeká před druhým pokusem o stažení

In [None]:
# Newest snapshot year that is accepted; the search for a snapshot starts
# at this year and snapshots from later years are rejected.
MAX_YEAR = 2018
# Oldest year the snapshot search is allowed to fall back to.
MIN_YEAR = 2012
# Seconds to wait after a Wayback API error before the single retry.
API_ERROR_DELAY = 5

## Stáhnout všechny uložené URL z kytarovenebe.cz

In [None]:
# Ask the Wayback Machine for every URL it has archived under
# kytarovenebe.cz (subdomains excluded) and materialise the result as a list.
known_urls = list(
    waybackpy.Url(url="kytarovenebe.cz").known_urls(subdomain=False)
)

# Notebook preview of the first few entries.
known_urls[:5]

## Vyfiltrovat URL tabulatur

In [None]:
# Tablature detail pages are recognised by this fragment of their URL.
re_tab = re.compile(r"ukaz-tabulaturu\.php\?id=")

# Keep only tablature URLs; dict.fromkeys() drops exact duplicates while
# preserving the order of first appearance.
tab_urls = list(
    dict.fromkeys(url for url in known_urls if re_tab.search(url))
)

# Notebook preview of the first few entries.
tab_urls[:5]

## Získá nejnovější snapshot v rozmezí MAX_YEAR a MIN_YEAR

In [None]:
def get_latest(tab_url, minus_year=0):
    """Find a snapshot of ``tab_url`` dated between MIN_YEAR and MAX_YEAR.

    The search starts at ``MAX_YEAR - minus_year`` and asks waybackpy's
    ``near()`` for the snapshot closest to that year. When the snapshot
    returned is newer than MAX_YEAR, the target year is moved further into
    the past and the lookup is repeated.

    Args:
        tab_url: Original (non-archived) URL of the tablature page.
        minus_year: Number of years below MAX_YEAR at which to start
            searching.

    Returns:
        The waybackpy ``Url`` object positioned on the accepted snapshot,
        or ``None`` when no snapshot inside the configured year range was
        found.

    Raises:
        WaybackError: Propagated from waybackpy when the API call fails.
    """
    try_year = MAX_YEAR - minus_year

    while try_year >= MIN_YEAR:
        archive_url = waybackpy.Url(url=tab_url).near(year=try_year)
        print(" => {}".format(archive_url))

        snapshot_year = archive_url.timestamp.year

        if snapshot_year < MIN_YEAR:
            # The closest snapshot is already older than allowed; the
            # original code returned it anyway, ignoring MIN_YEAR.
            return None

        if snapshot_year <= MAX_YEAR:
            return archive_url

        # Snapshot is too new: step back from MAX_YEAR by the distance
        # between the snapshot and the year that was just tried.
        try_year = MAX_YEAR - (snapshot_year - try_year)

    return None
  

## Stáhne a parsuje tabulaturu

In [None]:
def _log_failed_url(tab_url):
    """Append ``tab_url`` as one line to errors.log."""
    with open("errors.log", "a", encoding="utf-8") as log:
        log.write(tab_url + "\n")


def download_tab(tab_url, retry=False):
    """Download one archived tablature page and save it as JSON.

    The numeric id is taken from the ``id=`` query parameter and the result
    is written to ``output/json/<id>.json`` with the keys ``id``, ``title``
    (text of the first ``<h1>``) and ``tab`` (text of the element with class
    ``tabshow``). If that file already exists, nothing is done. When no
    usable snapshot is found, an empty file is written so the URL is skipped
    on later runs.

    Args:
        tab_url: Original (non-archived) URL of the tablature page.
        retry: True when this call is already the second attempt after a
            Wayback API error; no further retry is made in that case.

    Returns:
        None. Failures are never raised to the caller; the URL is appended
        to errors.log instead.
    """
    id_match = re.search(r"id=(\d+)", tab_url)
    if id_match is None:
        # A URL can contain "id=" without digits; without this guard the
        # missing match raised AttributeError and aborted the whole run.
        print(" => INVALID URL (no numeric id)")
        _log_failed_url(tab_url)
        return

    tab_id = id_match.group(1)
    filename = "output/json/{}.json".format(tab_id)

    if os.path.exists(filename):
        return

    try:
        # Make sure the target directory exists; otherwise every write
        # fails and each URL ends up in errors.log.
        os.makedirs(os.path.dirname(filename), exist_ok=True)

        archive_url = get_latest(tab_url)

        if archive_url is None:
            print(" => NO DATA")
            # Empty marker file: lets later runs skip this URL.
            with open(filename, "w", encoding="utf-8") as f:
                f.write("")
            return

        html = archive_url.get()

        soup = BeautifulSoup(html, "html.parser")

        data = {
            "id": tab_id,
            "title": soup.find("h1").text,
            "tab": soup.find(class_="tabshow").text,
        }

        print(" => ID: {}".format(data["id"]))
        print(" => Title: {}".format(data["title"]))
        print(" => File: {}".format(filename))

        # Serialise before opening the file so a failure here cannot leave
        # an empty file behind that would block future attempts.
        payload = json.dumps(data)
        with open(filename, "w", encoding="utf-8") as f:
            f.write(payload)

        print(" => Done.")

    except WaybackError:
        if not retry:
            print(" => API ERROR, retry in {} sec".format(API_ERROR_DELAY))
            time.sleep(API_ERROR_DELAY)
            print(" => Retrying...")
            download_tab(tab_url, True)
        else:
            _log_failed_url(tab_url)

    except Exception as err:
        # Best effort: a page that cannot be parsed or written must not
        # stop the batch, so report it, log the URL and carry on.
        print(" => ERROR: {}".format(err))
        _log_failed_url(tab_url)


## Stahování tabulatur

In [None]:
# Start every run with an empty error log; download_tab() appends the URLs
# it could not process.
with open("errors.log", "w", encoding="utf-8"):
    pass

total = len(tab_urls)

# Count from 1 so the displayed position matches the percentage; the
# original printed the zero-based index and so never reached (total/total).
for position, tab_url in enumerate(tab_urls, start=1):

    # Replace the previous progress output instead of growing the cell.
    clear_output(wait=True)

    percent = round((position / total) * 100, 2)
    print("{}% ({}/{}) {}".format(percent, position, total, tab_url))

    download_tab(tab_url)

clear_output()

print("Finished.")
