Skip to content

Commit

Permalink
Make ditto clone parallel
Browse files Browse the repository at this point in the history
  • Loading branch information
sargunv committed Sep 18, 2018
1 parent a73d4df commit 2adb913
Show file tree
Hide file tree
Showing 6 changed files with 110 additions and 47 deletions.
2 changes: 1 addition & 1 deletion pokeapi_ditto/commands/analyze.py
Expand Up @@ -3,7 +3,7 @@
import os
import re
from pathlib import Path
from typing import List, Dict, TypeVar
from typing import Dict, List, TypeVar

from genson import SchemaBuilder
from tqdm import tqdm
Expand Down
124 changes: 81 additions & 43 deletions pokeapi_ditto/commands/clone.py
@@ -1,62 +1,100 @@
import json
import os
import os.path
from multiprocessing import Pool
from pathlib import Path
from signal import SIG_IGN, SIGINT, signal
from typing import Any, Callable, List, Tuple

import requests
from tqdm import tqdm
from yarl import URL


def do_clone(src_url: str, dest_dir: str):
if not src_url.endswith("/"):
src_url += "/"
def _ignore_sigint() -> None:
    """Pool initializer: make the worker process ignore SIGINT (Ctrl-C).

    Defined at module level (rather than as a lambda) so it can be pickled
    when the pool starts its workers with the "spawn" start method.
    """
    signal(SIGINT, SIG_IGN)


def _do_in_parallel(worker: Callable, data: List, desc: str) -> None:
    """Apply ``worker`` to every item of ``data`` using a process pool.

    Results are discarded; the function is used for its side effects only.
    Progress is shown with a tqdm bar labelled ``desc`` and the pool size.

    Args:
        worker: Callable invoked once per item, inside a worker process.
        data: Items to process; completion order is not guaranteed.
        desc: Label for the progress bar.

    Raises:
        KeyboardInterrupt: Re-raised after the pool has been shut down.
        Exception: Anything raised by ``worker`` is re-raised after the pool
            has been shut down.
    """
    # os.cpu_count() may return None; fall back to a single worker.
    cpus = os.cpu_count() or 1
    # Workers ignore SIGINT so that only the parent reacts to Ctrl-C.
    pool = Pool(cpus, initializer=_ignore_sigint)
    try:
        results = pool.imap_unordered(worker, data)
        for _ in tqdm(results, total=len(data), desc=f"{desc} ({cpus}x)"):
            pass
        # Normal completion: no more work will be submitted.
        pool.close()
    except BaseException:
        # Interrupts and worker errors alike: stop the workers right away,
        # then let the original exception propagate unchanged.
        pool.terminate()
        raise
    finally:
        # Always reap the worker processes so repeated calls do not leak them.
        pool.join()

if not dest_dir.endswith("/"):
dest_dir += "/"

def safe_open_w(file_name):
os.makedirs(os.path.dirname(file_name), exist_ok=True)
return open(file_name, "w")
class Cloner:

def print_json(data, file_name):
transformed_data = json.dumps(data, indent=4, sort_keys=True)
transformed_data = transformed_data.replace(src_url, "/")
print(transformed_data, file=safe_open_w(file_name))
_src_url: URL
_dest_dir: Path

# Root
def __init__(self, src_url: str, dest_dir: str):
if src_url.endswith("/"):
src_url = src_url[:-1]
if not dest_dir.endswith("/"):
dest_dir += "/"

url = src_url + "api/v2/"
endpoints = requests.get(url)
self._src_url = URL(src_url)
self._dest_dir = Path(dest_dir)

path = dest_dir + url.replace(src_url, "") + "index.json"
print_json(endpoints.json(), path)
def _crawl(self, url: URL, save: bool = True) -> Any:
try:
data = requests.get(url).json()
except json.JSONDecodeError as err:
tqdm.write(f"JSON decode failure: {url}")
return None

# Endpoints
if save:
out_data = json.dumps(data, indent=4, sort_keys=True)
out_data = out_data.replace(str(self._src_url), "")
file = self._dest_dir.joinpath((url / "index.json").path[1:])
file.parent.mkdir(parents=True, exist_ok=True)
file.write_text(out_data)

for endpoint in tqdm(endpoints.json().values()):
# Zero index
url = endpoint + "?limit=0"
resource_list = requests.get(url)
count = str(resource_list.json()["count"])
return data

# Full index
url = endpoint + "?limit=" + count
resource_list = requests.get(url)
endpoint_path = endpoint.replace(src_url, "")
path = dest_dir + endpoint_path + "index.json"
print_json(resource_list.json(), path)
def _crawl_index(self) -> List[URL]:
    """Fetch the API root index and return the endpoint URLs it lists.

    The index is requested from ``<src_url>/api/v2`` through ``_crawl`` with
    its default ``save=True``.

    Returns:
        One ``URL`` per value of the index mapping, or an empty list when
        ``_crawl`` returned ``None`` (it has already reported the JSON decode
        failure in that case).
    """
    index = self._crawl(self._src_url / "api/v2")
    if index is None:
        # Nothing usable came back; the failure was already reported.
        return []
    return [URL(url_str) for url_str in index.values()]

# All resources
desc = list(filter(None, endpoint_path.split("/")))[-1]
for resourceSummary in tqdm(resource_list.json()["results"], desc=desc):
resource_url = resourceSummary["url"]
path = dest_dir + resource_url.replace(src_url, "") + "index.json"
def _crawl_resource_list(self, url: URL) -> List[URL]:
    """Fetch the complete resource list of one endpoint.

    Two requests are made: one with ``limit=0`` that is only used to read the
    ``"count"`` field (and is not saved), and one with ``limit=<count>`` that
    retrieves every entry in a single page.

    Args:
        url: URL of the endpoint's resource list.

    Returns:
        The ``"url"`` of every entry in ``"results"``, as ``URL`` objects, or
        an empty list when either request yielded no data (``_crawl`` returns
        ``None`` after reporting a JSON decode failure).
    """
    summary = self._crawl(url.with_query({"limit": 0, "offset": 0}), save=False)
    if summary is None:
        return []

    full_url = url.with_query({"limit": summary["count"], "offset": 0})
    resource_list = self._crawl(full_url)
    if resource_list is None:
        return []

    return [URL(resource_ref["url"]) for resource_ref in resource_list["results"]]

resource = requests.get(resource_url)
print_json(resource.json(), path)
def clone_single(self, endpoint_and_id: Tuple[str, str]) -> None:
    """Download one resource identified by an ``(endpoint, id)`` pair.

    The pair arrives as a single tuple argument; ``clone_endpoint`` hands this
    method to ``_do_in_parallel`` as the per-item worker.

    Args:
        endpoint_and_id: Endpoint name and resource id, both as strings.
    """
    endpoint, resource_id = endpoint_and_id
    api_root = self._src_url / "api/v2"
    resource_url = api_root / endpoint / resource_id
    self._crawl(resource_url)

    if endpoint != "pokemon":
        return
    # Resources of the "pokemon" endpoint also get their "encounters" page.
    self._crawl(resource_url / "encounters")

if endpoint.endswith("/pokemon/"):
resource_url += "encounters/"
path = dest_dir + resource_url.replace(src_url, "") + "index.json"
if not os.path.isfile(path):
resource = requests.get(resource_url)
print_json(resource.json(), path)
def clone_endpoint(self, endpoint: str):
    """Download the resource list of ``endpoint`` and every resource in it.

    The list is crawled first; the individual resources are then fetched in
    parallel, with ``clone_single`` as the worker.

    Args:
        endpoint: Endpoint name, appended to ``<src_url>/api/v2``.
    """
    list_url = self._src_url / "api/v2" / endpoint
    # The id is the last path segment of each resource URL's parent
    # (presumably because resource URLs end with a trailing slash; verify).
    jobs = [
        (endpoint, resource_url.parent.name)
        for resource_url in self._crawl_resource_list(list_url)
    ]
    _do_in_parallel(worker=self.clone_single, data=jobs, desc=list_url.name)

def clone_all(self) -> None:
    """Clone every endpoint listed in the API index, one after another.

    An outer progress bar labelled "clone" tracks the endpoints.
    """
    endpoint_urls = self._crawl_index()
    for endpoint_url in tqdm(endpoint_urls, desc="clone"):
        # The endpoint name is the last path segment of the URL's parent.
        self.clone_endpoint(endpoint_url.parent.name)


def do_clone(src_url: str, dest_dir: str, select: List[str]) -> None:
    """Clone an API into ``dest_dir``, entirely or for selected parts only.

    Args:
        src_url: Base URL of the API server to copy from.
        dest_dir: Directory the JSON files are written under.
        select: Selections to clone. ``"endpoint"`` clones a whole endpoint,
            ``"endpoint/id"`` clones a single resource (path segments beyond
            the first two are ignored). An empty list clones everything.

    Raises:
        ValueError: If a selection contains no non-empty path segment.
    """
    cloner = Cloner(src_url, dest_dir)

    if not select:
        cloner.clone_all()
        return

    for sel in select:
        # Drop the empty segments produced by leading, trailing or doubled
        # slashes so that "pokemon/" and "/pokemon" still name an endpoint.
        parts = [part for part in sel.split("/") if part]
        if not parts:
            raise ValueError(f"invalid selection: {sel!r}")
        if len(parts) == 1:
            cloner.clone_endpoint(parts[0])
        else:
            cloner.clone_single((parts[0], parts[1]))
1 change: 0 additions & 1 deletion pokeapi_ditto/commands/models.py
Expand Up @@ -2,7 +2,6 @@

from odictliteral import odict


COMMON_MODELS: OrderedDict = odict[
f"/schema/v2/api_resource.json":{
"properties": {"url": {"type": "string"}},
Expand Down
1 change: 1 addition & 0 deletions pokeapi_ditto/main.py
Expand Up @@ -18,6 +18,7 @@ def __init__(self):
clone_args = subparsers.add_parser("clone")
clone_args.add_argument("--src-url", type=str, default="http://localhost/")
clone_args.add_argument("--dest-dir", type=str, default="./data")
clone_args.add_argument("--select", nargs='+', default=[])

transform_args = subparsers.add_parser("transform")
transform_args.add_argument("--src-dir", type=str, default="./data")
Expand Down
26 changes: 25 additions & 1 deletion pyproject.lock
Expand Up @@ -141,6 +141,15 @@ version = "4.3.0"
[package.dependencies]
six = ">=1.0.0,<2.0.0"

[[package]]
category = "main"
description = "multidict implementation"
name = "multidict"
optional = false
platform = "*"
python-versions = ">=3.4.1"
version = "4.4.1"

[[package]]
category = "main"
description = "A tidier way of coding literal OrderedDicts"
Expand Down Expand Up @@ -259,8 +268,21 @@ platform = "*"
python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4"
version = "1.23"

[[package]]
category = "main"
description = "Yet another URL library"
name = "yarl"
optional = false
platform = "*"
python-versions = ">=3.5.3"
version = "1.2.6"

[package.dependencies]
idna = ">=2.0"
multidict = ">=4.0"

[metadata]
content-hash = "ff522e2cdda2435753106f9e21b08854c6846d5a6a221f0f0a1c3647e4818f4c"
content-hash = "c12dfe2c2c968ef9d9e8bd8de2297cd274b51957fd77992d54088db9a29093a2"
platform = "*"
python-versions = "^3.6"

Expand All @@ -279,6 +301,7 @@ idna = ["156a6814fb5ac1fc6850fb002e0852d56c0c8d2531923a51032d1b70760e186e", "684
isort = ["1153601da39a25b14ddc54955dbbacbb6b2d19135386699e2ad58517953b34af", "b9c40e9750f3d77e6e4d441d8b0266cf555e7cdabdcff33c4fd06366ca761ef8", "ec9ef8f4a9bc6f71eec99e1806bfa2de401650d996c59330782b89a5555c1497"]
mccabe = ["ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42", "dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"]
more-itertools = ["c187a73da93e7a8acc0001572aebc7e3c69daf7bf6881a2cea10650bd4420092", "c476b5d3a34e12d40130bc2f935028b5f636df8f372dc2c1c01dc19681b2039e", "fcbfeaea0be121980e15bc97b3817b5202ca73d0eae185b4550cbfce2a3ebb3d"]
multidict = ["3be539df400562f0e6c2089dd0b512150abd41189723405097ef1d11fd165658", "4ec2ba081c710e8ddf12adf9867ea8969e12aa8f0d0cd8270c706538a622aecb", "5d93048a352bf1318615a9d483a17c87f2f0f8ca2be0157ff353753de73e2636", "728222ed41e57ef01b41282df7b15b80c3da7fb0e664c8898d2efb4d969ee7e8", "7bbef78bf0b9b038f97e172a1a861e6c1a109be69592ed4ac0f2e9a23ec1fbd2", "80fd036a56bcb9dd9ba63a55b884b67fa18b46b02400528cb4bd3e2cc98791fe", "833fb16b7da8437ae067675e6e0a22cf0d5935d578fadd2a8474be702ade2cae", "9cee5290ec55d17917fcb71bab853cc376c3e30d45a21d96dca106f72a46b357", "b75218d23692e4e8ade6b7a1dd2de21ef0342e83057487b69e32732e93d5f769", "c0be167d860667125d397152192c4bb084e91a07130a90aef7f927b23f73120f", "c17d317dab455354ed6f95c42df84cc34b4bd9cace121fd4ee88307a16cc2482", "d5432a55ded1d1df572960f0a4d87fe771d4238729392b8ca995e0e50021ae8f", "d75a539aae854e19c79d39aa88d235c3eeead2cc37cdcb6c3fb193e4b0ba0b78", "d8fd869eec069dd45831486cc074ad9f82051f317371f9e1e11ef83668e23bde", "dc043e177e4bbc2539203af61e9b31c5f5de468df661ed52d9fa13bc868a1ce7", "e1ab80f1aa32f97c1cbbfb47725292617a443538103a0c5d5a419eb9629c7415", "e2e1e4b34ecf7f752594836ca338457945c2c1d4f928dd10e7c2319669af6c4e", "e9bceec13f5ea83fc434daa703c22f7263a1a7d3d9d4d53330b103cc3cfa875d", "f2d6d48932154807f6dddd093f6de0ef75356d330dda9df68c5106ccea8dda48", "fe79338660331d3a4a6f5326300d0c0e0b5f47edfe32e1a0626aa7b2e2bf08c4", "febbbf93912fdbe9455ac1673284df2f4561c5f075ef72aff90b445788feda96"]
odictliteral = ["88405c7fab7ff7a54c7b9fac9fd69264e526b0024b8265bc042ba3a797f0c161"]
pluggy = ["6e3836e39f4d36ae72840833db137f7b7d35105079aee6ec4a62d9f80d594dd1", "95eb8364a4708392bae89035f45341871286a333f749c3141c20573d2b3876e1"]
py = ["06a30435d058473046be836d3fc4f27167fd84c45b99704f2fb5509ef61f9af1", "50402e9d1c9005d759426988a492e0edaadb7f4e68bcddfea586bc7432d009c6"]
Expand All @@ -290,3 +313,4 @@ six = ["70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9", "832d
toml = ["380178cde50a6a79f9d2cf6f42a62a5174febe5eea4126fe4038785f1d888d42", "a7901919d3e4f92ffba7ff40a9d697e35bbbc8a8049fe8da742f34c83606d957"]
tqdm = ["18f1818ce951aeb9ea162ae1098b43f583f7d057b34d706f66939353d1208889", "df02c0650160986bac0218bb07952245fc6960d23654648b5d5526ad5a4128c9"]
urllib3 = ["a68ac5e15e76e7e5dd2b8f94007233e01effe3e50e8daddf69acfd81cb686baf", "b5725a0bd4ba422ab0e66e89e030c806576753ea3ee08554382c14e685d117b5"]
yarl = ["2556b779125621b311844a072e0ed367e8409a18fa12cbd68eb1258d187820f9", "4aec0769f1799a9d4496827292c02a7b1f75c0bab56ab2b60dd94ebb57cbd5ee", "55369d95afaacf2fa6b49c84d18b51f1704a6560c432a0f9a1aeb23f7b971308", "6c098b85442c8fe3303e708bbb775afd0f6b29f77612e8892627bcab4b939357", "9182cd6f93412d32e009020a44d6d170d2093646464a88aeec2aef50592f8c78", "c8cbc21bbfa1dd7d5386d48cc814fe3d35b80f60299cdde9279046f399c3b0d8", "db6f70a4b09cde813a4807843abaaa60f3b15fb4a2a06f9ae9c311472662daa1", "f17495e6fe3d377e3faac68121caef6f974fcb9e046bc075bcff40d8e5cc69a4", "f85900b9cca0c67767bb61b2b9bd53208aaa7373dae633dbe25d179b4bf38aa7"]
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pokeapi-ditto"
version = "0.4.0"
version = "0.5.0"
description = "Ditto is a server that serves a static copy of PokeAPI's data."
license = "Apache-2.0"
authors = ["Sargun Vohra <sargun.vohra@gmail.com>"]
Expand All @@ -27,6 +27,7 @@ requests = "^2.19"
genson = "^1.0"
tqdm = "^4.26"
odictliteral = "^1.0"
yarl = "^1.2"

[tool.poetry.dev-dependencies]
pytest = "^3.0"
Expand Down

0 comments on commit 2adb913

Please sign in to comment.