Skip to content

Commit

Permalink
uplift documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
Marco Fossati committed Jul 11, 2019
1 parent 3cdcef8 commit 4bb7d52
Show file tree
Hide file tree
Showing 12 changed files with 162 additions and 79 deletions.
2 changes: 1 addition & 1 deletion soweego/importer/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
"""Importer module - Generic service for dump updating/importing"""
"""Import target catalog dumps into a SQL database."""
36 changes: 20 additions & 16 deletions soweego/importer/base_dump_extractor.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Dump extractor abstract class"""
"""Abstract class for catalog dumps extraction."""

import logging
import warnings
from typing import Iterable
from typing import List, Optional

__author__ = 'Marco Fossati'
__email__ = 'fossati@spaziodati.eu'
Expand All @@ -13,31 +14,34 @@
__copyright__ = 'Copyleft 2018, Hjfocs'

LOGGER = logging.getLogger(__name__)

# Silence full-text index creation warning
warnings.filterwarnings(
"ignore", message=".*rebuilding table to add column FTS_DOC_ID.*"
'ignore', message='.*rebuilding table to add column FTS_DOC_ID.*'
)


class BaseDumpExtractor:
"""Defines where to download a certain dump and how to post-process it."""
"""Method definitions to download catalog dumps, extract data, and
populate a database instance.
"""

def extract_and_populate(
self, dump_file_paths: Iterable[str], resolve: bool
):
"""Extract relevant data and populate SQL Alchemy entities accordingly.
:param dump_file_paths: Iterable of paths where downloaded dumps are
placed.
:param resolve: Tells if the system will resolve the urls to validate
them.
self, dump_file_paths: List[str], resolve: bool
) -> None:
"""Extract relevant data and populate
`SQLAlchemy <https://www.sqlalchemy.org/>`_ ORM entities accordingly.
Entities will then be persisted to a database instance.
:param dump_file_paths: paths to downloaded catalog dumps
:param resolve: whether to resolve URLs found in catalog dumps or not
"""
raise NotImplementedError

def get_dump_download_urls(self) -> Iterable[str]:
"""Get the dump download URL.
Useful if there is a way to compute the latest dump URL.
def get_dump_download_urls(self) -> Optional[List[str]]:
"""Get the dump download URLs of a target catalog.
Useful if there is a way to compute the latest URLs.
:return: the latest dump URLs, or ``None`` if they could not be computed
:rtype: Optional[List[str]]
"""
raise NotImplementedError
4 changes: 2 additions & 2 deletions soweego/importer/cli.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Click-command descriptions for the importer"""
"""Importer CLI commands."""

__author__ = 'Marco Fossati'
__email__ = 'fossati@spaziodati.eu'
Expand All @@ -19,4 +19,4 @@
@click.group(name='importer', commands=CLI_COMMANDS)
@click.pass_context
def cli(_):
"""Import target catalog dumps into the database."""
"""Import target catalog dumps into a SQL database."""
28 changes: 20 additions & 8 deletions soweego/importer/discogs_dump_extractor.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Discogs dump extractor"""
"""`Discogs <https://www.discogs.com/>`_ dump extractor."""

__author__ = 'Marco Fossati'
__email__ = 'fossati@spaziodati.eu'
Expand All @@ -15,7 +15,7 @@
import shutil
import xml.etree.ElementTree as et
from datetime import date, datetime
from typing import Iterable, Tuple
from typing import Iterable, Tuple, List, Optional

from lxml import etree
from requests import get
Expand Down Expand Up @@ -44,7 +44,8 @@


class DiscogsDumpExtractor(BaseDumpExtractor):
"""Defines where to download Discogs dump and how to post-process it"""
"""Download Discogs dumps, extract data, and
populate a database instance."""

# Counters
total_entities = 0
Expand All @@ -59,7 +60,7 @@ class DiscogsDumpExtractor(BaseDumpExtractor):

_sqlalchemy_commit_every = 100_000

def get_dump_download_urls(self) -> Iterable[str]:
def get_dump_download_urls(self) -> Optional[List[str]]:
urls = []
response = get(DUMP_LIST_URL_TEMPLATE.format(date.today().year))
root = et.fromstring(response.text)
Expand All @@ -76,15 +77,26 @@ def get_dump_download_urls(self) -> Iterable[str]:
break
if not urls:
LOGGER.error(
"""Failed to get the Discogs dump download URL: are we at the
very start of the year?"""
'Failed to get the Discogs dump download URL: are we at the '
'very start of the year?'
)
return None
return urls

def extract_and_populate(
self, dump_file_paths: Iterable[str], resolve: bool
):
self, dump_file_paths: List[str], resolve: bool
) -> None:
"""Extract relevant data from the *artists* (people)
and *masters* (works) Discogs dumps, preprocess them, populate
`SQLAlchemy <https://www.sqlalchemy.org/>`_ ORM entities, and persist
them to a database instance.
See :mod:`~soweego.importer.models.discogs_entity`
for the ORM definitions.
:param dump_file_paths: paths to downloaded catalog dumps
:param resolve: whether to resolve URLs found in catalog dumps or not
"""
self._process_artists_dump(dump_file_paths[0], resolve)
self._process_masters_dump(dump_file_paths[1])

Expand Down
31 changes: 13 additions & 18 deletions soweego/importer/imdb_dump_extractor.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""IMDB dump extractor"""
"""`IMDb <https://www.imdb.com/>`_ dump extractor."""

__author__ = 'Andrea Tupini'
__email__ = 'tupini07@gmail.com'
__version__ = '1.0'
__license__ = 'GPL-3.0'
__copyright__ = 'Copyleft 2018, tupini07'
__copyright__ = 'Copyleft 2019, tupini07'

import copy
import csv
Expand All @@ -31,7 +31,8 @@


class ImdbDumpExtractor(BaseDumpExtractor):
"""Defines where to download Imdb dump and how to post-process it"""
"""Download IMDb dumps, extract data, and
populate a database instance."""

# Counters
n_actors = 0
Expand All @@ -47,17 +48,11 @@ class ImdbDumpExtractor(BaseDumpExtractor):
_sqlalchemy_commit_every = 100_000

def get_dump_download_urls(self) -> List[str]:
"""
:return: the urls from which to download the data dumps
the first URL is the one for the **person dump**, the
second downloads the **movie dump**
"""
return [DUMP_URL_PERSON_INFO, DUMP_URL_MOVIE_INFO]

@staticmethod
def _normalize_null(entity: Dict) -> None:
"""
IMDB represents a null entry with \\N , this method converts
"""IMDb represents a null entry with \\N , this method converts
all \\N to None so that they're saved as null in the database.
This is done for all 'entries' of a given entity.
Expand All @@ -74,16 +69,16 @@ def _normalize_null(entity: Dict) -> None:
def extract_and_populate(
self, dump_file_paths: List[str], resolve: bool
) -> None:
"""
Extracts the data in the dumps (person and movie) and processes them.
It then proceeds to add the appropriate data to the database.
"""Extract relevant data from the *name* (people) and *title* (works)
IMDb dumps, preprocess them, populate
`SQLAlchemy <https://www.sqlalchemy.org/>`_ ORM entities, and persist
them to a database instance.
See
:ref:`soweego.importer.models.imdb_entity` module to see the SQLAlchemy
definition of the entities we use to save IMDB data.
See :mod:`~soweego.importer.models.imdb_entity`
for the ORM definitions.
:param dump_file_paths: the absolute paths of the already downloaded
dump files.
:param dump_file_paths: paths to downloaded catalog dumps
:param resolve: whether to resolve URLs found in catalog dumps or not
"""

# the order of these files is specified in `self.get_dump_download_urls`
Expand Down
38 changes: 18 additions & 20 deletions soweego/importer/importer.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Generic service for dump updating/importing"""
"""Download, extract, and import a supported catalog."""

__author__ = 'Massimo Frasson'
__email__ = 'maxfrax@gmail.com'
__version__ = '1.0'
__license__ = 'GPL-3.0'
__copyright__ = 'Copyleft 2018, Massimo Frasson'
__copyright__ = 'Copyleft 2018, MaxFrax96'

import datetime
import logging
Expand Down Expand Up @@ -42,15 +42,17 @@
@click.option(
'--url-check',
is_flag=True,
help='Check for rotten URLs while importing. Default: no.'
'WARNING: this will dramatically increase the import time.',
help=(
'Check for rotten URLs while importing. Default: no. '
'WARNING: this will dramatically increase the import time.'
),
)
@click.option(
'-d',
'--dir-io',
type=click.Path(file_okay=False),
default=constants.SHARED_FOLDER,
help=f"Input/output directory," f"default: '{constants.SHARED_FOLDER}'.",
help=f'Input/output directory, default: {constants.SHARED_FOLDER}.',
)
def import_cli(catalog: str, url_check: bool, dir_io: str) -> None:
"""Download, extract, and import a supported catalog."""
Expand All @@ -69,11 +71,7 @@ def _resolve_url(res):
'catalog', type=click.Choice(target_database.supported_targets())
)
def check_links_cli(catalog: str):
"""
Check for rotten URLs of an imported catalog.
:param catalog: one of the keys of constants.TARGET_CATALOGS
"""
"""Check for rotten URLs of an imported catalog."""
for entity_type in target_database.supported_entities_for_target(catalog):

LOGGER.info("Validating %s %s links...", catalog, entity_type)
Expand Down Expand Up @@ -116,19 +114,19 @@ def check_links_cli(catalog: str):


class Importer:
"""Downloads the latest dump of a certain target"""
"""Handle a catalog dump: check its freshness and dispatch the appropriate
extractor."""

def refresh_dump(
self, output_folder: str, extractor: BaseDumpExtractor, resolve: bool
):
"""
Downloads the dump, if necessary, and calls the handler over the dump
file.
:param output_folder: folder in which the downloaded dumps will be
stored
:param extractor: BaseDumpExtractor implementation to process the dump
:param resolve: try to resolve each url in the dump to check if it
works?
"""Eventually download the latest dump, and call the
corresponding extractor.
:param output_folder: a path where the downloaded dumps will be stored
:param extractor: :class:`~soweego.importer.base_dump_extractor.BaseDumpExtractor`
implementation to process the dump
:param resolve: whether to resolve URLs found in catalog dumps or not
"""
filepaths = []

Expand Down Expand Up @@ -171,5 +169,5 @@ def refresh_dump(

@staticmethod
def _update_dump(dump_url: str, file_output_path: str):
"""Download the dump"""
"""Download the dump."""
client.download_file(dump_url, file_output_path)
9 changes: 9 additions & 0 deletions soweego/importer/models/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""`SQLAlchemy <https://www.sqlalchemy.org/>`_
Object Relational Mapper (*ORM*)
`declarations <https://docs.sqlalchemy.org/en/13/orm/tutorial.html#declare-a-mapping>`_,
implemented as a set of classes.
All class attributes are :class:`~sqlalchemy.Column` objects representing
columns of a SQL database table. Data types are detailed in the *Attributes*
section of each class.
"""
35 changes: 32 additions & 3 deletions soweego/importer/models/base_entity.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Base SQL Alchemy ORM entity"""
"""Base `SQLAlchemy <https://www.sqlalchemy.org/>`_ ORM entities."""

__author__ = 'Edoardo Lenzi'
__email__ = 'edoardolenzi9@gmail.com'
Expand All @@ -20,7 +20,24 @@


class BaseEntity(AbstractConcreteBase, BASE):
"""Each database entity should conform with this interface"""
"""Minimal ORM structure for a target catalog entry.
Each ORM entity should implement this interface.
**Attributes:**
- **internal_id** (integer) - an internal primary key
- **catalog_id** (string(50)) - a target catalog identifier
- **name** (text) - a full name (person), or full title (work)
- **name_tokens** (text) - a **name** tokenized through
:func:`~soweego.commons.text_utils.tokenize`
- **born** (date) - a birth (person), or publication (work) date
- **born_precision** (integer) - a birth (person), or publication (work)
date precision
- **died** (date) - a death date. Only applies to a person
- **died_precision** (integer) - a death date precision.
Only applies to a person
"""

__tablename__ = None
internal_id = Column(
Expand Down Expand Up @@ -60,7 +77,19 @@ def __repr__(self) -> str:


class BaseRelationship(AbstractConcreteBase, BASE):
"""Each database relationship should conform with this interface"""
"""Minimal ORM structure for a target catalog relationship
between entries. Each ORM relationship entity should implement this
interface.
You can build a relationship for different purposes:
typically, to connect works with people, or groups with individuals.
**Attributes:**
- **from_catalog_id** (string(50)) - a target catalog identifier
- **to_catalog_id** (string(50)) - a target catalog identifier
"""

__tablename__ = None
internal_id = Column(
Expand Down

0 comments on commit 4bb7d52

Please sign in to comment.