Skip to content

Commit

Permalink
uplift documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
Marco Fossati committed Jul 11, 2019
1 parent 3cdcef8 commit 4bb7d52
Show file tree
Hide file tree
Showing 12 changed files with 162 additions and 79 deletions.
2 changes: 1 addition & 1 deletion soweego/importer/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
"""Importer module - Generic service for dump updating/importing"""
"""Import target catalog dumps into a SQL database."""
36 changes: 20 additions & 16 deletions soweego/importer/base_dump_extractor.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Dump extractor abstract class"""
"""Abstract class for catalog dumps extraction."""

import logging
import warnings
from typing import Iterable
from typing import List, Optional

__author__ = 'Marco Fossati'
__email__ = 'fossati@spaziodati.eu'
Expand All @@ -13,31 +14,34 @@
__copyright__ = 'Copyleft 2018, Hjfocs'

LOGGER = logging.getLogger(__name__)

# Silence full-text index creation warning
warnings.filterwarnings(
"ignore", message=".*rebuilding table to add column FTS_DOC_ID.*"
'ignore', message='.*rebuilding table to add column FTS_DOC_ID.*'
)


class BaseDumpExtractor:
"""Defines where to download a certain dump and how to post-process it."""
"""Method definitions to download catalog dumps, extract data, and
populate a database instance.
"""

def extract_and_populate(
self, dump_file_paths: Iterable[str], resolve: bool
):
"""Extract relevant data and populate SQL Alchemy entities accordingly.
:param dump_file_paths: Iterable of paths where downloaded dumps are
placed.
:param resolve: Tells if the system will resolve the urls to validate
them.
self, dump_file_paths: List[str], resolve: bool
) -> None:
"""Extract relevant data and populate
`SQLAlchemy <https://www.sqlalchemy.org/>`_ ORM entities accordingly.
Entities will then be persisted to a database instance.
:param dump_file_paths: paths to downloaded catalog dumps
:param resolve: whether to resolve URLs found in catalog dumps or not
"""
raise NotImplementedError

def get_dump_download_urls(self) -> Iterable[str]:
"""Get the dump download URL.
Useful if there is a way to compute the latest dump URL.
def get_dump_download_urls(self) -> Optional[List[str]]:
"""Get the dump download URLs of a target catalog.
Useful if there is a way to compute the latest URLs.
:return: the latest dump URLs, or ``None`` if they could not be computed
:rtype: Optional[List[str]]
"""
raise NotImplementedError
4 changes: 2 additions & 2 deletions soweego/importer/cli.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Click-command descriptions for the importer"""
"""Importer CLI commands."""

__author__ = 'Marco Fossati'
__email__ = 'fossati@spaziodati.eu'
Expand All @@ -19,4 +19,4 @@
@click.group(name='importer', commands=CLI_COMMANDS)
@click.pass_context
def cli(_):
"""Import target catalog dumps into the database."""
"""Import target catalog dumps into a SQL database."""
28 changes: 20 additions & 8 deletions soweego/importer/discogs_dump_extractor.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Discogs dump extractor"""
"""`Discogs <https://www.discogs.com/>`_ dump extractor."""

__author__ = 'Marco Fossati'
__email__ = 'fossati@spaziodati.eu'
Expand All @@ -15,7 +15,7 @@
import shutil
import xml.etree.ElementTree as et
from datetime import date, datetime
from typing import Iterable, Tuple
from typing import Iterable, Tuple, List, Optional

from lxml import etree
from requests import get
Expand Down Expand Up @@ -44,7 +44,8 @@


class DiscogsDumpExtractor(BaseDumpExtractor):
"""Defines where to download Discogs dump and how to post-process it"""
"""Download Discogs dumps, extract data, and
populate a database instance."""

# Counters
total_entities = 0
Expand All @@ -59,7 +60,7 @@ class DiscogsDumpExtractor(BaseDumpExtractor):

_sqlalchemy_commit_every = 100_000

def get_dump_download_urls(self) -> Iterable[str]:
def get_dump_download_urls(self) -> Optional[List[str]]:
urls = []
response = get(DUMP_LIST_URL_TEMPLATE.format(date.today().year))
root = et.fromstring(response.text)
Expand All @@ -76,15 +77,26 @@ def get_dump_download_urls(self) -> Iterable[str]:
break
if not urls:
LOGGER.error(
"""Failed to get the Discogs dump download URL: are we at the
very start of the year?"""
'Failed to get the Discogs dump download URL: are we at the '
'very start of the year?'
)
return None
return urls

def extract_and_populate(
self, dump_file_paths: Iterable[str], resolve: bool
):
self, dump_file_paths: List[str], resolve: bool
) -> None:
"""Extract relevant data from the *artists* (people)
and *masters* (works) Discogs dumps, preprocess them, populate
`SQLAlchemy <https://www.sqlalchemy.org/>`_ ORM entities, and persist
them to a database instance.
See :mod:`~soweego.importer.models.discogs_entity`
for the ORM definitions.
:param dump_file_paths: paths to downloaded catalog dumps
:param resolve: whether to resolve URLs found in catalog dumps or not
"""
self._process_artists_dump(dump_file_paths[0], resolve)
self._process_masters_dump(dump_file_paths[1])

Expand Down
31 changes: 13 additions & 18 deletions soweego/importer/imdb_dump_extractor.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""IMDB dump extractor"""
"""`IMDb <https://www.imdb.com/>`_ dump extractor."""

__author__ = 'Andrea Tupini'
__email__ = 'tupini07@gmail.com'
__version__ = '1.0'
__license__ = 'GPL-3.0'
__copyright__ = 'Copyleft 2018, tupini07'
__copyright__ = 'Copyleft 2019, tupini07'

import copy
import csv
Expand All @@ -31,7 +31,8 @@


class ImdbDumpExtractor(BaseDumpExtractor):
"""Defines where to download Imdb dump and how to post-process it"""
"""Download IMDb dumps, extract data, and
populate a database instance."""

# Counters
n_actors = 0
Expand All @@ -47,17 +48,11 @@ class ImdbDumpExtractor(BaseDumpExtractor):
_sqlalchemy_commit_every = 100_000

def get_dump_download_urls(self) -> List[str]:
"""
:return: the urls from which to download the data dumps
the first URL is the one for the **person dump**, the
second downloads the **movie dump**
"""
return [DUMP_URL_PERSON_INFO, DUMP_URL_MOVIE_INFO]

@staticmethod
def _normalize_null(entity: Dict) -> None:
"""
IMDB represents a null entry with \\N , this method converts
"""IMDb represents a null entry with \\N , this method converts
all \\N to None so that they're saved as null in the database.
This is done for all 'entries' of a given entity.
Expand All @@ -74,16 +69,16 @@ def _normalize_null(entity: Dict) -> None:
def extract_and_populate(
self, dump_file_paths: List[str], resolve: bool
) -> None:
"""
Extracts the data in the dumps (person and movie) and processes them.
It then proceeds to add the appropriate data to the database.
"""Extract relevant data from the *name* (people) and *title* (works)
IMDb dumps, preprocess them, populate
`SQLAlchemy <https://www.sqlalchemy.org/>`_ ORM entities, and persist
them to a database instance.
See
:ref:`soweego.importer.models.imdb_entity` module to see the SQLAlchemy
definition of the entities we use to save IMDB data.
See :mod:`~soweego.importer.models.imdb_entity`
for the ORM definitions.
:param dump_file_paths: the absolute paths of the already downloaded
dump files.
:param dump_file_paths: paths to downloaded catalog dumps
:param resolve: whether to resolve URLs found in catalog dumps or not
"""

# the order of these files is specified in `self.get_dump_download_urls`
Expand Down
38 changes: 18 additions & 20 deletions soweego/importer/importer.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Generic service for dump updating/importing"""
"""Download, extract, and import a supported catalog."""

__author__ = 'Massimo Frasson'
__email__ = 'maxfrax@gmail.com'
__version__ = '1.0'
__license__ = 'GPL-3.0'
__copyright__ = 'Copyleft 2018, Massimo Frasson'
__copyright__ = 'Copyleft 2018, MaxFrax96'

import datetime
import logging
Expand Down Expand Up @@ -42,15 +42,17 @@
@click.option(
'--url-check',
is_flag=True,
help='Check for rotten URLs while importing. Default: no.'
'WARNING: this will dramatically increase the import time.',
help=(
'Check for rotten URLs while importing. Default: no. '
'WARNING: this will dramatically increase the import time.'
),
)
@click.option(
'-d',
'--dir-io',
type=click.Path(file_okay=False),
default=constants.SHARED_FOLDER,
help=f"Input/output directory," f"default: '{constants.SHARED_FOLDER}'.",
help=f'Input/output directory, default: {constants.SHARED_FOLDER}.',
)
def import_cli(catalog: str, url_check: bool, dir_io: str) -> None:
"""Download, extract, and import a supported catalog."""
Expand All @@ -69,11 +71,7 @@ def _resolve_url(res):
'catalog', type=click.Choice(target_database.supported_targets())
)
def check_links_cli(catalog: str):
"""
Check for rotten URLs of an imported catalog.
:param catalog: one of the keys of constants.TARGET_CATALOGS
"""
"""Check for rotten URLs of an imported catalog."""
for entity_type in target_database.supported_entities_for_target(catalog):

LOGGER.info("Validating %s %s links...", catalog, entity_type)
Expand Down Expand Up @@ -116,19 +114,19 @@ def check_links_cli(catalog: str):


class Importer:
"""Downloads the latest dump of a certain target"""
"""Handle a catalog dump: check its freshness and dispatch the appropriate
extractor."""

def refresh_dump(
self, output_folder: str, extractor: BaseDumpExtractor, resolve: bool
):
"""
Downloads the dump, if necessary, and calls the handler over the dump
file.
:param output_folder: folder in which the downloaded dumps will be
stored
:param extractor: BaseDumpExtractor implementation to process the dump
:param resolve: try to resolve each url in the dump to check if it
works?
"""Eventually download the latest dump, and call the
corresponding extractor.
:param output_folder: a path where the downloaded dumps will be stored
:param extractor: :class:`~soweego.importer.base_dump_extractor.BaseDumpExtractor`
implementation to process the dump
:param resolve: whether to resolve URLs found in catalog dumps or not
"""
filepaths = []

Expand Down Expand Up @@ -171,5 +169,5 @@ def refresh_dump(

@staticmethod
def _update_dump(dump_url: str, file_output_path: str):
"""Download the dump"""
"""Download the dump."""
client.download_file(dump_url, file_output_path)
9 changes: 9 additions & 0 deletions soweego/importer/models/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""`SQLAlchemy <https://www.sqlalchemy.org/>`_
Object Relational Mapper (*ORM*)
`declarations <https://docs.sqlalchemy.org/en/13/orm/tutorial.html#declare-a-mapping>`_,
implemented as a set of classes.
All class attributes are :class:`~sqlalchemy.Column` objects representing
columns of a SQL database table. Data types are detailed in the *Attributes*
section of each class.
"""
35 changes: 32 additions & 3 deletions soweego/importer/models/base_entity.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Base SQL Alchemy ORM entity"""
"""Base `SQLAlchemy <https://www.sqlalchemy.org/>`_ ORM entities."""

__author__ = 'Edoardo Lenzi'
__email__ = 'edoardolenzi9@gmail.com'
Expand All @@ -20,7 +20,24 @@


class BaseEntity(AbstractConcreteBase, BASE):
"""Each database entity should conform with this interface"""
"""Minimal ORM structure for a target catalog entry.
Each ORM entity should implement this interface.
**Attributes:**
- **internal_id** (integer) - an internal primary key
- **catalog_id** (string(50)) - a target catalog identifier
- **name** (text) - a full name (person), or full title (work)
- **name_tokens** (text) - a **name** tokenized through
:func:`~soweego.commons.text_utils.tokenize`
- **born** (date) - a birth (person), or publication (work) date
- **born_precision** (integer) - a birth (person), or publication (work)
date precision
- **died** (date) - a death date. Only applies to a person
- **died_precision** (integer) - a death date precision.
Only applies to a person
"""

__tablename__ = None
internal_id = Column(
Expand Down Expand Up @@ -60,7 +77,19 @@ def __repr__(self) -> str:


class BaseRelationship(AbstractConcreteBase, BASE):
"""Each database relationship should conform with this interface"""
"""Minimal ORM structure for a target catalog relationship
between entries. Each ORM relationship entity should implement this
interface.
You can build a relationship for different purposes:
typically, to connect works with people, or groups with individuals.
**Attributes:**
- **from_catalog_id** (string(50)) - a target catalog identifier
- **to_catalog_id** (string(50)) - a target catalog identifier
"""

__tablename__ = None
internal_id = Column(
Expand Down

0 comments on commit 4bb7d52

Please sign in to comment.