Skip to content

Commit

Permalink
Merge pull request #339 from Wikidata/linker-refactoring-a
Browse files Browse the repository at this point in the history
Make the linker module shine
  • Loading branch information
marfox committed Jul 4, 2019
2 parents cf82439 + e5ac257 commit 849e414
Show file tree
Hide file tree
Showing 32 changed files with 2,923 additions and 2,163 deletions.
2 changes: 2 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ tqdm = "*"
lxml = "*"
tensorflow = "*"
keras = "*"
joblib = "*"

[dev-packages]
pylint = "*"
Expand All @@ -30,3 +31,4 @@ black = "*"
autopep8 = "*"
sphinx = "*"
sphinx-autodoc-typehints = "*"
sphinx-click = "*"
317 changes: 167 additions & 150 deletions Pipfile.lock

Large diffs are not rendered by default.

3 changes: 0 additions & 3 deletions docs/cli.rst
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
The Command Line
================

.. note::


Importer
--------

Expand Down
2 changes: 2 additions & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.intersphinx',
'sphinx.ext.viewcode',
'sphinx_autodoc_typehints',
'sphinx_click.ext'
]
Expand Down Expand Up @@ -87,6 +88,7 @@
intersphinx_mapping = {
'python': ('https://docs.python.org/3/', None),
'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None),
'recordlinkage': ('https://recordlinkage.readthedocs.io/en/latest/', None),
'requests': ('https://2.python-requests.org/en/stable/', None),
'sklearn': ('https://scikit-learn.org/stable/', None),
}
2 changes: 2 additions & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,9 @@ API documentation
.. toctree::
:maxdepth: 2

cli
ingestor
linker
validator
wikidata

Expand Down
66 changes: 66 additions & 0 deletions docs/linker.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
:mod:`soweego.linker`
========================

.. automodule:: soweego.linker
:members:


:mod:`soweego.linker.workflow`
--------------------------------------

.. automodule:: soweego.linker.workflow
:members:


:mod:`soweego.linker.blocking`
--------------------------------------

.. automodule:: soweego.linker.blocking
:members:


:mod:`soweego.linker.features`
--------------------------------------

.. automodule:: soweego.linker.features

.. autoclass:: ExactMatch
:special-members: __init__

.. autoclass:: SimilarStrings
:special-members: __init__

.. autoclass:: SimilarDates
:special-members: __init__

.. autoclass:: SharedTokens
:special-members: __init__

.. autoclass:: SharedOccupations
:special-members: __init__

.. autoclass:: SharedTokensPlus
:special-members: __init__


:mod:`soweego.linker.classifiers`
--------------------------------------

.. automodule:: soweego.linker.classifiers
:members:


:mod:`soweego.linker.train`
--------------------------------------

.. automodule:: soweego.linker.train
:members:


:mod:`soweego.linker.link`
--------------------------------------

.. automodule:: soweego.linker.link
:members:


60 changes: 29 additions & 31 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,24 @@ absl-py==0.7.1
alabaster==0.7.12
appdirs==1.4.3
appnope==0.1.0
astor==0.7.1
astor==0.8.0
astroid==2.2.5
attrs==19.1.0
autoflake==1.3
autopep8==1.4.4
Babel==2.7.0
backcall==0.1.0
black==19.3b0
certifi==2019.3.9
certifi==2019.6.16
chardet==3.0.4
Click==7.0
decorator==4.4.0
docutils==0.14
entrypoints==0.3
flake8==3.7.7
gast==0.2.2
graphviz==0.11
grpcio==1.20.1
google-pasta==0.1.7
grpcio==1.22.0rc1
h5py==2.9.0
idna==2.8
imagesize==1.1.0
Expand All @@ -28,67 +28,65 @@ ipython==7.5.0
ipython-genutils==0.2.0
iso8601==0.1.12
isort==4.3.20
jedi==0.13.3
jellyfish==0.7.1
jedi==0.14.0
jellyfish==0.7.2
Jinja2==2.10.1
joblib==0.13.2
Keras==2.2.4
Keras-Applications==1.0.7
Keras-Preprocessing==1.0.9
lazy-object-proxy==1.3.1
lxml==4.3.3
Markdown==3.1
Keras-Applications==1.0.8
Keras-Preprocessing==1.1.0
lazy-object-proxy==1.4.1
lxml==4.3.4
Markdown==3.1.1
MarkupSafe==1.1.1
mccabe==0.6.1
mem-top==0.1.6
mock==2.0.0
numpy==1.16.3
numpy==1.16.4
packaging==19.0
objgraph==3.4.1
pandas==0.24.2
parso==0.4.0
pbr==5.2.1
parso==0.5.0
pbr==5.3.1
pexpect==4.7.0
pickleshare==0.7.5
prompt-toolkit==2.0.9
protobuf==3.7.1
protobuf==3.8.0
ptyprocess==0.6.0
pycodestyle==2.5.0
pyflakes==2.1.1
Pygments==2.3.1
Pygments==2.4.2
pylint==2.3.1
PyMySQL==0.9.3
pyparsing==2.4.0
python-dateutil==2.8.0
pytz==2019.1
pywikibot==3.0.20190430
PyYAML==5.1
PyYAML==5.1.1
recordlinkage==0.13.2
regex==2019.4.14
requests==2.21.0
regex==2019.6.8
requests==2.22.0
rope==0.14.0
scikit-learn==0.20.3
scipy==1.2.1
scikit-learn==0.21.2
scipy==1.3.0
six==1.12.0
snowballstemmer==1.2.1
Sphinx==2.1.0
Sphinx==2.1.2
sphinx-autodoc-typehints==1.6.0
sphinx-click==2.2.0
sphinxcontrib-applehelp==1.0.1
sphinxcontrib-devhelp==1.0.1
sphinxcontrib-htmlhelp==1.0.2
sphinxcontrib-jsmath==1.0.1
sphinxcontrib-qthelp==1.0.2
sphinxcontrib-serializinghtml==1.1.3
SQLAlchemy==1.3.3
SQLAlchemy==1.3.5
tensorboard==1.13.1
tensorflow==1.13.1
tensorflow-estimator==1.13.0
termcolor==1.1.0
toml==0.10.0
tqdm==4.31.1
tqdm==4.32.2
traitlets==4.3.2
typed-ast==1.3.4
urllib3==1.24.2
typed-ast==1.4.0
urllib3==1.25.3
wcwidth==0.1.7
Werkzeug==0.15.2
wrapt==1.11.1
Werkzeug==0.15.4
wrapt==1.11.2
6 changes: 6 additions & 0 deletions soweego/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import logging

import click
import tensorflow as tf

from soweego import commons
from soweego import pipeline as pipeline_cli
Expand All @@ -20,6 +21,11 @@
from soweego.linker import cli as linker_cli
from soweego.validator import cli as validator_cli

# Suppress TensorFlow warning output by raising the logging
# verbosity threshold (only errors are printed)
tf.logging.set_verbosity(tf.logging.ERROR)


CLI_COMMANDS = {
'importer': importer_cli.cli,
'ingest': ingestor_cli.cli,
Expand Down
79 changes: 55 additions & 24 deletions soweego/commons/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
__license__ = 'GPL-3.0'
__copyright__ = 'Copyleft 2018, Hjfocs'

import os
from typing import TypeVar

from soweego.commons import keys
Expand Down Expand Up @@ -48,7 +49,9 @@
DEFAULT_CREDENTIALS_MODULE = 'soweego.importer.resources'
DEFAULT_CREDENTIALS_FILENAME = 'credentials.json'
DEFAULT_CREDENTIALS_LOCATION = (
DEFAULT_CREDENTIALS_MODULE, DEFAULT_CREDENTIALS_FILENAME)
DEFAULT_CREDENTIALS_MODULE,
DEFAULT_CREDENTIALS_FILENAME,
)
CREDENTIALS_LOCATION = '/app/shared/credentials.json'

# As per https://meta.wikimedia.org/wiki/User-Agent_policy
Expand Down Expand Up @@ -216,31 +219,56 @@
keys.REAL_NAME,
)

# File names & folders
# Folders
SHARED_FOLDER = '/app/shared/'
WD_TRAINING_SET = 'wikidata/wikidata_%s_%s_training_set.jsonl.gz'
WD_CLASSIFICATION_SET = 'wikidata/wikidata_%s_%s_classification_set.jsonl.gz'
SAMPLES = 'samples/%s_%s_%s_samples%02d.pkl.gz'
FEATURES = 'features/%s_%s_%s_features%02d.pkl.gz'
LINKER_MODEL = 'models/%s_%s_%s_model.pkl'
LINKER_NESTED_CV_BEST_MODEL = '%models/s_%s_%s_best_model_k%02d.pkl'
LINKER_RESULT = 'results/%s_%s_%s_linker_result.csv.gz'
LINKER_EVALUATION_PREDICTIONS = (
'results/%s_%s_%s_linker_evaluation_predictions.csv.gz'
WD_FOLDER = 'wikidata'
SAMPLES_FOLDER = 'samples'
FEATURES_FOLDERS = 'features'
MODELS_FOLDER = 'models'
RESULTS_FOLDER = 'results'
NN_CHECKPOINT_FOLDER = 'best_model_checkpoint'

# File names
NN_CHECKPOINT_FILENAME = '{}_best_checkpoint_model.hdf5'
EVALUATION_PERFORMANCE_FILENAME = '{}_{}_{}_performance.txt'
EVALUATION_PREDICTIONS_FILENAME = '{}_{}_{}_evaluation_links.csv.gz'
RESULT_FILENAME = '{}_{}_{}_links.csv.gz'
NESTED_CV_BEST_MODEL_FILENAME = '{}_{}_{}_best_model_k{:02}.pkl'
MODEL_FILENAME = '{}_{}_{}_model.pkl'
FEATURES_FILENAME = '{}_{}_{}_features{:02}.pkl.gz'
SAMPLES_FILENAME = '{}_{}_{}_samples{:02}.pkl.gz'
WD_CLASSIFICATION_SET_FILENAME = 'wikidata_{}_{}_classification_set.jsonl.gz'
WD_TRAINING_SET_FILENAME = 'wikidata_{}_{}_training_set.jsonl.gz'
EXTRACTED_LINKS_FILENAME = '{}_{}_extracted_links.csv'
BASELINE_PERFECT_FILENAME = '{}_{}_baseline_perfect_names.csv'
BASELINE_LINKS_FILENAME = '{}_{}_baseline_similar_links.csv'
BASELINE_NAMES_FILENAME = '{}_{}_baseline_similar_names.csv'
WIKIDATA_API_SESSION = 'wd_api_session.pkl'
WORKS_BY_PEOPLE_STATEMENTS = '%s_works_by_%s_statements.csv'

# Paths
WD_TRAINING_SET = os.path.join(WD_FOLDER, WD_TRAINING_SET_FILENAME)
WD_CLASSIFICATION_SET = os.path.join(WD_FOLDER, WD_CLASSIFICATION_SET_FILENAME)
SAMPLES = os.path.join(SAMPLES_FOLDER, SAMPLES_FILENAME)
FEATURES = os.path.join(FEATURES_FOLDERS, FEATURES_FILENAME)
LINKER_MODEL = os.path.join(MODELS_FOLDER, MODEL_FILENAME)
LINKER_NESTED_CV_BEST_MODEL = os.path.join(
MODELS_FOLDER, NESTED_CV_BEST_MODEL_FILENAME
)
LINKER_PERFORMANCE = 'results/%s_%s_%s_linker_performance.txt'
NEURAL_NETWORK_CHECKPOINT_MODEL = (
'best_model_checkpoint/%s_best_checkpoint_model.hdf5'
LINKER_RESULT = os.path.join(RESULTS_FOLDER, RESULT_FILENAME)
LINKER_EVALUATION_PREDICTIONS = os.path.join(
RESULTS_FOLDER, EVALUATION_PREDICTIONS_FILENAME
)
COMPLETE_FEATURE_VECTORS = 'features/%s_%s_%s_complete_feature_vectors.pkl.gz'
COMPLETE_WIKIDATA_CHUNKS = 'wikidata/%s_%s_%s_complete_wikidata_chunks.pkl.gz'
COMPLETE_TARGET_CHUNKS = 'samples/%s_%s_%s_complete_target_chunks.pkl.gz'
COMPLETE_POSITIVE_SAMPLES_INDEX = (
'samples/%s_%s_%s_complete_positive_samples_index.pkl.gz'
LINKER_PERFORMANCE = os.path.join(
RESULTS_FOLDER, EVALUATION_PERFORMANCE_FILENAME
)
WIKIDATA_API_SESSION = 'wd_api_session.pkl'
WORKS_BY_PEOPLE_STATEMENTS = '%s_works_by_%s_statements.csv'
TENSOR_BOARD = 'tensor_board/'
NEURAL_NETWORK_CHECKPOINT_MODEL = os.path.join(
NN_CHECKPOINT_FOLDER, NN_CHECKPOINT_FILENAME
)
EXTRACTED_LINKS = os.path.join(RESULTS_FOLDER, EXTRACTED_LINKS_FILENAME)
BASELINE_PERFECT = os.path.join(RESULTS_FOLDER, BASELINE_PERFECT_FILENAME)
BASELINE_LINKS = os.path.join(RESULTS_FOLDER, BASELINE_LINKS_FILENAME)
BASELINE_NAMES = os.path.join(RESULTS_FOLDER, BASELINE_NAMES_FILENAME)

CLASSIFIERS = {
'naive_bayes': keys.NAIVE_BAYES,
Expand Down Expand Up @@ -283,13 +311,16 @@
FEATURE_MISSING_VALUE = 0.0

# Neural networks-specific
ACTIVATION = 'sigmoid'
OPTIMIZER = 'adam'
OUTPUT_ACTIVATION = 'sigmoid'
HIDDEN_ACTIVATION = 'relu'
SLP_OPTIMIZER = 'adam'
MLP_OPTIMIZER = 'adadelta'
LOSS = 'binary_crossentropy'
METRICS = ['accuracy']
BATCH_SIZE = 1024
EPOCHS = 1000
VALIDATION_SPLIT = 0.33
NAIVE_BAYES_BINARIZE = 0.1

# precisions for the `pandas.Period` class.
# Listed from least to most precise, as defined here:
Expand Down

0 comments on commit 849e414

Please sign in to comment.