Skip to content

Commit

Permalink
Merge pull request #339 from Wikidata/linker-refactoring-a
Browse files Browse the repository at this point in the history
Make the linker module shine
  • Loading branch information
marfox committed Jul 4, 2019
2 parents cf82439 + e5ac257 commit 849e414
Show file tree
Hide file tree
Showing 32 changed files with 2,923 additions and 2,163 deletions.
2 changes: 2 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ tqdm = "*"
lxml = "*"
tensorflow = "*"
keras = "*"
joblib = "*"

[dev-packages]
pylint = "*"
Expand All @@ -30,3 +31,4 @@ black = "*"
autopep8 = "*"
sphinx = "*"
sphinx-autodoc-typehints = "*"
sphinx-click = "*"
317 changes: 167 additions & 150 deletions Pipfile.lock

Large diffs are not rendered by default.

3 changes: 0 additions & 3 deletions docs/cli.rst
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
The Command Line
================

.. note::


Importer
--------

Expand Down
2 changes: 2 additions & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.intersphinx',
'sphinx.ext.viewcode',
'sphinx_autodoc_typehints',
'sphinx_click.ext'
]
Expand Down Expand Up @@ -87,6 +88,7 @@
intersphinx_mapping = {
'python': ('https://docs.python.org/3/', None),
'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None),
'recordlinkage': ('https://recordlinkage.readthedocs.io/en/latest/', None),
'requests': ('https://2.python-requests.org/en/stable/', None),
'sklearn': ('https://scikit-learn.org/stable/', None),
}
2 changes: 2 additions & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,9 @@ API documentation
.. toctree::
:maxdepth: 2

cli
ingestor
linker
validator
wikidata

Expand Down
66 changes: 66 additions & 0 deletions docs/linker.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
:mod:`soweego.linker`
========================

.. automodule:: soweego.linker
:members:


:mod:`soweego.linker.workflow`
--------------------------------------

.. automodule:: soweego.linker.workflow
:members:


:mod:`soweego.linker.blocking`
--------------------------------------

.. automodule:: soweego.linker.blocking
:members:


:mod:`soweego.linker.features`
--------------------------------------

.. automodule:: soweego.linker.features

.. autoclass:: ExactMatch
:special-members: __init__

.. autoclass:: SimilarStrings
:special-members: __init__

.. autoclass:: SimilarDates
:special-members: __init__

.. autoclass:: SharedTokens
:special-members: __init__

.. autoclass:: SharedOccupations
:special-members: __init__

.. autoclass:: SharedTokensPlus
:special-members: __init__


:mod:`soweego.linker.classifiers`
--------------------------------------

.. automodule:: soweego.linker.classifiers
:members:


:mod:`soweego.linker.train`
--------------------------------------

.. automodule:: soweego.linker.train
:members:


:mod:`soweego.linker.link`
--------------------------------------

.. automodule:: soweego.linker.link
:members:


60 changes: 29 additions & 31 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,24 @@ absl-py==0.7.1
alabaster==0.7.12
appdirs==1.4.3
appnope==0.1.0
astor==0.7.1
astor==0.8.0
astroid==2.2.5
attrs==19.1.0
autoflake==1.3
autopep8==1.4.4
Babel==2.7.0
backcall==0.1.0
black==19.3b0
certifi==2019.3.9
certifi==2019.6.16
chardet==3.0.4
Click==7.0
decorator==4.4.0
docutils==0.14
entrypoints==0.3
flake8==3.7.7
gast==0.2.2
graphviz==0.11
grpcio==1.20.1
google-pasta==0.1.7
grpcio==1.22.0rc1
h5py==2.9.0
idna==2.8
imagesize==1.1.0
Expand All @@ -28,67 +28,65 @@ ipython==7.5.0
ipython-genutils==0.2.0
iso8601==0.1.12
isort==4.3.20
jedi==0.13.3
jellyfish==0.7.1
jedi==0.14.0
jellyfish==0.7.2
Jinja2==2.10.1
joblib==0.13.2
Keras==2.2.4
Keras-Applications==1.0.7
Keras-Preprocessing==1.0.9
lazy-object-proxy==1.3.1
lxml==4.3.3
Markdown==3.1
Keras-Applications==1.0.8
Keras-Preprocessing==1.1.0
lazy-object-proxy==1.4.1
lxml==4.3.4
Markdown==3.1.1
MarkupSafe==1.1.1
mccabe==0.6.1
mem-top==0.1.6
mock==2.0.0
numpy==1.16.3
numpy==1.16.4
packaging==19.0
objgraph==3.4.1
pandas==0.24.2
parso==0.4.0
pbr==5.2.1
parso==0.5.0
pbr==5.3.1
pexpect==4.7.0
pickleshare==0.7.5
prompt-toolkit==2.0.9
protobuf==3.7.1
protobuf==3.8.0
ptyprocess==0.6.0
pycodestyle==2.5.0
pyflakes==2.1.1
Pygments==2.3.1
Pygments==2.4.2
pylint==2.3.1
PyMySQL==0.9.3
pyparsing==2.4.0
python-dateutil==2.8.0
pytz==2019.1
pywikibot==3.0.20190430
PyYAML==5.1
PyYAML==5.1.1
recordlinkage==0.13.2
regex==2019.4.14
requests==2.21.0
regex==2019.6.8
requests==2.22.0
rope==0.14.0
scikit-learn==0.20.3
scipy==1.2.1
scikit-learn==0.21.2
scipy==1.3.0
six==1.12.0
snowballstemmer==1.2.1
Sphinx==2.1.0
Sphinx==2.1.2
sphinx-autodoc-typehints==1.6.0
sphinx-click==2.2.0
sphinxcontrib-applehelp==1.0.1
sphinxcontrib-devhelp==1.0.1
sphinxcontrib-htmlhelp==1.0.2
sphinxcontrib-jsmath==1.0.1
sphinxcontrib-qthelp==1.0.2
sphinxcontrib-serializinghtml==1.1.3
SQLAlchemy==1.3.3
SQLAlchemy==1.3.5
tensorboard==1.13.1
tensorflow==1.13.1
tensorflow-estimator==1.13.0
termcolor==1.1.0
toml==0.10.0
tqdm==4.31.1
tqdm==4.32.2
traitlets==4.3.2
typed-ast==1.3.4
urllib3==1.24.2
typed-ast==1.4.0
urllib3==1.25.3
wcwidth==0.1.7
Werkzeug==0.15.2
wrapt==1.11.1
Werkzeug==0.15.4
wrapt==1.11.2
6 changes: 6 additions & 0 deletions soweego/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import logging

import click
import tensorflow as tf

from soweego import commons
from soweego import pipeline as pipeline_cli
Expand All @@ -20,6 +21,11 @@
from soweego.linker import cli as linker_cli
from soweego.validator import cli as validator_cli

# Suppress TensorFlow warning output by raising the logging
# verbosity threshold (only errors are printed)
tf.logging.set_verbosity(tf.logging.ERROR)


CLI_COMMANDS = {
'importer': importer_cli.cli,
'ingest': ingestor_cli.cli,
Expand Down
79 changes: 55 additions & 24 deletions soweego/commons/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
__license__ = 'GPL-3.0'
__copyright__ = 'Copyleft 2018, Hjfocs'

import os
from typing import TypeVar

from soweego.commons import keys
Expand Down Expand Up @@ -48,7 +49,9 @@
DEFAULT_CREDENTIALS_MODULE = 'soweego.importer.resources'
DEFAULT_CREDENTIALS_FILENAME = 'credentials.json'
DEFAULT_CREDENTIALS_LOCATION = (
DEFAULT_CREDENTIALS_MODULE, DEFAULT_CREDENTIALS_FILENAME)
DEFAULT_CREDENTIALS_MODULE,
DEFAULT_CREDENTIALS_FILENAME,
)
CREDENTIALS_LOCATION = '/app/shared/credentials.json'

# As per https://meta.wikimedia.org/wiki/User-Agent_policy
Expand Down Expand Up @@ -216,31 +219,56 @@
keys.REAL_NAME,
)

# File names & folders
# Folders
SHARED_FOLDER = '/app/shared/'
WD_TRAINING_SET = 'wikidata/wikidata_%s_%s_training_set.jsonl.gz'
WD_CLASSIFICATION_SET = 'wikidata/wikidata_%s_%s_classification_set.jsonl.gz'
SAMPLES = 'samples/%s_%s_%s_samples%02d.pkl.gz'
FEATURES = 'features/%s_%s_%s_features%02d.pkl.gz'
LINKER_MODEL = 'models/%s_%s_%s_model.pkl'
LINKER_NESTED_CV_BEST_MODEL = '%models/s_%s_%s_best_model_k%02d.pkl'
LINKER_RESULT = 'results/%s_%s_%s_linker_result.csv.gz'
LINKER_EVALUATION_PREDICTIONS = (
'results/%s_%s_%s_linker_evaluation_predictions.csv.gz'
WD_FOLDER = 'wikidata'
SAMPLES_FOLDER = 'samples'
FEATURES_FOLDERS = 'features'
MODELS_FOLDER = 'models'
RESULTS_FOLDER = 'results'
NN_CHECKPOINT_FOLDER = 'best_model_checkpoint'

# File names
NN_CHECKPOINT_FILENAME = '{}_best_checkpoint_model.hdf5'
EVALUATION_PERFORMANCE_FILENAME = '{}_{}_{}_performance.txt'
EVALUATION_PREDICTIONS_FILENAME = '{}_{}_{}_evaluation_links.csv.gz'
RESULT_FILENAME = '{}_{}_{}_links.csv.gz'
NESTED_CV_BEST_MODEL_FILENAME = '{}_{}_{}_best_model_k{:02}.pkl'
MODEL_FILENAME = '{}_{}_{}_model.pkl'
FEATURES_FILENAME = '{}_{}_{}_features{:02}.pkl.gz'
SAMPLES_FILENAME = '{}_{}_{}_samples{:02}.pkl.gz'
WD_CLASSIFICATION_SET_FILENAME = 'wikidata_{}_{}_classification_set.jsonl.gz'
WD_TRAINING_SET_FILENAME = 'wikidata_{}_{}_training_set.jsonl.gz'
EXTRACTED_LINKS_FILENAME = '{}_{}_extracted_links.csv'
BASELINE_PERFECT_FILENAME = '{}_{}_baseline_perfect_names.csv'
BASELINE_LINKS_FILENAME = '{}_{}_baseline_similar_links.csv'
BASELINE_NAMES_FILENAME = '{}_{}_baseline_similar_names.csv'
WIKIDATA_API_SESSION = 'wd_api_session.pkl'
WORKS_BY_PEOPLE_STATEMENTS = '%s_works_by_%s_statements.csv'

# Paths
WD_TRAINING_SET = os.path.join(WD_FOLDER, WD_TRAINING_SET_FILENAME)
WD_CLASSIFICATION_SET = os.path.join(WD_FOLDER, WD_CLASSIFICATION_SET_FILENAME)
SAMPLES = os.path.join(SAMPLES_FOLDER, SAMPLES_FILENAME)
FEATURES = os.path.join(FEATURES_FOLDERS, FEATURES_FILENAME)
LINKER_MODEL = os.path.join(MODELS_FOLDER, MODEL_FILENAME)
LINKER_NESTED_CV_BEST_MODEL = os.path.join(
MODELS_FOLDER, NESTED_CV_BEST_MODEL_FILENAME
)
LINKER_PERFORMANCE = 'results/%s_%s_%s_linker_performance.txt'
NEURAL_NETWORK_CHECKPOINT_MODEL = (
'best_model_checkpoint/%s_best_checkpoint_model.hdf5'
LINKER_RESULT = os.path.join(RESULTS_FOLDER, RESULT_FILENAME)
LINKER_EVALUATION_PREDICTIONS = os.path.join(
RESULTS_FOLDER, EVALUATION_PREDICTIONS_FILENAME
)
COMPLETE_FEATURE_VECTORS = 'features/%s_%s_%s_complete_feature_vectors.pkl.gz'
COMPLETE_WIKIDATA_CHUNKS = 'wikidata/%s_%s_%s_complete_wikidata_chunks.pkl.gz'
COMPLETE_TARGET_CHUNKS = 'samples/%s_%s_%s_complete_target_chunks.pkl.gz'
COMPLETE_POSITIVE_SAMPLES_INDEX = (
'samples/%s_%s_%s_complete_positive_samples_index.pkl.gz'
LINKER_PERFORMANCE = os.path.join(
RESULTS_FOLDER, EVALUATION_PERFORMANCE_FILENAME
)
WIKIDATA_API_SESSION = 'wd_api_session.pkl'
WORKS_BY_PEOPLE_STATEMENTS = '%s_works_by_%s_statements.csv'
TENSOR_BOARD = 'tensor_board/'
NEURAL_NETWORK_CHECKPOINT_MODEL = os.path.join(
NN_CHECKPOINT_FOLDER, NN_CHECKPOINT_FILENAME
)
EXTRACTED_LINKS = os.path.join(RESULTS_FOLDER, EXTRACTED_LINKS_FILENAME)
BASELINE_PERFECT = os.path.join(RESULTS_FOLDER, BASELINE_PERFECT_FILENAME)
BASELINE_LINKS = os.path.join(RESULTS_FOLDER, BASELINE_LINKS_FILENAME)
BASELINE_NAMES = os.path.join(RESULTS_FOLDER, BASELINE_NAMES_FILENAME)

CLASSIFIERS = {
'naive_bayes': keys.NAIVE_BAYES,
Expand Down Expand Up @@ -283,13 +311,16 @@
FEATURE_MISSING_VALUE = 0.0

# Neural networks-specific
ACTIVATION = 'sigmoid'
OPTIMIZER = 'adam'
OUTPUT_ACTIVATION = 'sigmoid'
HIDDEN_ACTIVATION = 'relu'
SLP_OPTIMIZER = 'adam'
MLP_OPTIMIZER = 'adadelta'
LOSS = 'binary_crossentropy'
METRICS = ['accuracy']
BATCH_SIZE = 1024
EPOCHS = 1000
VALIDATION_SPLIT = 0.33
NAIVE_BAYES_BINARIZE = 0.1

# precisions for the `pandas.Period` class.
# Listed from least to most precise, as defined here:
Expand Down

0 comments on commit 849e414

Please sign in to comment.