Commit: Merge branch 'dev'

Sieboldianus committed Dec 6, 2019
2 parents 1a04728 + c7d3d47 commit e2ce916
Showing 18 changed files with 486 additions and 165 deletions.
12 changes: 0 additions & 12 deletions .gitignore
@@ -1,19 +1,7 @@
lbsnstructure/
backup/
build/
tests/
scripts/
scripts/*
.project
.pydevproject
.settings/
.vs/
.vscode/
*.pyproj
*.sln
transferIncremental_AllJson_To_Lbsn.pyproj
transferIncremental_AllJson_To_Lbsn.sln
test_env

# Byte-compiled / optimized / DLL files
__pycache__/
49 changes: 40 additions & 9 deletions .gitlab-ci.yml
@@ -1,11 +1,32 @@
test:
image: registry.gitlab.vgiscience.org/tud_ifk/miniconda-cidefault
before_script:
# initialize conda shell
- conda init bash
- source ~/.bashrc
# activate default ci_env from registry image
# contains anybadge
- conda activate ci_env
- conda env create -f environment_dev.yml
# activate lbsntransform env
# with --stack environments
- conda activate lbsntransform --stack
# install additional packages
# needed in lbsntransform ci job
- pip install git+https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.vgiscience.de/tud_ifk/argdown.git
- conda install pylint bitarray
- pip install pdoc3 pylint-exit
stage: test
script:
- conda env create -f environment_dev.yml
- source activate lbsntransform
# test dev install
- pip install -e . --no-dependencies
# argparse to markdown conversion
# for mkdocs args page
- python docs/argparse/argparse_doc.py
- argdown --truncate-help parse_args.py > docs/argparse/args.md
# to enable all warnings during ci doc generation
# - export PYTHONWARNINGS='error::UserWarning'
- pdoc --config show_type_annotations=True --template-dir docs/apidoc/ --html --output-dir docs/api/ lbsntransform
# create badges
- pylint --output-format=text lbsntransform | tee pylint.txt || pylint-exit $?
- score=$(sed -n 's/^Your code has been rated at \([-0-9.]*\)\/.*/\1/p' pylint.txt)
@@ -15,25 +36,35 @@ test:
- pypi_version=$(pip search lbsntransform | cut -d "(" -f2 | cut -d ")" -f1 | sed -r "s/[latest]+//g")
- echo "Pypi version was $pypi_version"
- anybadge -l pypi --value="$pypi_version" --file=pypi.svg --color=green
- anybadge -l mkdocs --value="Documentation" --file=documentation.svg --color=green
artifacts:
paths:
- pylint.svg
- pypi.svg
- pipeline.svg
# quote because * is a
# special character in YAML
- '*.svg'
- docs/argparse/args.md
- docs/api/
only:
- master
- ci-test

pages:
stage: deploy
image: registry.gitlab.vgiscience.org/tud_ifk/alpine-mkdocs
script:
- mkdir .public
- cp pylint.svg .public
- cp pipeline.svg .public
- cp pypi.svg .public
- cp *.svg .public
- mv .public public
# build docs
- mkdocs build
- mv site public
# copy pdoc3 compiled API ref
- mv docs/api/lbsntransform/* public/site/api/
- mv public/site public/docs
artifacts:
name: pages
paths:
- public
only:
- master

- ci-test
8 changes: 5 additions & 3 deletions README.md
@@ -1,9 +1,11 @@
![PyPI version](https://lbsn.vgiscience.org/lbsntransform/pypi.svg) ![pylint](https://lbsn.vgiscience.org/lbsntransform/pylint.svg) ![pipeline](https://lbsn.vgiscience.org/lbsntransform/pipeline.svg)
[![PyPI version](https://lbsn.vgiscience.org/lbsntransform/pypi.svg)](https://pypi.org/project/lbsntransform/) [![pylint](https://lbsn.vgiscience.org/lbsntransform/pylint.svg)](https://gitlab.vgiscience.de/lbsn/lbsntransform) [![pipeline](https://lbsn.vgiscience.org/lbsntransform/pipeline.svg)](https://gitlab.vgiscience.de/lbsn/lbsntransform) [![Documentation](https://lbsn.vgiscience.org/lbsntransform/documentation.svg)](https://lbsn.vgiscience.org/lbsntransform/docs/)

# LBSNTransform

A python package that uses the [common location based social network (LBSN) data structure concept](https://pypi.org/project/lbsnstructure/) (ProtoBuf) to import, transform and export Social Media data such as Twitter and Flickr.

![Illustration of functions](docs/inputoutput.svg)

## Motivation

The goal is to provide a common interface to handle Social Media Data, without custom adjustment to the myriad API Endpoints available. As an example, consider the ProtoBuf spec "Post", which can be a Tweet on Twitter, a Photo shared on Flickr, or a post on Reddit. This tool is based on a 4-Facet conceptual framework for LBSN, introduced in a paper by [Dunkel et al. (2018)](https://www.tandfonline.com/doi/full/10.1080/13658816.2018.1546390). In addition, the GDPR directly requests Social Media Network operators to allow users to transfer accounts and data inbetween services.
@@ -31,14 +33,14 @@ pip install lbsntransform
e.g. with the following command line args

```shell
lbsntransform --origin 3 --file_input --file_type '*.json' --transferlimit 1000 --csv_output
lbsntransform --origin 3 --file_input --file_type 'json' --transferlimit 1000 --csv_output
```

.. with the above input args, the tool will:
- read local json from /01_Input/
- and store lbsn records as CSV and ProtoBuf in /02_Output/

A full list of possible input args is available with `lbsntransform --help` [config.py](/lbsntransform/config/config.py)
A full list of possible input args is available in the [documentation](https://lbsn.vgiscience.org/lbsntransform/docs/)

## Built With

7 changes: 7 additions & 0 deletions docs/about.md
@@ -0,0 +1,7 @@
The lbsntransform package is maintained by Alexander Dunkel and [vgiscience.org](https://vgiscience.org)

Found any errors or bugs? Please email me at alexander.dunkel[ät]tu-dresden.de

The lbsntransform docs are built with [mkdocs](https://github.com/mkdocs/mkdocs) and the [ReadTheDocs theme](https://github.com/mkdocs/mkdocs/tree/master/mkdocs/themes/readthedocs).

lbsntransform is developed under the open source GNU GPLv3 license.
16 changes: 16 additions & 0 deletions docs/apidoc/logo.mako
@@ -0,0 +1,16 @@
<header>
<h2>
<a class="homelink" rel="home" title="lbsntransform API Reference" href="https://lbsn.vgiscience.org/lbsntransform/docs/api/lbsntransform_.html">
lbsntransform API Reference
</a>
</h2>
<h5 style='line-height: 5px;'>
<a class="homelink" rel="home" title="lbsntransform Documentation Home" href="https://lbsn.vgiscience.org/lbsntransform/docs/">
lbsntransform Documentation (external)
</a>
</h5>
<h5 style='line-height: 0px;'>
<a href="https://github.com/Sieboldianus/lbsntransform">Edit on GitHub</a>
</h5>

</header>
40 changes: 40 additions & 0 deletions docs/argparse/argparse_doc.py
@@ -0,0 +1,40 @@
"""Script to allow argdown parse_args to markdown conversion"""

import inspect
from lbsntransform import BaseConfig
from lbsntransform import __version__


def extract_argscode():
"""Extracts command line args code to separate file
Preparation step for processing with argdown
"""
# open file to output source code
source_file = open("parse_args.py", "w")
# extract source code of parse_args
parse_args_source = inspect.getsource(BaseConfig.parse_args)
# remove first line
parse_args_source = parse_args_source[parse_args_source.index('\n')+1:]
# unindent all other lines
parse_args_source = parse_args_source.lstrip().replace('\n ', '\n')
# replace version string
parse_args_source = parse_args_source.replace(
'lbsntransform {__version__}', f'lbsntransform {__version__}')
# replace package name
parse_args_source = parse_args_source.replace(
'usage: argdown', 'usage: lbsntransform')
# write argdown and argparse imports first
source_file.write('import argparse\n')
source_file.write('import argdown\n')
source_file.write('from pathlib import Path\n')
# fix argparse name
parse_args_source = parse_args_source.replace(
'ArgumentParser()', 'ArgumentParser(prog="lbsntransform")')
# write prepared source code
source_file.write(parse_args_source)
source_file.close()


# run script
extract_argscode()
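
For reference, the CI job in `.gitlab-ci.yml` (see above) runs this script and then converts the extracted code to markdown with argdown:

```bash
python docs/argparse/argparse_doc.py
argdown --truncate-help parse_args.py > docs/argparse/args.md
```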
118 changes: 118 additions & 0 deletions docs/argparse/examples.md
@@ -0,0 +1,118 @@
lbsntransform has a Command Line Interface (CLI) that can be used to convert many input formats to the common lbsnstructure, including its privacy-aware hll implementation.

!!! Note
    Substitute the bash linebreak character `\` in the examples below with `^` if you're on the Windows command line.

# Basic examples

Key to mappings in lbsntransform is the origin id, which refers to the different mappings specified in the modules `input/mappings/*.py`. For example, id `3` refers to Twitter (`field_mapping_twitter.py`) and id `21` to the Flickr YFCC100m dataset (used further below).

If you've retrieved Twitter JSON files from the official API, place those files somewhere in a subfolder `.01_Input/` and run the following:

```bash
lbsntransform --origin 3 \
--file_input \
--file_type 'json' \
--transferlimit 1000 \
--csv_output

```

lbsntransform will then create a subfolder `.02_Output/` and store the converted data as CSV (specified with the `--csv_output` flag).

* `--transferlimit 1000` means: stop the transfer after 1000 lbsn records
* `--file_input`: read from local files (and not from a database). The default local input is the subfolder `.01_Input/`. This path can be modified with the flag `--input_path_url my-input-path`
* `--file_type 'json'` refers to the file ending to look for in the `.01_Input/` folder

If your files are spread across subdirectories in (e.g.) `.01_Input/`, add the `--recursive_load` flag, as in the sketch below.
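
A minimal sketch combining these flags; the input path `/data/twitter/` is a placeholder for your own folder of JSON files:

```bash
lbsntransform --origin 3 \
    --file_input \
    --file_type 'json' \
    --input_path_url "/data/twitter/" \
    --recursive_load \
    --transferlimit 1000 \
    --csv_output
```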

# Flickr YFCC100m

A specific mapping is provided for the [YFCC100m Dataset](https://multimediacommons.wordpress.com/yfcc100m-core-dataset/).

The YFCC100m dataset consists of multiple files, with the core dataset of 100 million Flickr photo metadata records (`yfcc100m_dataset.csv`) and several "expansion sets".

The only expansion set that is currently available for mapping is the places expansion (`yfcc100m_places.csv`).

Both photo metadata and places metadata can be processed in parallel by using `--zip_records`.

Before executing the following, make sure you've started the [lbsn-raw database docker](https://gitlab.vgiscience.de/lbsn/databases/rawdb), which includes the postgres implementation of the common lbsn structure format. You can run the docker db container on any host, but we suggest testing your setup locally - in this case, `127.0.0.1` refers to _localhost_ and port `15432` (the default for lbsn-raw); a rough sketch of this step follows below.
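
This sketch assumes the rawdb repository ships a docker-compose setup; check the repository itself for the actual steps:

```bash
# assumption: the rawdb repo provides a docker-compose.yml
git clone https://gitlab.vgiscience.de/lbsn/databases/rawdb.git
cd rawdb
# start the container in the background; per the example below,
# postgres should then listen on 127.0.0.1:15432
docker-compose up -d
```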


```bash
lbsntransform --origin 21 \
--file_input \
--input_path_url "https://myurltoflickrymcc.dataset.org/yfcc100m_dataset.csv;https://myurltoflickrymcc.dataset.org/flickr_yfcc100m/yfcc100m_places.csv" \
--dbpassword_output "your-db-password" \
--dbuser_output "postgres" \
--dbserveraddress_output "127.0.0.1:15432" \
--dbname_output "rawdb" \
--csv_delimiter $'\t' \
--file_type "csv" \
--zip_records \
--skip_until_record 7373485 \
--transferlimit 10000
```

In the example above,

```bash
--skip_until_record 7373485
```
.. is used to skip input records up to record `7373485`. This is an example of how to continue processing (e.g. if your previous transform job was aborted for any reason).


Also, the transfer is limited to the first 10000 records:

```bash
--transferlimit 10000
```

If you have stored the Flickr dataset locally, simply replace the URLs with:

```bash
--input_path_url "/data/flickr_yfcc100m/"
```


# Privacy-aware output (HyperLogLog)

We've developed a privacy-aware implementation of the lbsn-raw format, based on the probabilistic data structure HyperLogLog and the postgres implementation from [Citus](https://github.com/citusdata/postgresql-hll).

Two preparation steps are necessary:

* Prepare a postgres database with the HLL version of lbsnstructure. You can use the [lbsn-hll database docker](https://gitlab.vgiscience.de/lbsn/databases/hlldb)
* Prepare a read-only (empty) database with the Citus HyperLogLog extension installed. You can use the [hll importer docker](https://gitlab.vgiscience.de/lbsn/tools/importer)

We've designed this rather complex setup to separate concerns: the importer db (called `hllworkerdb` in the command below) will be used by lbsntransform to calculate hll `shards` from raw data. It will not store any data, nor will it receive any additional (privacy-relevant) information; shards are calculated in-memory and returned. The importer is prepared with global hll settings that must not change during the whole lifetime of the final output.

For example, as a means of additional security, distinct values can be one-way hashed before shards are created. This hashing can be improved with a `salt` that is only known to the **importer**.

As a result, the output hll db will not receive any privacy-relevant data, because it is removed before transmission. A conceptual sketch of the shard calculation is shown below.
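
As a conceptual sketch only, assuming the Citus postgresql-hll extension on the worker db; the salt, guid, and connection parameters are illustrative placeholders:

```bash
# compute a hll shard in-memory on the read-only worker db;
# 'my-salt' and 'user-guid-123' are placeholder values
psql -h 127.0.0.1 -p 20432 -U postgres -d hllworkerdb \
    -c "SELECT hll_add_agg(hll_hash_text('my-salt' || 'user-guid-123'));"
```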

!!! Note
    Depending on the chosen `bases` and the type of input data, results may still contain privacy-sensitive references. Have a look at the [lbsn-docs](https://lbsn.vgiscience.org) for further information.

To convert YFCC100m photo metadata and places and transfer to a local hll-db, use:

```bash
lbsntransform --origin 21 \
--file_input \
--input_path_url "/data/flickr_yfcc100m/" \
--dbpassword_output "your-db-password" \
--dbuser_output "postgres" \
--dbserveraddress_output "127.0.0.1:25432" \
--dbname_output "hlldb" \
--dbformat_output "hll" \
--dbpassword_hllworker "your-db-password" \
--dbuser_hllworker "postgres" \
--dbserveraddress_hllworker "127.0.0.1:20432" \
--dbname_hllworker "hllworkerdb" \
--csv_delimiter $'\t' \
--file_type "csv" \
--zip_records
```


17 changes: 17 additions & 0 deletions docs/index.md
@@ -0,0 +1,17 @@
**lbsntransform** is a Python package that uses the [common location based social network (LBSN) data structure concept](https://pypi.org/project/lbsnstructure/) (ProtoBuf) to import, transform and export Social Media data such as Twitter and Flickr.

![](inputoutput.svg)

Import, convert and export Location Based Social Media (LBSM) data, such as from Twitter and Flickr, to a common data structure format (lbsnstructure). lbsntransform can also anonymize data into a privacy-aware version of lbsnstructure using HyperLogLog.

Input can be:
- local CSV or JSON (stacked/regular/line-separated)
- a web URL to CSV/JSON
- a Postgres DB connection

Output can be:
- local CSV
- local file with ProtoBuf encoded records
- local SQL file ready for "Import from" in Postgres LBSN db
- Postgres DB connection with existing [LBSN RAW Structure](https://gitlab.vgiscience.de/lbsn/databases/rawdb)
- Postgres DB connection with existing [LBSN HLL Structure](https://gitlab.vgiscience.de/lbsn/databases/hlldb), which is a privacy-aware version of lbsnstructure
12 changes: 12 additions & 0 deletions docs/inputoutput.sequence
@@ -0,0 +1,12 @@
Title: lbsntransform
Note left of Input: Twitter / \\nFlickr/\\nYFCC100m ...
Input->Output: from Local CSV or JSON
Input->Output: stacked/ regular/ line separated
Input->Output: recursive subfolders
Input->Output: zip inputs
Input->Output: stream from web
Note right of Output: LBSN Raw DB
Note right of Output: LBSN Hll DB
Input->Output: from live DB
Output-->Input: to live DB
Note left of Input: LBSN Raw DB
1 change: 1 addition & 0 deletions docs/inputoutput.svg
(SVG image; preview not available in diff view)
11 changes: 11 additions & 0 deletions docs/quick-guide.md
@@ -0,0 +1,11 @@
# Windows

There are many ways to install Python tools:

1. The recommended way to install the package is with `pip install lbsntransform`
2. For Windows users, an alternative is to download the newest pre-compiled build from [releases](../../releases) and run `lbsntransform.exe`
3. If you have problems with dependencies under Windows, use [Gohlke wheels](<https://www.lfd.uci.edu/~gohlke/pythonlibs/>) or create an environment with the conda package manager, install all dependencies manually, and then run `pip install lbsntransform --no-deps` (see the sketch below)
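
A possible conda-based setup, mirroring the CI job; it assumes a local clone of the repository containing `environment_dev.yml` (the environment name `lbsntransform` is taken from `.gitlab-ci.yml`):

```bash
# create and activate the dev environment shipped with the repository
conda env create -f environment_dev.yml
conda activate lbsntransform
# install the package itself without touching the conda-managed dependencies
pip install lbsntransform --no-deps
```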

# Linux

* On Linux, `pip install lbsntransform` is the recommended way to install lbsntransform.
2 changes: 2 additions & 0 deletions docs/resources.md
@@ -0,0 +1,2 @@

- see the [lbsn docs](https://lbsn.vgiscience.org) for further info regarding the underlying data structure concept
1 change: 0 additions & 1 deletion environment_dev.yml
@@ -17,7 +17,6 @@ dependencies:
- bitarray
- nltk
- pip:
- anybadge
- pylint-exit
- ppygis3
- lbsnstructure>=0.5.0
4 changes: 3 additions & 1 deletion lbsntransform/__main__.py
@@ -91,7 +91,9 @@ def main():
transfer_reactions=config.transfer_reactions,
ignore_non_geotagged=config.ignore_non_geotagged,
min_geoaccuracy=config.min_geoaccuracy,
source_web=config.source_web)
source_web=config.source_web,
skip_until_record=config.skip_until_record,
zip_records=config.zip_records)

# Manually add entries that need submission prior to parsing data
# add_bundestag_group_example(import_mapper)
