Reorganized and updated documentation for NameRes loading #28

Merged Jan 18, 2023 · 45 commits

Commits
9ac0cfd
Added `data-loading` dir for NameRes data loading instructions.
gaurav Sep 9, 2022
b1078a5
First stab at a Makefile for converting synonym files to JSON.
gaurav Sep 9, 2022
cd466a0
Added a requirements.txt file just for data loading.
gaurav Sep 9, 2022
9b7d738
Fixed Makefile.
gaurav Sep 9, 2022
c851747
Moved setup.sh to data loading.
gaurav Sep 9, 2022
590ad16
Improved .gitignore documentation.
gaurav Sep 9, 2022
efdb4b7
Started writing a data loading README.
gaurav Sep 9, 2022
3df4add
Tried to fix Makefile.
gaurav Sep 9, 2022
396c615
Updated instructions.
gaurav Sep 9, 2022
63d776d
Updated setup.sh to use Unix line endings.
gaurav Sep 9, 2022
6d84df2
A thing that runs.
gaurav Sep 9, 2022
8ae7821
Updated instructions, fixed Makefile and setup.sh.
gaurav Sep 13, 2022
63c0b4d
Added instructions on taking a backup.
gaurav Sep 13, 2022
edee24c
Updated data-loading instructions.
gaurav Sep 13, 2022
b9ef726
Added a Dockerfile so the data-loading process can be containerized.
gaurav Oct 25, 2022
b45911c
Temporarily adding new Makefile.
gaurav Dec 6, 2022
53ffe11
Fixed line-endings on data-loading/csv2json.py.
gaurav Dec 6, 2022
a06749d
Added some instructions.
gaurav Dec 6, 2022
7b41dc6
Don't include dev in Dockerfile; we use it to speed up development.
gaurav Dec 7, 2022
7cbc300
Added code to start Solr and record the PID.
gaurav Dec 8, 2022
cf9f6a9
Added lsof to Docker image.
gaurav Dec 8, 2022
e81e856
Added info-level debugging messages.
gaurav Dec 8, 2022
f28b812
First stab at adding the CSV2JSON code.
gaurav Dec 8, 2022
0e931d3
Added instructions for loading data into Solr.
gaurav Dec 9, 2022
904bad6
Updated older Makefile.
gaurav Dec 9, 2022
361606a
Updated paths.
gaurav Dec 9, 2022
68f65c3
Added README and Makefile to Dockerfile.
gaurav Dec 9, 2022
c19d6fa
Oops, COPY command incorrect.
gaurav Dec 9, 2022
9ca672b
Added basic Kubernetes files for data-loading.
gaurav Dec 9, 2022
917c1e0
Added /tmp to nameres-loading.
gaurav Dec 9, 2022
ff467a4
Added some ephemeral storage.
gaurav Dec 9, 2022
3e8de14
Removed mapping for /tmp.
gaurav Dec 10, 2022
9ff77c1
Added Solr port so we can use this as a Solr server for testing.
gaurav Dec 10, 2022
f9a1aed
Upgraded Solr to 9.1.0.
gaurav Dec 12, 2022
53703b2
Improved documentation on Nameres PVCs.
gaurav Dec 12, 2022
f9d6873
Separated limit and request quotas.
gaurav Dec 12, 2022
97836ce
Changed memory limit to 64G.
gaurav Dec 12, 2022
5036668
Removed storage limit (which seems to be ignored).
gaurav Dec 12, 2022
5035370
Improved Makefile.
gaurav Dec 12, 2022
969d5eb
Added imagePullPolicy so we always get the latest dev release.
gaurav Dec 12, 2022
a8eabd3
Fixed location of imagePullPolicy.
gaurav Dec 12, 2022
5a705d5
Added logs to setup.sh output.
gaurav Dec 12, 2022
4d42812
Updated to latest Babel release.
gaurav Jan 17, 2023
2df169b
Fixed a path in README.
gaurav Jan 18, 2023
ec91f76
Merge branch 'master' into fix-instructions
gaurav Jan 18, 2023
1 change: 0 additions & 1 deletion csv2json.py

This file was deleted.

1 change: 1 addition & 0 deletions data-loading/.dockerignore
@@ -0,0 +1 @@
dev
2 changes: 2 additions & 0 deletions data-loading/.gitignore
@@ -0,0 +1,2 @@
# Data directory for input files in data loading.
data/
78 changes: 78 additions & 0 deletions data-loading/Dockerfile
@@ -0,0 +1,78 @@
# Dockerfile for NameResolution data-loading
#
# A NameResolution worker is a SOLR instance that downloads its source data as a SOLR database backup
# (see ../Dockerfile and [1] for details). This Dockerfile creates a SOLR instance that can load the
# data from a set of Babel synonym files (see https://github.com/TranslatorSRI/Babel), thus creating the
# SOLR backup, which can then be uploaded somewhere for the NameResolution workers to download.
#
# [1] https://github.com/helxplatform/translator-devops/blob/affcf34cf103230d25bdb859098d2a5ac81a49fb/helm/name-lookup/templates/scripts-config-map.yaml#L8-L105

# Use the RENCI Python image to make it easier to work with other
# RENCI Docker packages and to make sure we have an up to date image.
# (https://github.com/TranslatorSRI/RENCI-Python-image)
FROM renciorg/renci-python-image:latest

# Configuration options:
# - ${ROOT} is where the source code will be copied.
ARG ROOT=/code/nameres-data-loading
# - ${SOLR_VERSION} is the SOLR version to install.
ARG SOLR_VERSION=9.1.0
# - ${SOLR_DIR} is the SOLR directory to use.
ARG SOLR_DIR=/var/solr

# Upgrade system files
RUN apt update
RUN apt -y upgrade

# Install or upgrade some prerequisite packages.
RUN apt install -y python3-venv
RUN pip3 install --upgrade pip

# We need Java 11 to run SOLR.
RUN apt install -y openjdk-11-jre

# SOLR uses lsof to check on its status.
RUN apt install -y lsof

# The following packages are useful in debugging, but can be
# removed once this container is working properly.
RUN apt install -y htop
RUN apt install -y screen
RUN apt install -y vim
RUN apt install -y rsync

# Create a /var/solr directory for SOLR to use.
VOLUME ${SOLR_DIR}
RUN mkdir -p ${SOLR_DIR}
RUN chown nru:nru ${SOLR_DIR}

# Switch to nru user.
RUN mkdir -p ${ROOT}
RUN chown nru:nru ${ROOT}
WORKDIR ${ROOT}
USER nru

# Download and unpack Solr into ${ROOT}/solr.
RUN mkdir -p "${ROOT}/solr"
ADD --chown=nru:nru https://www.apache.org/dyn/closer.lua/solr/solr/${SOLR_VERSION}/solr-${SOLR_VERSION}.tgz?action=download "${ROOT}/solr"
RUN tar zxvf "${ROOT}/solr/solr-${SOLR_VERSION}.tgz" --directory "${ROOT}/solr"

# Set up VENV.
ENV VIRTUAL_ENV=${ROOT}/venv
RUN python3 -m venv $VIRTUAL_ENV
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# Install requirements from the lockfile.
COPY --chown=nru requirements.txt ${ROOT}
RUN pip3 install -r requirements.txt

# Copy necessary files.
COPY --chown=nru csv2json.py ${ROOT}
COPY --chown=nru setup.sh ${ROOT}
COPY --chown=nru README.md ${ROOT}
COPY --chown=nru Makefile ${ROOT}

# On entry, start the Solr instance.
ENV SOLR_EXEC="${ROOT}/solr/solr-${SOLR_VERSION}/bin/solr"
ENV SOLR_DIR="$SOLR_DIR"
ENTRYPOINT ${SOLR_EXEC} -cloud -f -p 8983 -m 64G -s ${SOLR_DIR}
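
A minimal sketch of building and running this image from the repository root (the image tag is illustrative):

```shell
$ docker build -t nameres-data-loading data-loading/
$ docker run -v "$PWD/data/solrdata:/var/solr" -p 8983:8983 nameres-data-loading
```
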
77 changes: 77 additions & 0 deletions data-loading/Makefile
@@ -0,0 +1,77 @@
# This Makefile contains all the instructions necessary to
# download Babel synonym files from a web location, convert
# them to JSON, load them into a Solr collection, and generate
# a Solr backup that can be used to start a NameRes instance.
#

# Configuration
SYNONYMS_URL=https://stars.renci.org/var/babel_outputs/2022dec2-2/synonyms/

# How much memory should Solr use.
SOLR_MEM=64G

# Where should the Solr PID file be created?
SOLR_PID=data/solr.pid

# SOLR_DIR should be set up to point to the Solr data directory (usually /var/solr)
# and SOLR_EXEC should be set up to point to the Solr executable.
# These will both be set up by the Dockerfile.
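#
# For example, any of these variables can be overridden on the
# command line (an illustrative invocation):
#
#   make SOLR_MEM=32G data/setup-complete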

# All and clean targets.

.PHONY: all clean
all:
	echo Done.

clean:
	rm -rf data/*

# This is a five-step process.
#
# Step 1. Download and uncompress the synonym files.
data/synonyms/done:
	wget -c -r -l1 -nd -P data/synonyms ${SYNONYMS_URL}
	gunzip data/synonyms/*.gz
	touch $@
	$(info Downloaded synonyms from ${SYNONYMS_URL})

# Step 2. Convert synonym files to JSON files.
CSV2JSON = python csv2json.py
SYNONYM_FILES = $(wildcard data/synonyms/*.txt)
JSON_FILES = $(addprefix data/json/,$(notdir $(SYNONYM_FILES:.txt=.json)))
data/json/%.json: data/synonyms/%.txt
	mkdir -p data/json
	$(CSV2JSON) $< $@ --sep "\t"
	$(info Converted $< to $@ using ${CSV2JSON}.)

data/json/done: data/synonyms/done ${JSON_FILES}
	touch data/json/done
	$(info Converted CSV files to ${JSON_FILES})

# Step 3. Start the Solr server.
${SOLR_PID}:
	mkdir -p ${SOLR_DIR}/logs
	${SOLR_EXEC} -cloud -p 8983 -v -m ${SOLR_MEM} -s ${SOLR_DIR} >> ${SOLR_DIR}/logs/solr.txt 2>> ${SOLR_DIR}/logs/solr.err.txt
	while [ ! -s ${SOLR_PID} ]; do \
		${SOLR_EXEC} status | grep -Po 'Solr process \K([0-9]+)' > ${SOLR_PID}; \
	done
	$(info Solr started with PID file at ${SOLR_PID})
	cat ${SOLR_PID}

# Step 4. Load JSON files into the Solr server, then trigger a backup.
data/setup-complete: data/json/done ${SOLR_PID}
	mkdir -p data/logs
	bash setup.sh "data/json/*.json" >> data/logs/setup.sh.log 2>> data/logs/setup.sh.err.log && touch $@

data/start-solr-backup: data/setup-complete
	curl 'http://localhost:8983/solr/name_lookup/replication?command=backup&name=backup' && touch $@

# Step 5. Wait for the backup to complete.
data/check-solr-backup:
	curl 'http://localhost:8983/solr/name_lookup/replication?command=details'

.PHONY: stop-solr
stop-solr:
	rm ${SOLR_PID}
	${SOLR_EXEC} stop
	$(info Solr stopped.)
86 changes: 86 additions & 0 deletions data-loading/README.md
@@ -0,0 +1,86 @@
# Loading NameResolution data

NameResolution data needs to be loaded as a compressed [Apache Solr](https://solr.apache.org/) database.
Creating this dataset involves the following steps.

1. Set up a Solr server locally. The easiest way to do this is via Docker:

```shell
$ docker run -v "$PWD/data/solrdata:/var/solr" --name name_lookup -p 8983:8983 -t solr -cloud -p 8983 -m 12G
```

You can adjust the `12G` to increase the amount of memory available to Solr. You can also add `-d` to the
Docker arguments if you would like to run this container in the background.

2. Copy the synonym files into the `data/synonyms` directory. Synonym files that are too large will
need to be split into smaller files. (`gsplit` is the GNU version of `split`, which includes support
for adding an additional suffix to files.)

```shell
$ gsplit -l 5000000 -d --additional-suffix .txt SmallMolecule.txt SmallMolecule
$ gsplit -l 5000000 -d --additional-suffix .txt MolecularMixture.txt MolecularMixture
```

3. Convert all the synonym text files into JSON documents. To do this, use the `csv2json.py` script
included in this directory. By default, the Makefile expects the synonym files to be present in `data/synonyms`
and writes out JSON files to `data/json`.

```shell
$ pip install -r requirements.txt
$ make
```
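
   To convert a single file by hand instead, the Makefile's recipe amounts to the following (file names are illustrative):

```shell
$ python csv2json.py data/synonyms/Disease.txt data/json/Disease.json --sep "\t"
```
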

4. Load the JSON files into the Solr database by running:

```shell
$ ./setup.sh "data/json/*.json"
```

Note the double quotes: `setup.sh` requires a glob pattern as its first argument, not a list of files to process!

5. Generate a backup of the Solr instance. The first command will create a directory at
`solrdata/data/name_lookup_shard1_replica_n1/data/snapshot.backup` -- you can track its progress by comparing the
number of files in that directory to the number of files in `../data/index` (as I write this, it has 513 files).

```shell
$ curl 'http://localhost:8983/solr/name_lookup/replication?command=backup&name=backup'
$ curl 'http://localhost:8983/solr/name_lookup/replication?command=details'
```
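
   While the backup is running, you can compare those file counts directly; a sketch, assuming the volume mapping from step 1 (adjust the paths to your layout):

```shell
$ ls data/solrdata/data/name_lookup_shard1_replica_n1/data/snapshot.backup | wc -l
$ ls data/solrdata/data/name_lookup_shard1_replica_n1/data/index | wc -l
```
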

Once the backup is complete, you'll see a part of the `details` response that looks like this:

```json
"backup":{
"startTime":"2022-09-13T18:42:43.678219123Z",
"fileCount":512,
"indexFileCount":512,
"status":"success",
"snapshotCompletedAt":"2022-09-13T19:36:00.599797304Z",
"endTime":"2022-09-13T19:36:00.599797304Z",
"snapshotName":"backup",
"directoryName":"snapshot.backup"}
}
```

6. Shut down the Solr instance.

```shell
$ docker exec name_lookup solr stop -p 8983 -verbose
```

7. Generate the backup tarball. At the moment, this is expected to be in the format
`var/solr/data/snapshot.backup/[index files]`. The easiest way to generate this tarball correctly is to run:

```shell
$ mkdir -p data/var/solr/data
$ mv /var/solr/name_lookup_shard1_replica_n1/data/snapshot.backup data/var/solr/data
$ cd data
$ tar zcvf snapshot.backup.tar.gz var
```

8. Publish `snapshot.backup.tar.gz` to a publicly-accessible URL.
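
   For example, to publish the tarball to an S3 bucket (a sketch only; the bucket name is hypothetical, and any web server that can serve the file will do):

```shell
$ aws s3 cp data/snapshot.backup.tar.gz s3://my-example-bucket/nameres/snapshot.backup.tar.gz --acl public-read
```
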

9. Use the instructions at https://github.com/helxplatform/translator-devops/tree/develop/helm/name-lookup to set up an
instance of NameRes that downloads `snapshot.backup.tar.gz` from this publicly-accessible URL.

The Makefile included in this directory contains targets for most of these steps.
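
For instance, steps 1 through 5 above roughly correspond to the following targets (a sketch; run from this directory):

```shell
$ make data/setup-complete      # download, convert and load the synonym files
$ make data/start-solr-backup   # trigger the Solr backup
$ make data/check-solr-backup   # poll until the backup reports success
```
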
65 changes: 65 additions & 0 deletions data-loading/csv2json.py
@@ -0,0 +1,65 @@
#!/usr/bin/env python
"""Convert CSV to JSON for upload to Solr."""
import argparse
import csv
import json

from tqdm import tqdm


def reformat(
    infile: str,
    outfile: str,
    separator: str = ",",
    # The default quote character is one that should never appear in the
    # data, which effectively disables CSV quote handling.
    quotechar: str = "🤪",
):
    """Reformat name file."""
    with open(infile, "r") as f:
        reader = csv.reader(f, delimiter=separator, quotechar=quotechar)
        # Note: this rebinds f to the output file; the csv reader keeps
        # its own reference to the input file, so reading still works.
        with open(outfile, "w") as f:
            f.write("[\n")
            for idx, row in tqdm(enumerate(reader)):
                # Convert an IRI-style identifier into a CURIE, e.g.
                # "http://purl.obolibrary.org/obo/MONDO_0005148" -> "MONDO:0005148".
                entity_id = ":".join(row[0].split("/")[-1].split("_"))
                name = row[-1]
                if idx:
                    f.write(",\n")
                f.write(" ")
                json.dump({
                    "curie": entity_id,
                    "name": name,
                    "length": len(name)
                }, f)
            f.write("\n]")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="reformat a name file"
    )
    parser.add_argument(
        "input",
        type=str,
        help="the input file path",
    )
    parser.add_argument(
        "output",
        type=str,
        help="the output file path",
    )
    parser.add_argument(
        "--sep",
        type=str,
        help="the input file field separator",
        default=",",
    )
    parser.add_argument(
        "--quote",
        type=str,
        help="the input file quote character",
        default="🤪",
    )

    args = parser.parse_args()
    # Allow a literal "\t" on the command line to mean a tab character.
    args.sep = args.sep.replace("\\t", "\t")
    reformat(args.input, args.output, args.sep, args.quote)

24 changes: 24 additions & 0 deletions data-loading/kubernetes/nameres-loading-data.k8s.yaml
@@ -0,0 +1,24 @@
# Kubernetes file for setting up a PVC to use for nameres-loading.
# nameres-loading-data is a directory for storing synonym files,
# the generated Solr backup, and its compressed version.
#
# As of 2022dec11, this directory needs to contain:
# - 38G of synonym files (in CSV and JSON)
# - 30G of snapshot.backup files moved here from Solr
# - 23G of snapshot.backup.tar.gz after compressing
# Therefore it needs to be a minimum of 100G. I'm going to set a
# size of 150G in case we need some extra space.

apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: nameres-loading-data
  labels:
    app: nameres-loading
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 150Gi
  storageClassName: basic
18 changes: 18 additions & 0 deletions data-loading/kubernetes/nameres-loading-solr.k8s.yaml
@@ -0,0 +1,18 @@
# Kubernetes file for setting up a PVC to use for nameres-loading.
#
# As of 2022dec11, this seems to come to 37G for files + 30G for snapshot.backup.
# I'm going to set the size to 150Gi so we have a bit of spare space if needed.

apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: nameres-loading-solr
  labels:
    app: nameres-loading
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 150Gi
  storageClassName: basic
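
To create these PVCs in a cluster, a sketch (assumes `kubectl` is already configured for the target namespace):

```shell
$ kubectl apply -f data-loading/kubernetes/nameres-loading-data.k8s.yaml
$ kubectl apply -f data-loading/kubernetes/nameres-loading-solr.k8s.yaml
```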