
Update data loading to new format and Dockerfile fixes #53

Merged: 12 commits, Jun 12, 2023
9 changes: 9 additions & 0 deletions .dockerignore
@@ -0,0 +1,9 @@
# We don't need the .git or .idea directories.
/.git/
/.idea/

# Ignore the venv directory when preparing Dockerfile.
/venv/

# I sometimes use data-loading/data for testing the data-loading code.
/data-loading/data/
4 changes: 2 additions & 2 deletions .github/workflows/tester.yml
@@ -32,8 +32,8 @@ jobs:
- name: Install pytest
run: pip install pytest

- name: Set up tests
run: ./setup.sh tests/data/test-synonyms.json
- name: NameRes data loading test
run: ./data-loading/setup-and-load-solr.sh tests/data/test-synonyms.json

- name: Run the tests
run: |
26 changes: 14 additions & 12 deletions Dockerfile
@@ -1,33 +1,35 @@
# leverage the renci python base image
FROM renciorg/renci-python-image:v0.0.1

#Set the branch
ARG BRANCH_NAME=main
FROM renciorg/renci-python-image:latest

# install basic tools
RUN apt-get update
RUN apt update
RUN apt upgrade -y

# Make a home directory for the non-root user.
RUN mkdir /home/nru
RUN chown nru /home/nru

# make a directory for the repo
RUN mkdir /repo

# go to the directory where we are going to upload the repo
WORKDIR /repo
RUN mkdir NameResolution
RUN chown nru NameResolution
USER nru

# get the latest code
RUN git clone --branch $BRANCH_NAME --single-branch https://github.com/TranslatorSRI/NameResolution.git
# add the current code
COPY . /repo/NameResolution

# go to the repo dir
WORKDIR /repo/NameResolution

# install requirements
ENV PATH="${PATH}:/home/nru/.local/bin"
RUN pip install -r requirements.txt

# expose the default port
EXPOSE 2433

RUN chmod 777 -R .

USER nru

# start the service entry point
ENTRYPOINT ["bash", "main.sh"]
ENTRYPOINT ["bash", "main.sh"]
7 changes: 1 addition & 6 deletions README.md
@@ -16,12 +16,7 @@ See the documentation [notebook](documentation/NameResolution.ipynb) for example

### Solr database

```bash
docker run --name name_lookup -d -p 8983:8983 -t solr -DzkRun
docker run -it -v /local/NameResolution/data:/json -v /projects/datatrans/synonyms/March17_2021/:/csv -v /local/NameResolution:/NameResolution python:3.9 bash
./csv2json.py /csv/anatomy.txt_synonyms.txt /json --sep "\t"
./setup.sh "/local/NameResolution/data/*.json"
```
See instructions in the `data-loading/` directory.

### API

7 changes: 6 additions & 1 deletion data-loading/.dockerignore
@@ -1 +1,6 @@
dev
# Ignore the venv directory if there is one.
/venv/


# I sometimes use data-loading/data for testing the data-loading code.
/data/
3 changes: 1 addition & 2 deletions data-loading/Dockerfile
@@ -67,8 +67,7 @@ COPY --chown=nru requirements.txt ${ROOT}
RUN pip3 install -r requirements.txt

# Copy necessary files.
COPY --chown=nru csv2json.py ${ROOT}
COPY --chown=nru setup.sh ${ROOT}
COPY --chown=nru setup-and-load-solr.sh ${ROOT}
COPY --chown=nru README.md ${ROOT}
COPY --chown=nru Makefile ${ROOT}

17 changes: 2 additions & 15 deletions data-loading/Makefile
@@ -35,19 +35,6 @@ data/synonyms/done:
touch $@
$(info Downloaded synonyms from ${SYNONYMS_URL})

# Step 2. Convert synonym files to JSON files.
CSV2JSON = python csv2json.py
SYNONYM_FILES = $(wildcard data/synonyms/*.txt)
JSON_FILES = $(addprefix data/json/,$(notdir $(SYNONYM_FILES:.txt=.json)))
data/json/%.json: data/synonyms/%.txt
mkdir -p data/json
$(CSV2JSON) $< $@ --sep "\t"
$(info Converted $< to $@ using ${CSV2JSON}.)

data/json/done: data/synonyms/done ${JSON_FILES}
touch data/json/done
$(info Converted CSV files to ${JSON_FILES})

# Step 3. Start Solr server.
${SOLR_PID}:
mkdir -p ${SOLR_DIR}/logs
@@ -59,9 +46,9 @@ ${SOLR_PID}:
cat ${SOLR_PID}

# Step 4. Load JSON files into Solr server.
data/setup-complete: data/json/done ${SOLR_PID}
data/setup-complete: data/synonyms/done ${SOLR_PID}
mkdir -p data/logs
bash setup.sh "data/json/*.json" >> data/logs/setup.sh.log 2>> data/logs/setup.sh.err.log && touch $@
bash setup-and-load-solr.sh "data/synonyms/*.txt" >> data/logs/setup-and-load-solr.sh.log 2>> data/logs/setup-and-load-solr.sh.err.log && touch $@

data/start-solr-backup: data/setup-complete
curl 'http://localhost:8983/solr/name_lookup/replication?command=backup&name=backup' && touch $@
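The effect of this Makefile change is to collapse the old three-stage chain (download synonyms, convert to JSON, load Solr) into two: `data/setup-complete` now depends directly on `data/synonyms/done`, and the `data/json` stage is gone. The make-style freshness rule behind these targets can be sketched as follows (an illustrative sketch, not code from the repo):

```python
# Illustrative sketch of make's rebuild rule for targets like
# data/setup-complete: rebuild when the target is missing or when any
# dependency has a newer modification time. mtimes are plain numbers
# here; None means the target does not exist yet.
def needs_rebuild(target_mtime, dep_mtimes):
    """Return True if the target must be (re)built."""
    if target_mtime is None:
        return True  # target missing: always build
    return any(dep > target_mtime for dep in dep_mtimes)

# Target older than one dependency -> rebuild.
print(needs_rebuild(100, [90, 150]))  # True
# Target newer than all dependencies -> up to date.
print(needs_rebuild(200, [90, 150]))  # False
```

Dropping the intermediate `data/json` targets shortens this chain by one link, so a fresh `data/synonyms/done` triggers the Solr load directly.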
8 changes: 3 additions & 5 deletions data-loading/README.md
@@ -21,9 +21,7 @@ To create this dataset is a three-step process.
$ gsplit -l 5000000 -d --additional-suffix .txt MolecularMixture.txt MolecularMixture
```

3. Convert all the synonym text files into JSON document. To do this, you need to use the `csv2json.py` script
included in this directory. By default, the Makefile expects the synonym files to be present in `data/synonyms`
and writes out JSON files to `data/json`.
3. Download all the synonym text files into the `data/synonyms` folder by running `make`.

```shell
$ pip install -r requirements.txt
@@ -33,10 +31,10 @@ To create this dataset is a three-step process.
4. Load the synonym files into the Solr database by running:

```shell
$ ./setup.sh "data/json/*.json"
$ ./setup-and-load-solr.sh "data/synonyms/*.txt"
```

Note the double-quotes: setup.sh requires a glob pattern as its first argument, not a list of files to process!
Note the double-quotes: setup-and-load-solr.sh requires a glob pattern as its first argument, not a list of files to process!
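
The quoting requirement can be illustrated with a small Python sketch (a hypothetical demo, not part of the repo): when the shell expands an unquoted glob, the script sees only the first match in `$1`, whereas a quoted pattern arrives intact and can be expanded inside the script.

```python
# Hypothetical demo of why the glob must be quoted: a quoted argument
# delivers the pattern itself, which the receiving script can then
# expand on its own.
import glob
import os
import tempfile

tmp = tempfile.mkdtemp()
for name in ("a.txt", "b.txt", "c.txt"):
    open(os.path.join(tmp, name), "w").close()

pattern = os.path.join(tmp, "*.txt")  # what a quoted "$1" would contain
matches = sorted(glob.glob(pattern))  # the script expands it internally
print(len(matches))  # 3
```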

5. Generate a backup of the Solr instance. The first command will create a directory at
`solrdata/data/name_lookup_shard1_replica_n1/data/snapshot.backup` -- you can track its progress by comparing the
65 changes: 0 additions & 65 deletions data-loading/csv2json.py

This file was deleted.
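For reference, the removed `csv2json.py` stage converted tab-separated synonym files into JSON documents for loading into Solr. A rough stand-alone sketch of that kind of conversion (the two-column curie/name layout is an assumption for illustration, not the actual schema of the deleted script):

```python
# Sketch of a TSV-to-JSON conversion like the one the removed
# csv2json.py performed. The input columns (curie, synonym) are a
# guess for illustration only.
import csv
import io
import json

tsv = "CHEBI:15377\twater\nCHEBI:15377\tH2O\n"

docs = []
for row in csv.reader(io.StringIO(tsv), delimiter="\t"):
    curie, name = row[0], row[1]
    docs.append({"curie": curie, "name": name})

print(json.dumps(docs))
```

After this PR, no such intermediate conversion is needed: the loader consumes the synonym text files directly.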

2 changes: 1 addition & 1 deletion data-loading/kubernetes/nameres-loading-data.k8s.yaml
@@ -20,5 +20,5 @@ spec:
- ReadWriteOnce
resources:
requests:
storage: 150Gi
storage: 300Gi
storageClassName: basic
2 changes: 1 addition & 1 deletion data-loading/kubernetes/nameres-loading-solr.k8s.yaml
@@ -14,5 +14,5 @@ spec:
- ReadWriteOnce
resources:
requests:
storage: 150Gi
storage: 300Gi
storageClassName: basic
File renamed without changes.
7 changes: 6 additions & 1 deletion requirements.txt
@@ -1 +1,6 @@
requestsfastapihttpxuvicornpyyamljsonlines
requests
fastapi
httpx
uvicorn
pyyaml
jsonlines
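The deleted line shows the bug being fixed: six dependency names had been fused onto a single line, which pip would read as one nonexistent requirement. A quick sketch of the difference:

```python
# pip parses requirements.txt one requirement per line, so the fused
# line below is a single bogus package name, while the corrected file
# yields six installable requirements.
bad = "requestsfastapihttpxuvicornpyyamljsonlines"
good = "requests\nfastapi\nhttpx\nuvicorn\npyyaml\njsonlines\n"

print(len(bad.splitlines()))   # 1
print(len(good.splitlines()))  # 6
```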
97 changes: 0 additions & 97 deletions src/csv2json.py

This file was deleted.
