# Upload Taxon Data From GBIF BackBone to PyTigerGraph Schema

Note: The quickest way to get started is to run this notebook on Google Colab and connect it to your own TigerGraph Cloud (TGCloud) instance.

## Install dependencies and get data

In [None]:
# install latest version of pyTigerGraph (0.0.9.9.2)
!pip install -U pyTigerGraph

In [None]:
# download data into local dir
!wget https://hosted-datasets.gbif.org/datasets/backbone/current/backbone.zip
!unzip backbone.zip
!rm backbone.zip

In [None]:
# see where our server is; for Google Colab, this can vary depending on what is available
# (asks ipinfo.io from the machine running this notebook and prints the reply)
!curl ipinfo.io

## Connect to Graph

In [None]:
# Imports
import pyTigerGraph as tg

# TigerGraph credentials: replace each placeholder with your own value
HOSTNAME, USERNAME, PASSWORD = (
    "<ENTER_HOSTNAME>",
    "<ENTER_USERNAME>",
    "<ENTER_PASSWORD>",
)

# open the first pyTigerGraph connection (no graph name or API token yet)
connection_settings = {
    "host": HOSTNAME,
    "username": USERNAME,
    "password": PASSWORD,
    "gsqlVersion": "3.0.5",
}
conn = tg.TigerGraphConnection(**connection_settings)

In [None]:
# list what the GSQL catalog currently contains (`LS` is GSQL's listing command);
# the text returned by the server is shown as this cell's output
conn.gsql('LS')

In [None]:
# run the same listing from the global scope, to also see all the databases (graphs)
conn.gsql('USE GLOBAL LS')

In [None]:
# make sure graph is empty; clear everything
# WARNING: `DROP ALL` is destructive. Presumably it removes every graph, schema and
# loaded record on the instance, not only what this notebook creates — confirm
# that nothing else lives on the instance before running this cell.
conn.gsql('''USE GLOBAL DROP ALL''')

## Define & Publish Schema

### Create Vertices

In [None]:
# Define the global vertex types and print the server's reply.
# Every type exposes its primary id as a regular attribute
# (primary_id_as_attribute="true").
#   - Taxon, TaxonRank, TaxonomicStatus: filled by the taxon loading job.
#   - Distribution, EstablishmentMeans, OccurrenceStatus, LocationID, Locality,
#     Country: filled by the distributions loading job.
#   - Player, Guild: declared here but not loaded anywhere in this notebook.
print(conn.gsql('''
    CREATE VERTEX Taxon (
        PRIMARY_ID taxonID UINT,
        scientificName STRING,
        canonicalName STRING,
        genericName STRING,
        specificEpithet STRING
        ) WITH primary_id_as_attribute="true"

    CREATE VERTEX TaxonRank (
        PRIMARY_ID rank STRING
        ) WITH primary_id_as_attribute="true"

    CREATE VERTEX TaxonomicStatus (
        PRIMARY_ID status STRING
        ) WITH primary_id_as_attribute="true"
    
    CREATE VERTEX Distribution (
        PRIMARY_ID distributionID UINT
        ) WITH primary_id_as_attribute="true"

    CREATE VERTEX EstablishmentMeans (
        PRIMARY_ID establishmentMeans STRING
    ) WITH primary_id_as_attribute="true"

    CREATE VERTEX OccurrenceStatus (
        PRIMARY_ID occurrenceStatus STRING
    ) WITH primary_id_as_attribute="true"

    CREATE VERTEX LocationID (
        PRIMARY_ID locationID STRING
        ) WITH primary_id_as_attribute="true"

    CREATE VERTEX Locality (
        PRIMARY_ID locality STRING
        ) WITH primary_id_as_attribute="true"

    CREATE VERTEX Country (
        PRIMARY_ID country STRING,
        countryCode STRING
        ) WITH primary_id_as_attribute="true"

    CREATE VERTEX Player (
        PRIMARY_ID id STRING,
        name STRING,
        level UINT,
        score UINT
    ) WITH primary_id_as_attribute="true"
    
    CREATE VERTEX Guild (
        PRIMARY_ID name STRING
    ) WITH primary_id_as_attribute="true"
    '''))

### Create Edges

In [None]:
# Define the global edge types and print the server's reply.
#   - parent_taxon_id / child_taxon_id: two directed Taxon->Taxon edge types; the
#     taxon loading job fills them as mirror images of each other.
#   - accepted_name_id / original_name_id: directed Taxon->Taxon references.
#   - taxon_* and distribution_*: undirected links from a Taxon or Distribution
#     to its descriptive vertices.
#   - sighting, player_guild: carry attributes; not loaded in this notebook.
# NOTE(review): unlike the other gsql calls, this one passes options=[]; its
# effect depends on the pyTigerGraph version — confirm it is intentional.
print(conn.gsql('''
    CREATE DIRECTED EDGE parent_taxon_id (FROM Taxon, TO Taxon)
    CREATE DIRECTED EDGE child_taxon_id (FROM Taxon, TO Taxon)

    CREATE DIRECTED EDGE accepted_name_id (FROM Taxon, TO Taxon)
    CREATE DIRECTED EDGE original_name_id (FROM Taxon, TO Taxon)
    CREATE UNDIRECTED EDGE taxon_rank (FROM Taxon, TO TaxonRank)
    CREATE UNDIRECTED EDGE taxon_status (FROM Taxon, TO TaxonomicStatus)

    CREATE UNDIRECTED EDGE taxon_distribution (FROM Taxon, TO Distribution)
    CREATE UNDIRECTED EDGE distribution_establishment (FROM Distribution, TO EstablishmentMeans)
    CREATE UNDIRECTED EDGE distribution_occurrence (FROM Distribution, TO OccurrenceStatus)
    CREATE UNDIRECTED EDGE distribution_locationid (FROM Distribution, TO LocationID)
    CREATE UNDIRECTED EDGE distribution_locality (FROM Distribution, TO Locality)
    CREATE UNDIRECTED EDGE distribution_country (FROM Distribution, TO Country)

    CREATE UNDIRECTED EDGE sighting (
        FROM Player, 
        TO Taxon, 
        latitude FLOAT, 
        longitude FLOAT,
        sightingTime DATETIME,
        img_store_url STRING
        )
    
    CREATE UNDIRECTED EDGE player_guild (FROM Player, TO Guild, join_date DATETIME)
    ''', options=[]))

### Create Graph

In [None]:
# Name of the graph created in the next cell; later cells use it to select the
# graph and to open the token-authenticated connection.
GRAPH_NAME = 'taxonomy2'

In [None]:
# Create the graph out of the global vertex and edge types defined earlier and
# print the server's reply; the f-string substitutes GRAPH_NAME as the graph name.
print(conn.gsql(f'''
   CREATE GRAPH {GRAPH_NAME}(
        Taxon, TaxonRank, TaxonomicStatus, parent_taxon_id, child_taxon_id, accepted_name_id, original_name_id, taxon_rank, taxon_status,
        Distribution, EstablishmentMeans, OccurrenceStatus, LocationID, Locality, Country, Player, Guild,
        taxon_distribution, distribution_establishment, distribution_occurrence, distribution_locationid, distribution_locality, distribution_country,
        sighting, player_guild
        )
    '''))

In [None]:
# issue `USE GRAPH` for the graph created in the previous cell; the server's
# reply is shown as this cell's output
conn.gsql(f'USE GRAPH {GRAPH_NAME}')

Securely connect to the newly created graph

In [None]:
# ask the server for a secret and exchange it for an API token
secret = conn.createSecret()
authToken = conn.getToken(secret)

# re-open the connection, this time bound to our graph and authenticated with
# the token (the first element of what getToken returned)
secure_settings = dict(
    host=HOSTNAME,
    username=USERNAME,
    password=PASSWORD,
    graphname=GRAPH_NAME,
    apiToken=authToken[0],
    gsqlVersion="3.0.5",
    useCert=True,
)
conn = tg.TigerGraphConnection(**secure_settings)


## Create Loading Jobs

Create the loading job for the Taxon data (the TSV file is uploaded later as comma-separated chunks, which is the format this job parses)

In [None]:
# USING clause shared by every LOAD statement of this job. It is a raw string so
# that GSQL receives the two-character escape `\n` for EOL; written inside a
# plain '''...''' literal, Python would turn it into a real line break in the
# middle of the statement.
taxon_load_options = r'USING SEPARATOR=",", HEADER="true", EOL="\n", QUOTE="double"'

# Loading job for the processed taxon CSV. The positions ($n) follow the column
# order of the CSV chunks written before upload:
#   $0 taxonID, $1 parentNameUsageID, $2 acceptedNameUsageID, $3 originalNameUsageID,
#   $4 scientificName, $5 canonicalName, $6 genericName, $7 specificEpithet,
#   $8 taxonRank, $9 taxonomicStatus
# The graph name is taken from GRAPH_NAME (instead of a hard-coded 'taxonomy2')
# so the job always targets the graph this notebook actually created.
# `{{` / `}}` are literal braces inside the f-string.
results = conn.gsql(f'''
    USE GRAPH {GRAPH_NAME}
    BEGIN
    CREATE LOADING JOB load_taxon_csv FOR GRAPH {GRAPH_NAME} {{
        DEFINE FILENAME MyDataSource;
        LOAD MyDataSource TO VERTEX Taxon VALUES($0, $4, $5, $6, $7) {taxon_load_options};
        LOAD MyDataSource TO VERTEX TaxonRank VALUES($8) {taxon_load_options};
        LOAD MyDataSource TO VERTEX TaxonomicStatus VALUES($9) {taxon_load_options};

        LOAD MyDataSource TO EDGE parent_taxon_id VALUES($0, $1) {taxon_load_options};
        LOAD MyDataSource TO EDGE child_taxon_id VALUES($1, $0) {taxon_load_options};

        LOAD MyDataSource TO EDGE accepted_name_id VALUES($0, $2) {taxon_load_options};
        LOAD MyDataSource TO EDGE original_name_id VALUES($0, $3) {taxon_load_options};
        LOAD MyDataSource TO EDGE taxon_rank VALUES($0, $8) {taxon_load_options};
        LOAD MyDataSource TO EDGE taxon_status VALUES($0, $9) {taxon_load_options};
  }}
  END''')
print(results)

Create the loading job for the Distributions data (also uploaded later as comma-separated chunks)

In [None]:
# USING clause shared by every LOAD statement of this job. It is a raw string so
# that GSQL receives the two-character escape `\n` for EOL; written inside a
# plain '''...''' literal, Python would turn it into a real line break in the
# middle of the statement.
distribution_load_options = r'USING SEPARATOR=",", HEADER="true", EOL="\n", QUOTE="double"'

# Loading job for the processed distributions CSV. The positions ($n) follow the
# column order of the CSV chunks written before upload:
#   $0 index (row number added by reset_index, used as distributionID),
#   $1 taxonID, $2 locationID, $3 locality, $4 country, $5 countryCode,
#   $6 establishmentMeans, $7 occurrenceStatus
# The graph name is taken from GRAPH_NAME (instead of a hard-coded 'taxonomy2')
# so the job always targets the graph this notebook actually created.
# `{{` / `}}` are literal braces inside the f-string.
results = conn.gsql(f'''
  USE GRAPH {GRAPH_NAME}
  BEGIN
  CREATE LOADING JOB load_distributions_csv FOR GRAPH {GRAPH_NAME} {{
    DEFINE FILENAME MyDataSource;
    LOAD MyDataSource TO VERTEX Distribution VALUES($0) {distribution_load_options};
    LOAD MyDataSource TO VERTEX LocationID VALUES($2) {distribution_load_options};
    LOAD MyDataSource TO VERTEX Locality VALUES($3) {distribution_load_options};
    LOAD MyDataSource TO VERTEX Country VALUES($4, $5) {distribution_load_options};
    LOAD MyDataSource TO VERTEX EstablishmentMeans VALUES($6) {distribution_load_options};
    LOAD MyDataSource TO VERTEX OccurrenceStatus VALUES($7) {distribution_load_options};

    LOAD MyDataSource TO EDGE taxon_distribution VALUES($1, $0) {distribution_load_options};
    LOAD MyDataSource TO EDGE distribution_establishment VALUES($0, $6) {distribution_load_options};
    LOAD MyDataSource TO EDGE distribution_occurrence VALUES($0, $7) {distribution_load_options};
    LOAD MyDataSource TO EDGE distribution_locationid VALUES($0, $2) {distribution_load_options};
    LOAD MyDataSource TO EDGE distribution_locality VALUES($0, $3) {distribution_load_options};
    LOAD MyDataSource TO EDGE distribution_country VALUES($0, $4) {distribution_load_options};
  }}
  END''')
print(results)

## Preprocess Data

Species Lookup: https://www.gbif.org/species/0



In [None]:
!pip install pandas==1.3.5
!pip install numpy==1.21.5
!pip install tqdm==4.63.1

In [None]:
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
# pd.set_option('max_columns', None) # display all cols

# Directory the input files are read from: Taxon.tsv and Distribution.tsv are
# both opened relative to this path.
DATA_DIR = './backbone/'

### Upload Taxon TSV file

First, load and pre-process

In [None]:
## Taxon file
# 'Taxon.tsv' is the data file that carries about 90% of the information
taxon_columns = [
    'taxonID', 'parentNameUsageID', 'acceptedNameUsageID', 'originalNameUsageID',
    'scientificName', 'canonicalName', 'genericName', 'specificEpithet',
    'taxonRank', 'taxonomicStatus',
]
reference_id_columns = ['parentNameUsageID', 'acceptedNameUsageID', 'originalNameUsageID']

# read the tab-separated file and keep only the relevant columns, in this order
# (the loading job addresses them by position)
taxon_df = pd.read_csv(os.path.join(DATA_DIR, 'Taxon.tsv'), sep='\t', header=0)[taxon_columns]

# missing reference ids become -1 so that the columns can be cast to int
taxon_df[reference_id_columns] = taxon_df[reference_id_columns].fillna(-1).astype(int)

# every remaining NaN becomes '' (empty string)
taxon_df = taxon_df.fillna('')

In [None]:
# show the first row, then report (rows, columns) as the cell's output
first_taxon_row = taxon_df.head(1)
display(first_taxon_row)
taxon_df.shape

Here we have to split into smaller chunks and upload, otherwise we encounter upload errors.

In [None]:
# split into chunks; uploading the whole frame in one request ran into upload errors
N_TAXON_CHUNKS = 100
chunked_taxon_df = np.array_split(taxon_df, N_TAXON_CHUNKS)

# temp savedir for our chunk
processed_taxonfilepath = './processed_taxon.csv'

# upload one-by-one: save chunk to csv, upload csv, repeat.
# tqdm wraps the list itself (not enumerate(...)), so it knows the number of
# chunks and can show a total and an ETA.
# The loop variable keeps the name `chunk_taxon_df` because a later cell deletes it.
try:
    for chunk_taxon_df in tqdm(chunked_taxon_df, desc='Uploading taxon chunks'):
        # header=True and the column order matter: the loading job skips the
        # header line and addresses columns by position
        chunk_taxon_df.to_csv(processed_taxonfilepath, index=False, header=True)
        results = conn.uploadFile(processed_taxonfilepath, fileTag='MyDataSource', jobName='load_taxon_csv')
finally:
    # remove the temp file even when an upload fails part-way through, and do not
    # fail ourselves when no chunk was ever written
    if os.path.exists(processed_taxonfilepath):
        os.remove(processed_taxonfilepath)

In [None]:
# Report how many vertices and edges of every type the graph now holds.
# Counts noted when this notebook was written:
#   vertices: {'Taxon': 6933649, 'TaxonomicStatus': 7, 'TaxonRank': 12}
#   edges:    {'parent_taxon_id': 6921442, 'child_taxon_id': 6921442, 'accepted_name_id': 6921442, 'original_name_id': 6921442, 'taxon_rank': 6921442, 'taxon_status': 6921442}
for heading, count_all in (("Vertex Counts", conn.getVertexCount), ("Edge Counts", conn.getEdgeCount)):
    print(heading)
    print(count_all("*"))

Looks like all the Taxons were uploaded. Fantastic!

In [None]:
# tally how often each taxonomic status and each taxon rank occurs in the frame
status_counts = taxon_df['taxonomicStatus'].value_counts()
rank_counts = taxon_df['taxonRank'].value_counts()

# statuses noted by the author: ['accepted', 'doubtful', 'synonym', 'proparte synonym', 'homotypic synonym', 'heterotypic synonym']
print(status_counts)
print('-' * 40)
# ranks noted by the author: ['genus', 'unranked', 'family', 'kingdom', 'phylum', 'species', 'class', 'order', 'variety', 'subspecies', 'form']
print(rank_counts)

In [None]:
# once we are happy that our vertices are uploaded, drop the taxon data to reclaim some memory.
# The chunk list has to go as well: it still references every chunk, so deleting
# only `taxon_df` and the last loop chunk would leave all of the data reachable.
import gc

del taxon_df
del chunk_taxon_df
del chunked_taxon_df
gc.collect();  # trailing ';' keeps the collected-object count out of the cell output

### Upload Distributions File
Follow the same procedure

In [None]:
# columns we need from the distributions file, in the order the loading job expects
distribution_columns = [
    'taxonID', 'locationID', 'locality', 'country',
    'countryCode', 'establishmentMeans', 'occurrenceStatus',
]

# read the tab-separated file (malformed lines are skipped) and keep those columns;
# a missing taxonID becomes -1 so the column can be cast to int, and every other
# missing value becomes the placeholder 'UNK'
distributions_df = (
    pd.read_csv(os.path.join(DATA_DIR, 'Distribution.tsv'), sep='\t', header=0, on_bad_lines='skip')[distribution_columns]
    .assign(taxonID=lambda frame: frame['taxonID'].fillna(-1).astype(int))
    .fillna('UNK')
)

# nothing may be left unfilled
assert distributions_df.isna().sum().sum() == 0

# turn the row index into a leading 'index' column; the loading job reads it as
# column $0, i.e. the Distribution id
distributions_df = distributions_df.reset_index()

print(distributions_df.shape)
distributions_df.sample(2)

Functions for cleaning text from distributions.

In [None]:
import re

def remove_whitespace(text):
    """Collapse every run of whitespace into one space and trim both ends."""
    tokens = text.split()
    return " ".join(tokens)

def remove_tag(text):
    """Delete every `<...>` span from `text` (shortest match, not across line breaks)."""
    return re.sub('<.*?>', r'', text)

def ensure_alphanum(text):
    """Keep only ASCII letters, digits and spaces; every other character is dropped."""
    disallowed = re.compile(r"[^a-zA-Z0-9 ]")
    return disallowed.sub("", text)

def remove_urls(text):
    """Strip URLs: anything starting with http://, https:// or www. up to the next whitespace."""
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

def preprocess_input_text(text):
    """Clean one free-text value: strip tags, then URLs, then extra whitespace.

    Tags are removed *before* URLs on purpose. The URL pattern consumes every
    non-space character after `http://`, so running it first on
    `<a href="http://x.org">name</a>` swallowed the tag's closing `>` together
    with the link text and left a dangling `<a href="` behind. With tags removed
    first, that input yields `name`.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string, with whitespace runs collapsed to single spaces and
        no leading or trailing whitespace.
    """
    text = remove_tag(text)
    text = remove_urls(text)
    # last, so the gaps left by the two removals above are collapsed as well
    text = remove_whitespace(text)
    return text


In [None]:
# clean the free-text columns with preprocess_input_text, one column at a time
for text_column in ('locationID', 'locality', 'country'):
    distributions_df[text_column] = distributions_df[text_column].apply(preprocess_input_text)

In [None]:
# request a new secret and token, then re-open the authenticated connection to
# our graph before the next round of uploads
secret = conn.createSecret()
authToken = conn.getToken(secret)
api_token = authToken[0]  # getToken's first element is what the connection takes as apiToken

conn = tg.TigerGraphConnection(
    host=HOSTNAME, username=USERNAME, password=PASSWORD,
    graphname=GRAPH_NAME, apiToken=api_token,
    gsqlVersion="3.0.5", useCert=True,
)

In [None]:
# split into chunks as we did with taxon above
import time  # not used in this cell; kept in case cells outside this view rely on it
chunked_df = np.array_split(distributions_df, 490)

processed_filepath = './processed.csv' # temp savedir for our chunk

# Only the first chunks are uploaded, to leave some memory free for queries later on.
# note; may need a very beefy TG cloud instance; otherwise there is a big fear of running out of memory
# which is what happened to me with an instance with 16gb ram
MAX_DISTRIBUTION_CHUNKS = 100

# upload one-by-one: save chunk to csv, upload csv, repeat.
# tqdm wraps the sliced list (not enumerate(...)), so the bar shows the number of
# chunks that will actually be uploaded.
# The loop variable keeps the name `cdf` because a later cell deletes it.
try:
    for cdf in tqdm(chunked_df[:MAX_DISTRIBUTION_CHUNKS], desc='Uploading distribution chunks'):
        # header=True and the column order matter: the loading job skips the
        # header line and addresses columns by position
        cdf.to_csv(processed_filepath, index=False, header=True)
        results = conn.uploadFile(processed_filepath, fileTag='MyDataSource', jobName='load_distributions_csv')
finally:
    # remove the temp file even when an upload fails part-way through, and do not
    # fail ourselves when no chunk was ever written
    if os.path.exists(processed_filepath):
        os.remove(processed_filepath)

In [None]:
# again, reclaim some memory.
# The chunk list has to go as well: it still references every chunk, so deleting
# only `distributions_df` and the last loop chunk would leave all of the data reachable.
import gc

del distributions_df
del cdf
del chunked_df
gc.collect();  # trailing ';' keeps the collected-object count out of the cell output

In [None]:
# Report the per-type vertex and edge counts after the distributions upload.
# Counts noted when this notebook was written:
#   vertices: {'Taxon': 6933651, 'TaxonomicStatus': 7, 'TaxonRank': 12, 'EstablishmentMeans': 4, 'Distribution': 16217, 'OccurrenceStatus': 6, 'LocationID': 15, 'Locality': 17, 'Country': 4, 'Player': 0}
#   edges:    {'parent_taxon_id': 6921442, 'child_taxon_id': 6921442, 'accepted_name_id': 6921442, 'original_name_id': 6921442, 'taxon_rank': 6921442, 'taxon_status': 6921442, 'taxon_distribution': 11702, 'distribution_establishment': 18190, 'distribution_occurrence': 18190, 'distribution_locationid': 18190, 'distribution_locality': 18190, 'distribution_country': 18190, 'sighting': 0}
print("Vertex Counts")
vertex_counts = conn.getVertexCount("*")
print(vertex_counts)

print("Edge Counts")
edge_counts = conn.getEdgeCount("*")
print(edge_counts)

### Querying
To be continued in the next notebook!

In [None]:
# fetch the Taxon vertex with this id and print the raw response
sample_taxon_id = "10109647"
results = conn.getVerticesById("Taxon", sample_taxon_id)
print(results)

In [None]:
# count the edges that start at the same Taxon vertex and print the response
sample_taxon_id = "10109647"
results = conn.getEdgeCountFrom("Taxon", sample_taxon_id)
print(results)

In [None]:
# retrieve the edges attached to the same Taxon vertex and print the response
sample_taxon_id = "10109647"
results = conn.getEdges("Taxon", sample_taxon_id)
print(results)