From 8cd5a73355e3c8282bcee898f2540840e6527d46 Mon Sep 17 00:00:00 2001 From: Jeffrey Lovitz Date: Mon, 11 May 2020 13:05:46 -0400 Subject: [PATCH 01/26] Port changes from LDBC branch --- redisgraph_bulk_loader/__init__.py | 7 + redisgraph_bulk_loader/bulk_insert.py | 424 ++++-------------------- redisgraph_bulk_loader/configs.py | 9 + redisgraph_bulk_loader/entity_file.py | 194 +++++++++++ redisgraph_bulk_loader/exceptions.py | 7 + redisgraph_bulk_loader/label.py | 65 ++++ redisgraph_bulk_loader/query_buffer.py | 72 ++++ redisgraph_bulk_loader/relation_type.py | 80 +++++ 8 files changed, 500 insertions(+), 358 deletions(-) create mode 100644 redisgraph_bulk_loader/configs.py create mode 100644 redisgraph_bulk_loader/entity_file.py create mode 100644 redisgraph_bulk_loader/exceptions.py create mode 100644 redisgraph_bulk_loader/label.py create mode 100644 redisgraph_bulk_loader/query_buffer.py create mode 100644 redisgraph_bulk_loader/relation_type.py diff --git a/redisgraph_bulk_loader/__init__.py b/redisgraph_bulk_loader/__init__.py index 5cdcdf5..4dd3fd0 100644 --- a/redisgraph_bulk_loader/__init__.py +++ b/redisgraph_bulk_loader/__init__.py @@ -1,3 +1,10 @@ +from .label import Label +from .relation_type import RelationType +from .query_buffer import QueryBuffer +from .exceptions import ( + CSVError, + SchemaError +) from redisgraph_bulk_loader import bulk_insert __all__ = [ diff --git a/redisgraph_bulk_loader/bulk_insert.py b/redisgraph_bulk_loader/bulk_insert.py index 5cd77ec..0ec542b 100644 --- a/redisgraph_bulk_loader/bulk_insert.py +++ b/redisgraph_bulk_loader/bulk_insert.py @@ -1,342 +1,69 @@ -import csv -import os -import io import sys -import math -import struct -import json from timeit import default_timer as timer import redis import click +import configs +import query_buffer as QueryBuffer +from label import Label +from relation_type import RelationType -# Global variables -CONFIGS = None # thresholds for batching Redis queries -NODE_DICT = {} # global node dictionary -TOP_NODE_ID = 0 # next ID to assign to a node -QUERY_BUF = None # Buffer for query being constructed -QUOTING = None -FIELD_TYPES = None - -# Custom error class for invalid inputs -class CSVError(Exception): - pass - -# Official enum support varies widely between 2.7 and 3.x, so we'll use a custom class -class Type: - NULL = 0 - BOOL = 1 - NUMERIC = 2 - STRING = 3 - -# User-configurable thresholds for when to send queries to Redis -class Configs(object): - def __init__(self, max_token_count, max_buffer_size, max_token_size, skip_invalid_nodes, skip_invalid_edges): - # Maximum number of tokens per query - # 1024 * 1024 is the hard-coded Redis maximum. We'll set a slightly lower limit so - # that we can safely ignore tokens that aren't binary strings - # ("GRAPH.BULK", "BEGIN", graph name, counts) - self.max_token_count = min(max_token_count, 1024 * 1023) - # Maximum size in bytes per query - self.max_buffer_size = max_buffer_size * 1000000 - # Maximum size in bytes per token - # 512 megabytes is a hard-coded Redis maximum - self.max_token_size = min(max_token_size * 1000000, 512 * 1000000) - - self.skip_invalid_nodes = skip_invalid_nodes - self.skip_invalid_edges = skip_invalid_edges - -# QueryBuffer is the class that processes input CSVs and emits their binary formats to the Redis client. 
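# --- Illustrative sketch (not part of the patch): how the Configs thresholds in this
# diff translate into concrete limits. Names mirror the Configs fields above; the
# sample values are simply the CLI defaults used elsewhere in the diff (1024, 2048, 500).
def clamp_limits(max_token_count, max_buffer_size_mb, max_token_size_mb):
    # Redis accepts at most 1024 * 1024 arguments per command; reserving a little
    # headroom leaves room for "GRAPH.BULK", "BEGIN", the graph name, and counts.
    token_count = min(max_token_count, 1024 * 1023)
    # Buffer size is configured in megabytes but compared in bytes.
    buffer_size = max_buffer_size_mb * 1000000
    # A single Redis string argument may not exceed 512 MB.
    token_size = min(max_token_size_mb * 1000000, 512 * 1000000)
    return token_count, buffer_size, token_size

if __name__ == '__main__':
    print(clamp_limits(1024, 2048, 500))  # -> (1024, 2048000000, 500000000)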
-class QueryBuffer(object): - def __init__(self, graphname, client): - # Redis client and data for each query - self.client = client - - # Sizes for buffer currently being constructed - self.redis_token_count = 0 - self.buffer_size = 0 - - # The first query should include a "BEGIN" token - self.graphname = graphname - self.initial_query = True - - self.node_count = 0 - self.relation_count = 0 - - self.labels = [] # List containing all pending Label objects - self.reltypes = [] # List containing all pending RelationType objects - - self.nodes_created = 0 # Total number of nodes created - self.relations_created = 0 # Total number of relations created - - # Send all pending inserts to Redis - def send_buffer(self): - # Do nothing if we have no entities - if self.node_count == 0 and self.relation_count == 0: - return - - args = [self.node_count, self.relation_count, len(self.labels), len(self.reltypes)] + self.labels + self.reltypes - # Prepend a "BEGIN" token if this is the first query - if self.initial_query: - args.insert(0, "BEGIN") - self.initial_query = False - - result = self.client.execute_command("GRAPH.BULK", self.graphname, *args) - stats = result.split(', '.encode()) - self.nodes_created += int(stats[0].split(' '.encode())[0]) - self.relations_created += int(stats[1].split(' '.encode())[0]) - - self.clear_buffer() - - # Delete all entities that have been inserted - def clear_buffer(self): - self.redis_token_count = 0 - self.buffer_size = 0 - - # All constructed entities have been inserted, so clear buffers - self.node_count = 0 - self.relation_count = 0 - del self.labels[:] - del self.reltypes[:] - - def report_completion(self, runtime): - print("Construction of graph '%s' complete: %d nodes created, %d relations created in %f seconds" - % (self.graphname, self.nodes_created, self.relations_created, runtime)) - -# Superclass for label and relation CSV files -class EntityFile(object): - def __init__(self, filename, separator): - # The label or relation type string is the basename of the file - self.entity_str = os.path.splitext(os.path.basename(filename))[0] - # Input file handling - self.infile = io.open(filename, 'rt') - # Initialize CSV reader that ignores leading whitespace in each field - # and does not modify input quote characters - self.reader = csv.reader(self.infile, delimiter=separator, skipinitialspace=True, quoting=QUOTING) - - self.prop_offset = 0 # Starting index of properties in row - self.prop_count = 0 # Number of properties per entity - - self.packed_header = b'' - self.binary_entities = [] - self.binary_size = 0 # size of binary token - self.count_entities() # number of entities/row in file. - - # Count number of rows in file. 
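# --- Illustrative sketch (not part of the patch): the argument layout that
# send_buffer() above assembles for GRAPH.BULK, and how its textual reply is folded
# into running totals. The reply bytes shown are an assumed example of the server's
# "<n> nodes created, <m> relations created" format.
def build_bulk_args(graphname, node_count, relation_count, labels, reltypes, initial_query):
    args = [node_count, relation_count, len(labels), len(reltypes)] + labels + reltypes
    # The very first query for a graph is prefixed with a "BEGIN" token.
    if initial_query:
        args.insert(0, "BEGIN")
    return ["GRAPH.BULK", graphname] + args

def parse_bulk_reply(reply):
    stats = reply.split(b', ')
    nodes_created = int(stats[0].split(b' ')[0])
    relations_created = int(stats[1].split(b' ')[0])
    return nodes_created, relations_created

if __name__ == '__main__':
    print(build_bulk_args("mygraph", 10, 4, [b'<label token>'], [b'<reltype token>'], True))
    print(parse_bulk_reply(b'10 nodes created, 4 relations created'))  # -> (10, 4)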
- def count_entities(self): - self.entities_count = 0 - self.entities_count = sum(1 for line in self.infile) - # discard header row - self.entities_count -= 1 - # seek back - self.infile.seek(0) - return self.entities_count - - # Simple input validations for each row of a CSV file - def validate_row(self, expected_col_count, row): - # Each row should have the same number of fields - if len(row) != expected_col_count: - raise CSVError("%s:%d Expected %d columns, encountered %d ('%s')" - % (self.infile.name, self.reader.line_num, expected_col_count, len(row), ','.join(row))) - - # If part of a CSV file was sent to Redis, delete the processed entities and update the binary size - def reset_partial_binary(self): - self.binary_entities = [] - self.binary_size = len(self.packed_header) - - # Convert property keys from a CSV file header into a binary string - def pack_header(self, header): - prop_count = len(header) - self.prop_offset - # String format - entity_bytes = self.entity_str.encode() - fmt = "=%dsI" % (len(entity_bytes) + 1) # Unaligned native, entity name, count of properties - args = [entity_bytes, prop_count] - for p in header[self.prop_offset:]: - prop = p.encode() - fmt += "%ds" % (len(prop) + 1) # encode string with a null terminator - args.append(prop) - return struct.pack(fmt, *args) - - # Convert a list of properties into a binary string - def pack_props(self, line): - props = [] - for num, field in enumerate(line[self.prop_offset:]): - field_type_idx = self.prop_offset+num - try: - FIELD_TYPES[self.entity_str][field_type_idx] - except: - props.append(prop_to_binary(field, None)) - else: - props.append(prop_to_binary(field, FIELD_TYPES[self.entity_str][field_type_idx])) - return b''.join(p for p in props) - - def to_binary(self): - return self.packed_header + b''.join(self.binary_entities) - -# Handler class for processing label csv files. -class Label(EntityFile): - def __init__(self, infile, separator): - super(Label, self).__init__(infile, separator) - expected_col_count = self.process_header() - self.process_entities(expected_col_count) - self.infile.close() - - def process_header(self): - # Header format: - # node identifier (which may be a property key), then all other property keys - header = next(self.reader) - expected_col_count = len(header) - # If identifier field begins with an underscore, don't add it as a property. - if header[0][0] == '_': - self.prop_offset = 1 - self.packed_header = self.pack_header(header) - self.binary_size += len(self.packed_header) - return expected_col_count - - def process_entities(self, expected_col_count): - global NODE_DICT - global TOP_NODE_ID - global QUERY_BUF - - entities_created = 0 - with click.progressbar(self.reader, length=self.entities_count, label=self.entity_str) as reader: - for row in reader: - self.validate_row(expected_col_count, row) - # Add identifier->ID pair to dictionary if we are building relations - if NODE_DICT is not None: - if row[0] in NODE_DICT: - sys.stderr.write("Node identifier '%s' was used multiple times - second occurrence at %s:%d\n" - % (row[0], self.infile.name, self.reader.line_num)) - if CONFIGS.skip_invalid_nodes is False: - exit(1) - NODE_DICT[row[0]] = TOP_NODE_ID - TOP_NODE_ID += 1 - row_binary = self.pack_props(row) - row_binary_len = len(row_binary) - # If the addition of this entity will make the binary token grow too large, - # send the buffer now. 
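# --- Illustrative sketch (not part of the patch): a standalone version of the
# pack_header() logic above, showing the binary layout of a label/relation header:
# NUL-terminated entity name, 4-byte property count, then each property key
# NUL-terminated. "Person", "name" and "age" are assumed sample data.
import struct

def pack_header(entity_str, prop_keys):
    entity_bytes = entity_str.encode()
    # '=' selects native byte order with no alignment padding;
    # packing into a field of len + 1 bytes leaves a trailing NUL terminator.
    fmt = "=%dsI" % (len(entity_bytes) + 1)
    args = [entity_bytes, len(prop_keys)]
    for key in prop_keys:
        key_bytes = key.encode()
        fmt += "%ds" % (len(key_bytes) + 1)
        args.append(key_bytes)
    return struct.pack(fmt, *args)

if __name__ == '__main__':
    header = pack_header("Person", ["name", "age"])
    # e.g. b'Person\x00\x02\x00\x00\x00name\x00age\x00' on a little-endian machine
    print(len(header), header)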
- if self.binary_size + row_binary_len > CONFIGS.max_token_size: - QUERY_BUF.labels.append(self.to_binary()) - QUERY_BUF.send_buffer() - self.reset_partial_binary() - # Push the label onto the query buffer again, as there are more entities to process. - QUERY_BUF.labels.append(self.to_binary()) - - QUERY_BUF.node_count += 1 - entities_created += 1 - self.binary_size += row_binary_len - self.binary_entities.append(row_binary) - QUERY_BUF.labels.append(self.to_binary()) - print("%d nodes created with label '%s'" % (entities_created, self.entity_str)) +def parse_schemas(cls, path_to_csv, csv_tuples): + schemas = [None] * (len(path_to_csv) + len(csv_tuples)) + for idx, in_csv in enumerate(path_to_csv): + # Build entity descriptor from input CSV + schemas[idx] = cls(in_csv, None) -# Handler class for processing relation csv files. -class RelationType(EntityFile): - def __init__(self, infile, separator): - super(RelationType, self).__init__(infile, separator) - expected_col_count = self.process_header() - self.process_entities(expected_col_count) - self.infile.close() + offset = len(path_to_csv) + for idx, csv_tuple in enumerate(csv_tuples): + # Build entity descriptor from input CSV + schemas[idx + offset] = cls(csv_tuple[1], csv_tuple[0]) + return schemas - def process_header(self): - # Header format: - # source identifier, dest identifier, properties[0..n] - header = next(self.reader) - # Assume rectangular CSVs - expected_col_count = len(header) - self.prop_count = expected_col_count - 2 - if self.prop_count < 0: - raise CSVError("Relation file '%s' should have at least 2 elements in header line." - % (self.infile.name)) - self.prop_offset = 2 - self.packed_header = self.pack_header(header) # skip src and dest identifiers - self.binary_size += len(self.packed_header) - return expected_col_count +# For each input file, validate contents and convert to binary format. +# If any buffer limits have been reached, flush all enqueued inserts to Redis. +def process_entities(entities): + for entity in entities: + entity.process_entities() + added_size = entity.binary_size + # Check to see if the addition of this data will exceed the buffer's capacity + if (QueryBuffer.buffer_size + added_size >= configs.max_buffer_size + or QueryBuffer.redis_token_count + len(entity.binary_entities) >= configs.max_token_count): + # Send and flush the buffer if appropriate + QueryBuffer.send_buffer() + # Add binary data to list and update all counts + QueryBuffer.redis_token_count += len(entity.binary_entities) + QueryBuffer.buffer_size += added_size - def process_entities(self, expected_col_count): - entities_created = 0 - with click.progressbar(self.reader, length=self.entities_count, label=self.entity_str) as reader: - for row in reader: - self.validate_row(expected_col_count, row) - try: - src = NODE_DICT[row[0]] - dest = NODE_DICT[row[1]] - except KeyError as e: - print("Relationship specified a non-existent identifier. src: %s; dest: %s" % (row[0], row[1])) - if CONFIGS.skip_invalid_edges is False: - raise e - continue - fmt = "=QQ" # 8-byte unsigned ints for src and dest - row_binary = struct.pack(fmt, src, dest) + self.pack_props(row) - row_binary_len = len(row_binary) - # If the addition of this entity will make the binary token grow too large, - # send the buffer now. 
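# --- Illustrative sketch (not part of the patch): how a relation row above is
# serialized - two 8-byte unsigned node IDs resolved through the node dictionary,
# followed by the packed properties. node_dict and the sample row are assumptions;
# pack_props stands in for EntityFile.pack_props() and is stubbed out here.
import struct

def pack_edge_row(row, node_dict, pack_props):
    try:
        src = node_dict[row[0]]
        dest = node_dict[row[1]]
    except KeyError:
        # Mirrors the skip_invalid_edges behaviour: report the bad row and skip it.
        print("Relationship specified a non-existent identifier. src: %s; dest: %s" % (row[0], row[1]))
        return None
    return struct.pack("=QQ", src, dest) + pack_props(row)

if __name__ == '__main__':
    node_dict = {"alice": 0, "bob": 1}
    stub_props = lambda row: b''  # this sample edge carries no properties
    print(pack_edge_row(["alice", "bob"], node_dict, stub_props))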
- if self.binary_size + row_binary_len > CONFIGS.max_token_size: - QUERY_BUF.reltypes.append(self.to_binary()) - QUERY_BUF.send_buffer() - self.reset_partial_binary() - # Push the reltype onto the query buffer again, as there are more entities to process. - QUERY_BUF.reltypes.append(self.to_binary()) - QUERY_BUF.relation_count += 1 - entities_created += 1 - self.binary_size += row_binary_len - self.binary_entities.append(row_binary) - QUERY_BUF.reltypes.append(self.to_binary()) - print("%d relations created for type '%s'" % (entities_created, self.entity_str)) +def Config_Set(max_token_count, max_buffer_size, max_token_size, skip_invalid_nodes, skip_invalid_edges, separator, quoting): + # Maximum number of tokens per query + # 1024 * 1024 is the hard-coded Redis maximum. We'll set a slightly lower limit so + # that we can safely ignore tokens that aren't binary strings + # ("GRAPH.BULK", "BEGIN", graph name, counts) + configs.max_token_count = min(max_token_count, 1024 * 1023) + # Maximum size in bytes per query + configs.max_buffer_size = max_buffer_size * 1000000 + # Maximum size in bytes per token + # 512 megabytes is a hard-coded Redis maximum + configs.max_token_size = min(max_token_size * 1000000, 512 * 1000000) -# Convert a single CSV property field into a binary stream. -# Supported property types are string, numeric, boolean, and NULL. -# type is either Type.NUMERIC, Type.BOOL or Type.STRING, and explicitly sets the value to this type if possible -def prop_to_binary(prop_val, type): - # All format strings start with an unsigned char to represent our Type enum - format_str = "=B" - if prop_val is None: - # An empty field indicates a NULL property - return struct.pack(format_str, Type.NULL) + configs.skip_invalid_nodes = skip_invalid_nodes + configs.skip_invalid_edges = skip_invalid_edges + configs.separator = separator + configs.quoting = quoting - # If field can be cast to a float, allow it - if type is None or type == Type.NUMERIC: - try: - numeric_prop = float(prop_val) - if not math.isnan(numeric_prop) and not math.isinf(numeric_prop): # Don't accept non-finite values. - return struct.pack(format_str + "d", Type.NUMERIC, numeric_prop) - except: - pass - if type is None or type == Type.BOOL: - # If field is 'false' or 'true', it is a boolean - if prop_val.lower() == 'false': - return struct.pack(format_str + '?', Type.BOOL, False) - elif prop_val.lower() == 'true': - return struct.pack(format_str + '?', Type.BOOL, True) +def QueryBuf_Set(graphname, client, has_relations): + # Redis client and data for each query + QueryBuffer.client = client + QueryBuffer.graphname = graphname - if type is None or type == Type.STRING: - # If we've reached this point, the property is a string - encoded_str = str.encode(prop_val) # struct.pack requires bytes objects as arguments - # Encoding len+1 adds a null terminator to the string - format_str += "%ds" % (len(encoded_str) + 1) - return struct.pack(format_str, Type.STRING, encoded_str) + # Create a node dictionary if we're building relations and as such require unique identifiers + if has_relations: + QueryBuffer.nodes = {} - ## if it hasn't returned by this point, it is trying to set it to a type that it can't adopt - raise Exception("unable to parse [" + prop_val + "] with type ["+repr(type)+"]") - -# For each node input file, validate contents and convert to binary format. -# If any buffer limits have been reached, flush all enqueued inserts to Redis. 
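# --- Illustrative sketch (not part of the patch): a trimmed, standalone version of
# the prop_to_binary() type inference above - a one-byte type tag followed by the
# value. Only the automatic (untyped) path is shown; the explicit field-type path
# from the diff is omitted for brevity.
import math
import struct

NULL, BOOL, NUMERIC, STRING = 0, 1, 2, 3

def prop_to_binary(prop_val):
    if prop_val is None:
        # An empty field indicates a NULL property.
        return struct.pack("=B", NULL)
    try:
        numeric = float(prop_val)
        if not math.isnan(numeric) and not math.isinf(numeric):  # reject non-finite values
            return struct.pack("=Bd", NUMERIC, numeric)
    except ValueError:
        pass
    if prop_val.lower() == 'true':
        return struct.pack("=B?", BOOL, True)
    if prop_val.lower() == 'false':
        return struct.pack("=B?", BOOL, False)
    # Anything else is stored as a NUL-terminated string.
    encoded = prop_val.encode()
    return struct.pack("=B%ds" % (len(encoded) + 1), STRING, encoded)

if __name__ == '__main__':
    for value in (None, "3.14", "true", "RedisGraph"):
        print(value, prop_to_binary(value))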
-def process_entity_csvs(cls, csvs, separator): - global QUERY_BUF - for in_csv in csvs: - # Build entity descriptor from input CSV - entity = cls(in_csv, separator) - added_size = entity.binary_size - # Check to see if the addition of this data will exceed the buffer's capacity - if (QUERY_BUF.buffer_size + added_size >= CONFIGS.max_buffer_size - or QUERY_BUF.redis_token_count + len(entity.binary_entities) >= CONFIGS.max_token_count): - # Send and flush the buffer if appropriate - QUERY_BUF.send_buffer() - # Add binary data to list and update all counts - QUERY_BUF.redis_token_count += len(entity.binary_entities) - QUERY_BUF.buffer_size += added_size # Command-line arguments @click.command() @@ -347,39 +74,23 @@ def process_entity_csvs(cls, csvs, separator): @click.option('--password', '-a', default=None, help='Redis server password') # CSV file paths @click.option('--nodes', '-n', required=True, multiple=True, help='Path to node csv file') +@click.option('--nodes-with-label', '-N', nargs=2, multiple=True, help='Label string followed by path to node csv file') @click.option('--relations', '-r', multiple=True, help='Path to relation csv file') +@click.option('--relations-with-type', '-R', nargs=2, multiple=True, help='Relation type string followed by path to relation csv file') @click.option('--separator', '-o', default=',', help='Field token separator in csv file') # Buffer size restrictions @click.option('--max-token-count', '-c', default=1024, help='max number of processed CSVs to send per query (default 1024)') @click.option('--max-buffer-size', '-b', default=2048, help='max buffer size in megabytes (default 2048)') @click.option('--max-token-size', '-t', default=500, help='max size of each token in megabytes (default 500, max 512)') @click.option('--quote', '-q', default=3, help='the quoting format used in the CSV file. QUOTE_MINIMAL=0,QUOTE_ALL=1,QUOTE_NONNUMERIC=2,QUOTE_NONE=3') -@click.option('--field-types', '-f', default=None, help='json to set explicit types for each field, format {