In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last one;
# later cells list several bare expressions and rely on all of them being displayed.
InteractiveShell.ast_node_interactivity = "all"

# Gingko

### In this example we use Wikipedia data generated by following the instructions in the README

In [2]:
from gingko.dataframes import get_wiki_urls

In [3]:
# Hierarchy levels used as the tensor's index dimensions, ordered outermost to innermost.
IDXS_COLS = ['section', 'subsection', 'paragraph', 'word']
# Maps an integer article id to the path of that article's CSV under data/wiki.
wiki_urls = get_wiki_urls('data/wiki')
wiki_urls

{0: 'data/wiki/Gnosis.csv',
 1: 'data/wiki/Bogomil.csv',
 2: 'data/wiki/Gnostic.csv',
 3: 'data/wiki/Catholic_Church.csv'}

## 1) Pre-processing

### Choose our favorite tokenizer to apply to the data

In [4]:
from transformers import AutoConfig, AutoModelForMaskedLM, AutoTokenizer

In [5]:
# Pretrained uncased BERT tokenizer; `use_fast=True` requests the fast implementation
# and `cache_dir=None` is passed through unchanged.
tokenizer = AutoTokenizer.from_pretrained(
    'bert-base-uncased',
    cache_dir=None,
    use_fast=True,
)

### Load a single article to pandas

In [6]:
from gingko.dataframes import load_wiki_df, format_wiki_df

In [7]:
# Load article 0 and tokenize it with `tokenizer.tokenize`, so the `data` column
# holds string tokens; one row per token, indexed by the IDXS_COLS columns.
wiki_df = format_wiki_df(
    load_wiki_df(wiki_urls[0]),
    tokenizer.tokenize,
    idxs_cols=IDXS_COLS,
)
wiki_df.head(10)

Unnamed: 0,word,paragraph,subsection,section,data
0,0,0,1,0,g
1,1,0,1,0,##nosis
2,2,1,1,0,g
3,3,1,1,0,##nosis
4,4,1,1,0,is
5,5,1,1,0,the
6,6,1,1,0,common
7,7,1,1,0,greek
8,8,1,1,0,noun
9,9,1,1,0,for


## 2) Single document

### Load a single article to Gingko

In [8]:
from gingko import Gingko

In [9]:
# Build the tree from COO-style inputs: the index columns are transposed so that
# each row of `indices` is one hierarchy level and each column is one token.
tree = Gingko(
    indices=wiki_df[IDXS_COLS].values.T,
    values=wiki_df['data'].tolist(),
)

### Gingko is fundamentally a COO-format n-dimensional sparse tensor

In [10]:
# Overall extent of the tensor along each hierarchy level.
tree.shape
# Coordinates of the first ten stored entries (one row per level).
tree.indices[:, :10]
# The tokens stored at those ten coordinates.
tree.values[:10]

(6, 4, 8, 207)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])

['g', '##nosis', 'g', '##nosis', 'is', 'the', 'common', 'greek', 'noun', 'for']

### Gingko has hierarchical index pointers that make it sliceable

In [11]:
# One pointer array per hierarchy level above the innermost one (three arrays for four levels).
# NOTE(review): the values look like CSR-style cumulative offsets into the next level — confirm.
tree.ptrs

[array([ 0,  1,  2,  6,  8,  9, 10]),
 array([ 0,  2, 10, 15, 17, 20, 23, 26, 28, 30, 31]),
 array([   0,    2,   83,   84,  184,  323,  328,  342,  376,  469,  540,
         545,  548,  587,  627,  650,  652,  676,  683,  776,  821,  825,
         989, 1182, 1183, 1185, 1300, 1302, 1509, 1512, 1672, 1674])]

### Now it's easy to slice the first 20 words of paragraph 2 in subsection 0 of section 3 (all indices zero-based)

In [12]:
# Section index 3, subsection index 0, paragraph index 2, first 20 tokens.
tree[3:4, :1, 2:3, :20]

Gingko(indices=array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0],
       [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19]]), values=['knowledge', '(', 'or', 'g', '##nosis', ')', 'in', 'sufi', '##sm', 'refers', 'to', 'knowledge', 'of', 'self', 'and', 'god', '.', 'the', 'g', '##nostic'], ptrs=[array([0, 1]), array([0, 1]), array([ 0, 20])], check=False)

### The main function of a sparse tensor is to save space

In [13]:
import torch
import numpy as np

In [14]:
from gingko.utils import load_pickle, save_pickle

In [15]:
# Reload article 0, this time with `tokenizer.encode`, so the values can be
# placed in a numeric tensor in the size comparison below.
wiki_df = format_wiki_df(
    load_wiki_df(wiki_urls[0]),
    tokenizer.encode,
    idxs_cols=IDXS_COLS,
)

In [16]:
# Rebuild the tree from the re-encoded frame (same layout: levels x tokens).
tree = Gingko(
    indices=wiki_df[IDXS_COLS].values.T,
    values=wiki_df['data'].tolist(),
)

In [17]:
# Take one full slice of the tree and keep only its sparse payload.
full_tree = tree[:]
data = {'indices': full_tree.indices, 'values': full_tree.values}

# Dense baseline: materialise every cell of the tensor and save it with numpy.
dense_tensor = torch.sparse_coo_tensor(**data).to_dense()
np.save('test.npy', dense_tensor.numpy())

# Sparse version: persist just the indices/values pair.
save_pickle(data, 'test.pkl')

### Saving a Gingko tree takes roughly 5x less space than saving a dense numpy array

In [18]:
import os
# Size in bytes of the dense array written with np.save.
os.path.getsize('test.npy')
# Size in bytes of the file written by save_pickle for the indices/values dict.
os.path.getsize('test.pkl')

321152

60898

### ...and Gingko is fast

In [25]:
%%timeit

# Time a slice: first two sections, every subsection and paragraph, first word only.
tree[:2,:,:,:1]

131 µs ± 2.37 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [20]:
%%timeit

# Time construction of a tree from the already-loaded DataFrame;
# the result is assigned to `_` because only the timing matters here.
_ = Gingko(
    indices=wiki_df[IDXS_COLS].values.T,
    values=wiki_df.data.tolist()
)

622 µs ± 1.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## 3) Multi-document corpus

### The hierarchical format of Gingko generalizes naturally to a multi-document corpus on different topics

In [21]:
# Import from the installed `gingko` package, matching the other
# `gingko.dataframes` helpers imported earlier in this notebook.
from gingko.dataframes import load_wiki_corpus

In [22]:
# Re-display the article id -> CSV path mapping; the ids become the leading corpus index.
wiki_urls

{0: 'data/wiki/Gnosis.csv',
 1: 'data/wiki/Bogomil.csv',
 2: 'data/wiki/Gnostic.csv',
 3: 'data/wiki/Catholic_Church.csv'}

### Since different topics are labeled by the zeroth index, now you can easily build a topic classifier

In [23]:
# Load every article in `wiki_urls` as keyword arguments for Gingko, then build
# a single tree whose leading dimension is the article.
corpus_kwargs = load_wiki_corpus(wiki_urls, tokenizer.encode, IDXS_COLS)
multitree = Gingko(**corpus_kwargs)
multitree.shape

(4, 15, 9, 32, 389)

### Now it's easy to slice the first 20 words of paragraph 2 in subsection 0 of section 3 of article 1 (all indices zero-based)

In [24]:
# Article index 1, section index 3, subsection index 0, paragraph index 2, first 20 tokens.
multitree[1, 3:4, :1, 2:3, :20]

Gingko(indices=array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0],
       [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19]]), values=[101, 2028, 1997, 1996, 5700, 3017, 7037, 2923, 17831, 2015, 1010, 7871, 3258, 2964, 1010, 7940, 1999, 10110, 1006, 1999], ptrs=[array([0, 1]), array([0, 1]), array([0, 1]), array([ 0, 20])], check=False)

### Great! Now we can easily build a badass PyTorch model!