In [1]:
import lmdb
from tqdm import trange
import numpy as np

In [2]:
env = lmdb.open("lmbd_db", max_dbs=2, map_size=10**9)
out_nodes = env.open_db('out_nodes'.encode())
graph = 'txt_graphs/uk2007.graph-txt'

In [3]:
from tqdm import trange

graph = 'txt_graphs/uk2007.graph-txt'

with env.begin(write=True) as txn:
    with open(graph, 'r')  as f:
        line = f.readline()
        line_tot = int(line.split()[0])
        print("{} lines".format(line_tot))
        print("reading..")
        for i in trange(line_tot):
            line = f.readline()
            key = np.array([i], dtype='uint32')
            if line[0] == '\n':
                # node pointing at itself
                nums = np.array([i], dtype='uint32')
            else:
                nums = np.array([int(i) for i in line.split()], dtype='uint32')
            txn.put(key.tobytes(), nums.tobytes(), db=out_nodes)


  1%|          | 1167/100000 [00:00<00:08, 11666.31it/s]

100000 lines
reading..


100%|██████████| 100000/100000 [00:02<00:00, 36500.95it/s]


In [4]:
env.stat()

{'psize': 4096,
 'depth': 1,
 'branch_pages': 0,
 'leaf_pages': 1,
 'overflow_pages': 0,
 'entries': 2}

In [5]:
with env.begin(write=False) as txn:
    cursor = txn.cursor(out_nodes)
    i = 0
    for key, value in cursor:
        # print(key, value)
        print((np.frombuffer(key, dtype='uint32'),
               np.frombuffer(value, dtype='uint32')))
        i += 1
        if i > 1:
            break

(array([0], dtype=uint32), array([  1,   4,   5,   6,   7,   8,  13,  38,  48, 154], dtype=uint32))
(array([65536], dtype=uint32), array([17733, 55637, 55638, 55639, 55640, 55641, 55642, 65517, 65518,
       65519, 65532, 65533, 65534, 65535, 65537, 65541, 65542, 65543,
       65544, 65545, 65546, 65547, 65548, 65549, 65588, 65649, 65683],
      dtype=uint32))


In [6]:
%load_ext line_profiler

In [95]:
%load_ext Cython

In [106]:
%%cython -a

import numpy as np
import lmdb

env = lmdb.open("lmbd_db", max_dbs=2, map_size=10**9)
out_nodes = env.open_db('out_nodes'.encode())

def random_walk_(start, length, env):
    key = np.array([start], dtype='uint32').tobytes()
    walk = [0] * length
    with env.begin(write=False) as txn:
        cursor = txn.cursor(out_nodes)
        for i in range(length):
            walk[i] = key
            values = cursor.get(key)
            up = len(values) // 4
            ind = np.random.randint(up) * 4
            key = values[ind:ind+4]
    return np.frombuffer(np.array(walk), dtype='uint32')

In [98]:
print(random_walk_(65536, 10))

[65536 65543 65542 55640 55641 28458 28468 28463 28477 28477]


In [93]:
%lprun -f random_walk_ random_walk_(65536, 1000)

## Graph_db

In [33]:
import io

In [34]:
env2 = lmdb.open("graph_db", map_size=10**9)

with env2.begin(write=True) as txn:
    with open(graph, 'r') as f:
        line = f.readline()
        line_tot = int(line.split()[0])
        for i in trange(line_tot):
            line = f.readline()
            if line[0] == '\n':
                line = str(i)
            txn.put(str(i).encode('ascii'), line.rstrip().encode('ascii'))

100%|██████████| 100000/100000 [00:00<00:00, 214604.43it/s]


In [35]:
with env2.begin(write=False, buffers=True) as txn:
    cursor = txn.cursor()
    query = cursor.get('0'.encode('ascii'))
    print(query)

<memory at 0x7f2ae93b9888>


In [102]:
%%cython -a


def build(buf, el, ret):
    if buf[el] == 32:
        el += 1
        ret = str(buf[el])
    left = el
    while left > 0:
        left -= 1
        if buf[left] == 32:
            break
        else:
            ret = str(buf[left]) + ret
    right = el
    while right < len(buf) - 1:
        right += 1
        if buf[right] == 32:
            break
        else:
            ret += str(buf[right])
    return ret

In [37]:
for i in range(len(query)):
    print(build(query, i, str(query[i])))

49
52
52
53
53
54
54
55
55
56
56
4951
4951
4951
5156
5156
5156
5256
5256
5256
495352
495352
495352
495352


In [38]:
def random_sample(buf):
    el = np.random.randint(len(buf))
    return build(buf, el, str(buf[el]))

In [39]:
a = random_sample(query)

In [75]:
import numpy as np

def random_walk(start, walk_len=10):
    walk = [0] * walk_len
    with env2.begin(write=False) as txn:
        cursor = txn.cursor()
        for i in range(walk_len):
            walk[i] = start
            query = cursor.get(start)
            temp = bytes(query)
            temp = temp.split()
            start = temp[np.random.randint(len(temp))]
    return np.fromiter(map(int, walk), dtype=np.int)

In [76]:
random_walk(b'1')

array([  1,  38, 134,  42,  46,  45,  48,  46,  50,  39])

In [86]:
%lprun -f random_walk random_walk(b'65536', 1000)

In [None]:
def random_walk2(start, walk_len=10):
    walk = [0] * walk_len
    with env.begin(write=False) as txn:
        cursor = txn.cursor()

## Accessing the file

In [12]:
import linecache

In [28]:
def rw(node, file, walk_length):
    walk = [0] * walk_length
    for i in range(walk_length):
        walk[i] = node
        line = linecache.getline(file, node + 2)
        temp = line.split()
        node = int(temp[np.random.randint(len(temp))])
    return walk

In [29]:
rw(0, graph, 10)

[0, 154, 152, 46, 51, 79, 274, 38, 50, 201]

In [31]:
%lprun -f rw rw(0, graph, 1000)