In [None]:
import dask.bag as db
import json
import os
fldr_in = "data_bag/json/"     
fldr_out = "data_bag/csv"

In [None]:
b = db.read_text(os.path.join(fldr_in, '*.json')).map(json.loads)
record = b.take(4)[0]

In [None]:
record

In [4]:
import pandas as pd

In [5]:
df = pd.json_normalize(record)

In [6]:
df

Unnamed: 0,age,name,occupation,telephone,address.address,address.city,credit-card.number,credit-card.expiration-date
0,52,"[Zulema, Walters]",Pools Collector,949-313-2340,1159 Crestwell Pike,Providence,3485 248758 62500,04/22


In [8]:
df[["name0", "name1"]] = pd.DataFrame(df['name'].tolist())
df = df.drop(columns=["name"])

In [9]:
df

Unnamed: 0,age,occupation,telephone,address.address,address.city,credit-card.number,credit-card.expiration-date,name0,name1
0,52,Pools Collector,949-313-2340,1159 Crestwell Pike,Providence,3485 248758 62500,04/22,Zulema,Walters


In [19]:
def flatten(record):
    df = pd.json_normalize(record)
    df[["name0", "name1"]] = pd.DataFrame(df['name'].tolist())
    df = df.drop(columns=["name"])
    return df

In [21]:
out = db.read_text(f'{fldr}/*.json')\
        .map(json.loads)\
        .map_partitions(lambda part: pd.concat([pd.json_normalize(x) for x in part]))

In [76]:
db.read_text(f'{fldr}/0.json')\
  .map(json.loads)\
  .map(flatten).starmap(pd.concat)

dask.bag<concat, npartitions=1>

In [103]:
fldr_in = "data_bag/json"
fldr_out = "data_bag/csv"

In [96]:
data = []
for line in open(f'{fldr}/0.json', 'r'):
    record = json.loads(line)
    data.append(flatten(record))
data = pd.concat(data)

In [14]:
fns = os.listdir(fldr_in)

In [22]:
def convert_json(fn, fldr_in, fldr_out):
    os.makedirs(fldr_out, exist_ok=True)
    data = []
    for line in open(os.path.join(fldr_in, fn), 'r'):
        record = json.loads(line)
        data.append(flatten(record))
    data = pd.concat(data)
    data.to_csv(os.path.join(fldr_out, fn), index=False)

In [108]:
%%time
for fn in fns:
    convert_json(fn, fldr_in, fldr_out)

CPU times: user 42.3 s, sys: 35.3 ms, total: 42.4 s
Wall time: 42.4 s


In [109]:
from dask import delayed, compute

In [111]:
from dask.distributed import Client, LocalCluster
from dask import delayed, compute

cluster = LocalCluster(n_workers=4)
client = Client(cluster)

In [112]:
client

0,1
Client  Scheduler: tcp://127.0.0.1:39711  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 16.50 GB


In [113]:
%%time
to_compute = [delayed(convert_json)(fn, fldr_in, fldr_out)
              for fn in fns]
out = compute(to_compute)

CPU times: user 2.4 s, sys: 305 ms, total: 2.71 s
Wall time: 27.6 s


In [118]:
%%time
b = db.from_sequence(fns)\
      .map(lambda fn: convert_json(fn, fldr_in, fldr_out))
out = b.compute()

CPU times: user 2.03 s, sys: 239 ms, total: 2.27 s
Wall time: 23.7 s


In [15]:
fn = fns[0]

In [16]:
%%time
df = convert_json_faster(fn, fldr_in, fldr_out)

CPU times: user 1.53 s, sys: 0 ns, total: 1.53 s
Wall time: 1.54 s


In [55]:
def convert_json_faster(fn, fldr_in, fldr_out):
    os.makedirs(fldr_out, exist_ok=True)
    data = []
    for line in open(os.path.join(fldr_in, fn), 'r'):
        record = json.loads(line)
        data.append(pd.json_normalize(record))
    data = pd.concat(data)
    data[["name0", "name1"]] = pd.DataFrame(data['name'].tolist())
    data = data.drop(columns=["name"])
    data.to_csv(os.path.join(fldr_out, fn), index=False)

In [26]:
%%time
df = convert_json_faster1(fn, fldr_in, fldr_out)

CPU times: user 1.49 s, sys: 0 ns, total: 1.49 s
Wall time: 1.49 s


In [29]:
%%time
df = convert_json_faster1(fn, fldr_in, fldr_out)

CPU times: user 1.55 s, sys: 3.23 ms, total: 1.55 s
Wall time: 1.55 s


In [32]:
%%time
b = db.from_sequence(fns)\
      .map(lambda fn: convert_json(fn, fldr_in, fldr_out))
out = b.compute()

CPU times: user 53.4 ms, sys: 52.5 ms, total: 106 ms
Wall time: 23 s


In [34]:
%%time
b = db.from_sequence(fns)\
      .map(lambda fn: convert_json_faster2(fn, fldr_in, fldr_out))
out = b.compute()

CPU times: user 47.2 ms, sys: 37.1 ms, total: 84.3 ms
Wall time: 9.04 s


In [39]:
%load_ext line_profiler

In [44]:
%lprun -f convert_json convert_json(fn, fldr_in, fldr_out)

Timer unit: 1e-06 s

Total time: 9.47534 s
File: <ipython-input-22-ee5768c59c9d>
Function: convert_json at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def convert_json(fn, fldr_in, fldr_out):
     2         1        406.0    406.0      0.0      os.makedirs(fldr_out, exist_ok=True)
     3         1          3.0      3.0      0.0      data = []
     4      1001       5113.0      5.1      0.1      for line in open(os.path.join(fldr_in, fn), 'r'):
     5      1000      39711.0     39.7      0.4          record = json.loads(line)
     6      1000    9130633.0   9130.6     96.4          data.append(flatten(record))
     7         1     292755.0 292755.0      3.1      data = pd.concat(data)
     8         1       6716.0   6716.0      0.1      data.to_csv(os.path.join(fldr_out, fn), index=False)

1111.111111111111

In [46]:
%lprun -f flatten flatten(record)

Timer unit: 1e-06 s

Total time: 0.017553 s
File: <ipython-input-19-99731ee5e4b5>
Function: flatten at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def flatten(record):
     2         1       6474.0   6474.0     36.9      df = pd.json_normalize(record)
     3         1       6987.0   6987.0     39.8      df[["name0", "name1"]] = pd.DataFrame(df['name'].tolist())
     4         1       4089.0   4089.0     23.3      df = df.drop(columns=["name"])
     5         1          3.0      3.0      0.0      return df

In [48]:
4*1e6/60**2

1111.111111111111

In [49]:
60**2

3600

In [56]:
%lprun -f convert_json_faster convert_json_faster(fn, fldr_in, fldr_out)

Timer unit: 1e-06 s

Total time: 3.59581 s
File: <ipython-input-55-33fac27e5def>
Function: convert_json_faster at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def convert_json_faster(fn, fldr_in, fldr_out):
     2         1        399.0    399.0      0.0      os.makedirs(fldr_out, exist_ok=True)
     3         1          1.0      1.0      0.0      data = []
     4      1001       4342.0      4.3      0.1      for line in open(os.path.join(fldr_in, fn), 'r'):
     5      1000      37075.0     37.1      1.0          record = json.loads(line)
     6      1000    3257240.0   3257.2     90.6          data.append(pd.json_normalize(record))
     7         1     280729.0 280729.0      7.8      data = pd.concat(data)
     8         1       7015.0   7015.0      0.2      data[["name0", "name1"]] = pd.DataFrame(data['name'].tolist())
     9         1       2684.0   2684.0      0.1      data = data.drop(columns=["name"])
   

In [69]:
9.5/3.6

2.638888888888889

In [61]:
400*1e-6*1_0000_000

3999.9999999999995

In [63]:
400*1e-6

0.00039999999999999996

In [65]:
400*1e-6*1_0000_000/60**2

1.111111111111111

In [72]:
%timeit -n 10 1+1

47.8 ns ± 17 ns per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [73]:
%timeit -n 10 -r 3 convert_json(fn, fldr_in, fldr_out)
%timeit -n 10 -r 3 convert_json_faster(fn, fldr_in, fldr_out)

4.1 s ± 16.6 ms per loop (mean ± std. dev. of 3 runs, 10 loops each)
1.33 s ± 5.08 ms per loop (mean ± std. dev. of 3 runs, 10 loops each)


In [74]:
4.1/1.33

3.0827067669172927