In [18]:
import pandas as pd
from pathlib import Path
import json

In [30]:
# load data, set language field using file name
data = []
for p in Path('../data').iterdir():
    if p.name != "records.jsonl" and p.name.__contains__("jsonl"):
        lang = p.name.split("-")[0]
        with open(p, 'r') as file:
            for line in file:
                line_data = json.loads(line)
                line_data['language'] = lang
                data.append(line_data)

# remove unneeded quotes from some fields
for record in data:
    record['casenumber'] = record['casenumber'].strip("\"")
    for result in record['results']:
        result['word'] = result['word'].strip("\"")

data[0]

{'casenumber': 'ME2021-10793',
 'results': [{'distance': 0,
   'level': 'primary_combined',
   'metric': 'NormalizedLevenshtein',
   'time': 3e-06,
   'word': 'PNEUMONIA'},
  {'distance': 0,
   'level': 'primary_combined',
   'metric': 'NormalizedLevenshtein',
   'time': 1e-06,
   'word': 'NOVEL'},
  {'distance': 0,
   'level': 'primary_combined',
   'metric': 'NormalizedLevenshtein',
   'time': 1.083e-06,
   'word': 'CORONA'},
  {'distance': 0,
   'level': 'primary_combined',
   'metric': 'NormalizedLevenshtein',
   'time': 1.291e-06,
   'word': 'COVID-19'},
  {'distance': 0,
   'level': 'primary_combined',
   'metric': 'NormalizedLevenshtein',
   'time': 8.75e-07,
   'word': 'VIRAL'},
  {'distance': 0,
   'level': 'primary_combined',
   'metric': 'NormalizedLevenshtein',
   'time': 1.375e-06,
   'word': 'INFECTION'}],
 'language': 'go'}

In [31]:
table = []
for case in data:
    for result in case['results']:
        result['casenumber'] = case['casenumber']
        result['language'] = case['language']
        table.append(result)

table[:2]

[{'distance': 0,
  'level': 'primary_combined',
  'metric': 'NormalizedLevenshtein',
  'time': 3e-06,
  'word': 'PNEUMONIA',
  'casenumber': 'ME2021-10793',
  'language': 'go'},
 {'distance': 0,
  'level': 'primary_combined',
  'metric': 'NormalizedLevenshtein',
  'time': 1e-06,
  'word': 'NOVEL',
  'casenumber': 'ME2021-10793',
  'language': 'go'}]

In [32]:
df = pd.DataFrame(table)
df.distance = df.distance.astype(float)
df.time = df.time.astype(float)
df.shape

(2463054, 7)

In [33]:
df[df.distance > 0].shape

(1231583, 7)

In [34]:
# python > go > rust in capturing records somehow
# shouldn't we all have X number of words and thus X matches?
df.language.value_counts()

go        821018
python    821018
rust      821018
Name: language, dtype: int64

In [35]:
df.metric.value_counts()

NormalizedLevenshtein    1231527
JaroWinkler              1231527
Name: metric, dtype: int64

In [26]:
df.sample(1000).to_csv("output.csv")

In [36]:
df.sample()

Unnamed: 0,distance,level,metric,time,word,casenumber,language
1911503,0.875,primary_combined,NormalizedLevenshtein,1.2e-05,TOXICITY,ME2018-05119,python


In [37]:
df[df.language == 'rust'].sample(1000)

Unnamed: 0,distance,level,metric,time,word,casenumber,language
1187551,0.0,primary_combined,NormalizedLevenshtein,1.660000e-07,INTOXICATION,ME2016-00132,rust
1589794,1.0,primary_combined,JaroWinkler,8.300000e-08,BLUNT,ME2016-01517,rust
850328,0.0,primary_combined,NormalizedLevenshtein,1.250000e-07,FAILURE,ME2021-06046,rust
1334413,1.0,primary_combined,JaroWinkler,8.300000e-08,COVID-19,ME2020-14037,rust
1538637,1.0,primary_combined,JaroWinkler,1.250000e-07,TOXICITY,ME2017-04429,rust
...,...,...,...,...,...,...,...
1235177,1.0,primary_combined,JaroWinkler,2.080000e-07,COVID-19,ME2021-10080,rust
1116390,0.0,primary_combined,NormalizedLevenshtein,1.250000e-07,RUPTURED,ME2018-00404,rust
1329084,1.0,primary_combined,JaroWinkler,8.300000e-08,CORONA,ME2020-14545,rust
1193834,0.0,primary_combined,NormalizedLevenshtein,1.250000e-07,DISEASE,ME2015-04709,rust


In [38]:
# python takes longest
# jaro generally faster than levenshtein
groups = df.groupby(['language', 'metric', 'level'])['time'].agg(['mean', 'count'])
groups

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mean,count
language,metric,level,Unnamed: 3_level_1,Unnamed: 4_level_1
go,JaroWinkler,primary_combined,2.517679e-07,296831
go,JaroWinkler,secondarycause,2.647812e-07,113678
go,NormalizedLevenshtein,primary_combined,3.796005e-07,296831
go,NormalizedLevenshtein,secondarycause,4.173001e-07,113678
python,JaroWinkler,primary_combined,4.539956e-06,296831
python,JaroWinkler,secondarycause,4.766962e-06,113678
python,NormalizedLevenshtein,primary_combined,1.139496e-05,296831
python,NormalizedLevenshtein,secondarycause,1.252671e-05,113678
rust,JaroWinkler,primary_combined,9.870224e-08,296831
rust,JaroWinkler,secondarycause,1.07783e-07,113678


In [11]:
df.distance.describe().round(3)

count    1437111.000
mean           0.285
std            0.451
min            0.000
25%            0.000
50%            0.000
75%            1.000
max            1.000
Name: distance, dtype: float64

In [16]:
df.time.describe().round(3)

count    1437111.000
mean           0.000
std            0.000
min            0.000
25%            0.000
50%            0.000
75%            0.000
max            0.017
Name: time, dtype: float64