In [1]:
import pandas as pd
from pathlib import Path
import json

In [2]:
# load data, set language field using file name
data = []
for p in Path('../data/third-party').iterdir():
    lang = p.name.split("-")[0]
    with open(p, 'r') as file:
        for line in file:
            line_data = json.loads(line)
            line_data['language'] = lang
            data.append(line_data)

# remove unneeded quotes from some fields
for record in data:
    record['casenumber'] = record['casenumber'].strip("\"")
    for result in record['results']:
        result['word'] = result['word'].strip("\"")

data[:2]

[{'casenumber': 'ME2021-10816',
  'results': [{'distance': 0,
    'level': 'primary_combined',
    'metric': 'Normalized Levenshtein',
    'time': 4.625e-06,
    'word': ''}],
  'language': 'go'},
 {'casenumber': 'ME2021-10815',
  'results': [{'distance': 0,
    'level': 'primary_combined',
    'metric': 'Normalized Levenshtein',
    'time': 9.17e-07,
    'word': ''}],
  'language': 'go'}]

In [3]:
table = []
for case in data:
    for result in case['results']:
        result['casenumber'] = case['casenumber']
        result['language'] = case['language']
        table.append(result)

table[:2]

[{'distance': 0,
  'level': 'primary_combined',
  'metric': 'Normalized Levenshtein',
  'time': 4.625e-06,
  'word': '',
  'casenumber': 'ME2021-10816',
  'language': 'go'},
 {'distance': 0,
  'level': 'primary_combined',
  'metric': 'Normalized Levenshtein',
  'time': 9.17e-07,
  'word': '',
  'casenumber': 'ME2021-10815',
  'language': 'go'}]

In [4]:
df = pd.DataFrame(table)
df.distance = df.distance.astype(float)
df.time = df.time.astype(float)
df.shape

(2381108, 7)

In [5]:
df[df.distance > 0].shape

(409783, 7)

In [6]:
# 17% of records have distance to heroin greater than 0
df[df.distance > 0].shape[0] / df.shape[0]

0.17209761170009927

In [7]:
# python > go > rust in capturing records somehow
# shouldn't we all have X number of words and thus X matches?
df.language.value_counts()

python    819566
go        781684
rust      779858
Name: language, dtype: int64

In [18]:
# rust takes longest
# python has higher average distance?
# jaro generally faster than levenshtein
groups = df.groupby(['language', 'metric', 'level']).aggregate(["count", "mean", "sum"])
groups

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,distance,distance,distance,time,time,time
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,sum,count,mean,sum
language,metric,level,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
go,Jaro-Winkler,primary_combined,277034,0.0,0.0,277034,4.105655e-07,0.113741
go,Jaro-Winkler,secondarycause,113808,0.0,0.0,113808,4.133425e-07,0.047042
go,Normalized Levenshtein,primary_combined,277034,0.0,0.0,277034,5.728318e-07,0.158694
go,Normalized Levenshtein,secondarycause,113808,0.0,0.0,113808,5.883698e-07,0.066961
python,Jaro-Winkler,primary_combined,296117,0.0,0.0,296117,4.836375e-06,1.432133
python,Jaro-Winkler,secondarycause,113666,0.0,0.0,113666,5.051103e-06,0.574139
python,Normalized Levenshtein,primary_combined,296117,1.0,296117.0,296117,1.196198e-05,3.542146
python,Normalized Levenshtein,secondarycause,113666,1.0,113666.0,113666,1.304353e-05,1.482606
rust,Jaro-Winkler,primary_combined,276121,0.0,0.0,276121,4.495894e-07,0.124141
rust,Jaro-Winkler,secondarycause,113808,0.0,0.0,113808,4.695087e-07,0.053434


In [15]:
(groups['time']['mean'] * groups['time']['count']).sort_values()

language  metric                  level           
go        Jaro-Winkler            secondarycause      0.047042
rust      Jaro-Winkler            secondarycause      0.053434
go        Normalized Levenshtein  secondarycause      0.066961
          Jaro-Winkler            primary_combined    0.113741
rust      Jaro-Winkler            primary_combined    0.124141
          Normalized Levenshtein  secondarycause      0.151870
go        Normalized Levenshtein  primary_combined    0.158694
rust      Normalized Levenshtein  primary_combined    0.359507
python    Jaro-Winkler            secondarycause      0.574139
                                  primary_combined    1.432133
          Normalized Levenshtein  secondarycause      1.482606
                                  primary_combined    3.542146
dtype: float64

In [11]:
df.distance.describe().round(3)

count    1437111.000
mean           0.285
std            0.451
min            0.000
25%            0.000
50%            0.000
75%            1.000
max            1.000
Name: distance, dtype: float64

In [16]:
df.time.describe().round(3)

count    1437111.000
mean           0.000
std            0.000
min            0.000
25%            0.000
50%            0.000
75%            0.000
max            0.017
Name: time, dtype: float64