In [1]:
import pandas as pd
from pathlib import Path
import json

In [2]:
# Load every JSON-lines result file and tag each record with its language.
# The language is the part of the file name before the first "-".
data = []
# sorted() makes the record order reproducible; iterdir() yields entries in
# arbitrary, filesystem-dependent order.
for p in sorted(Path('../data/third-party').iterdir()):
    # Skip sub-directories and hidden entries (e.g. .DS_Store,
    # .ipynb_checkpoints), which would otherwise crash open()/json.loads().
    if not p.is_file() or p.name.startswith('.'):
        continue
    lang = p.name.split("-")[0]
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(p, 'r', encoding='utf-8') as file:
        for line in file:
            # Tolerate blank lines (e.g. a trailing empty line at end of file).
            if not line.strip():
                continue
            line_data = json.loads(line)
            line_data['language'] = lang
            data.append(line_data)

# Trim stray double-quote characters from both ends of each case number and
# of every result's word. The records in `data` are updated in place.
for entry in data:
    entry['casenumber'] = entry['casenumber'].strip('"')
    for hit in entry['results']:
        hit['word'] = hit['word'].strip('"')

# Preview the first two cleaned records.
data[:2]

[{'casenumber': 'ME2021-10816',
  'results': [{'distance': 0,
    'level': 'primary_combined',
    'metric': 'Normalized Levenshtein',
    'time': 4.625e-06,
    'word': ''}],
  'language': 'go'},
 {'casenumber': 'ME2021-10815',
  'results': [{'distance': 0,
    'level': 'primary_combined',
    'metric': 'Normalized Levenshtein',
    'time': 9.17e-07,
    'word': ''}],
  'language': 'go'}]

In [3]:
# Flatten to one row per (case, result) pair, ready for pd.DataFrame.
# Each row is a NEW dict: the result's own fields followed by the parent
# case's casenumber and language. Building copies, rather than adding keys to
# the dicts nested inside `data`, leaves `data` exactly as the loading cell
# displayed it and keeps this cell free of side effects on re-run.
table = [
    {**result, 'casenumber': case['casenumber'], 'language': case['language']}
    for case in data
    for result in case['results']
]

table[:2]

[{'distance': 0,
  'level': 'primary_combined',
  'metric': 'Normalized Levenshtein',
  'time': 4.625e-06,
  'word': '',
  'casenumber': 'ME2021-10816',
  'language': 'go'},
 {'distance': 0,
  'level': 'primary_combined',
  'metric': 'Normalized Levenshtein',
  'time': 9.17e-07,
  'word': '',
  'casenumber': 'ME2021-10815',
  'language': 'go'}]

In [4]:
# Build the analysis frame from the flattened rows and cast both numeric
# columns to float in a single astype() call.
df = pd.DataFrame(table).astype({'distance': float, 'time': float})
df.shape

(2381108, 7)

In [5]:
# Dimensions of the subset of rows whose distance is strictly positive.
df.loc[df['distance'] > 0].shape

(409783, 7)

In [6]:
# Fraction of all rows with a distance to heroin above 0 (about 17%).
len(df.loc[df['distance'] > 0]) / len(df)

0.17209761170009927

In [7]:
# Row counts per implementation: python has the most, then go, then rust.
# Open question: shouldn't every implementation see the same number of words
# and therefore produce the same number of matches?
df['language'].value_counts()

python    819566
go        781684
rust      779858
Name: language, dtype: int64

In [10]:
# rust takes longest
# python has higher average distance?
# jaro generally faster than levenshtein
#
# Average only the numeric columns. The frame also holds string columns
# (word, casenumber), and since pandas 2.0 GroupBy.mean() raises TypeError on
# non-numeric columns instead of silently dropping them.
df.groupby(['language', 'metric', 'level'])[['distance', 'time']].mean().round(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,distance,time
language,metric,level,Unnamed: 3_level_1,Unnamed: 4_level_1
go,Jaro-Winkler,primary_combined,0.0,0.0
go,Jaro-Winkler,secondarycause,0.0,0.0
go,Normalized Levenshtein,primary_combined,0.0,0.0
go,Normalized Levenshtein,secondarycause,0.0,0.0
python,Jaro-Winkler,primary_combined,0.0,0.0
python,Jaro-Winkler,secondarycause,0.0,1e-05
python,Normalized Levenshtein,primary_combined,1.0,1e-05
python,Normalized Levenshtein,secondarycause,1.0,1e-05
rust,Jaro-Winkler,primary_combined,0.0,0.0
rust,Jaro-Winkler,secondarycause,0.0,0.0


In [11]:
# Summary statistics for the distance column, rounded to 3 decimals.
df['distance'].describe().round(3)

count    1437111.000
mean           0.285
std            0.451
min            0.000
25%            0.000
50%            0.000
75%            1.000
max            1.000
Name: distance, dtype: float64

In [16]:
# Summary statistics for the time column, rounded to 3 decimals.
# NOTE(review): at this precision every statistic except max displays as
# 0.000; consider rescaling the column before summarising.
df['time'].describe().round(3)

count    1437111.000
mean           0.000
std            0.000
min            0.000
25%            0.000
50%            0.000
75%            0.000
max            0.017
Name: time, dtype: float64