In [39]:
import csv
import json

from nltk import edit_distance
from pymongo import MongoClient

In [40]:
def parse_header(path):
    '''
    Parse the header CSV at ``path`` into a list of dictionaries.

    Each non-empty row is read positionally: column 0 is the company name,
    column 1 the image id, and columns 2-5 are stored as 'x1', 'x2', 'y1'
    and 'y2' in that order.

    The file is read with the csv module so that quoted company names
    containing commas stay in a single field; blank lines are skipped.

    Returns a list of dicts with keys 'company' (str), 'image_id' (int),
    and 'x1', 'x2', 'y1', 'y2' (float).
    Raises ValueError / IndexError if a non-empty row is malformed.
    '''
    res = []
    # newline='' is what the csv module expects for files it reads itself.
    with open(path, newline='') as file:
        for row in csv.reader(file):
            # Skip empty or whitespace-only lines (e.g. a trailing newline).
            if not any(field.strip() for field in row):
                continue
            res.append({
                # csv.reader already removes enclosing quotes; the extra
                # strip keeps the old handling of stray quote characters.
                'company': row[0].strip('"'),
                'image_id': int(row[1]),
                'x1': float(row[2]),
                'x2': float(row[3]),
                'y1': float(row[4]),
                'y2': float(row[5])
            })
    return res

In [41]:
# Parse the gold-standard header file into a list of per-row dicts.
gold_standard_path = "GoldStandard1930s.20200923.csv"
headers = parse_header(gold_standard_path)

In [50]:
# Alternative input kept for reference (currently disabled):
# with open('chace_output.json', 'r') as json_file:
#     chace_data = json.load(json_file)

# Load the extracted records that the gold-standard headers are scored against.
with open('1930.json', 'r') as json_file:
    chace_data = json.load(json_file)

In [51]:
# Show the first header's image id with a dash between its first eight
# and last four characters.
first_id = str(headers[0]['image_id'])
print('{}-{}'.format(first_id[:8], first_id[-4:]))

19300006-0001


In [52]:
# Lookup table: record '_id' with the dashes removed -> list of company
# names with surrounding whitespace stripped.
data = {
    record['_id'].replace('-', ''): [name.strip() for name in record['companies']]
    for record in chace_data
}

In [53]:
# Sizes of the lookup table, the raw records and the gold-standard headers.
for collection in (data, chace_data, headers):
    print(len(collection))

3635
3635
52640


In [54]:
# Per-year tallies for 1930 through 1939; each year gets its own
# {'correct', 'total'} counter dict, both starting at zero.
years = {
    year: {'correct': 0, 'total': 0}
    for year in range(1930, 1940)
}

In [55]:
# Score every gold-standard header against the extracted company lists.
#
# A header counts as correct when its company name appears verbatim in the
# list stored under the same image id, or when some listed name is within an
# edit distance of less than MAX_EDIT_RATIO times the header name's length.
# Headers whose image id has no (or an empty) company list are tallied in
# `error`. Every header that is not matched is recorded in `missed` as
# [image id, header company, candidate names joined by two spaces or 'None'].
MAX_EDIT_RATIO = 0.25

# Reset the per-year tallies so that re-running this cell starts from zero
# instead of accumulating on top of a previous run.
for tally in years.values():
    tally['correct'] = 0
    tally['total'] = 0

missed = []
count = 0
error = 0
for h in headers:
    id_str = str(h['image_id'])
    company = h['company']
    # The first four characters of the image id are used as the year key.
    year = int(id_str[:4])
    years[year]['total'] += 1
    companies = data.get(id_str)

    if not companies:
        # No candidates to compare against for this image id.
        error += 1
        missed.append([id_str, company, 'None'])
        continue

    max_distance = MAX_EDIT_RATIO * len(company)
    # Exact membership is checked first so the edit distance is only
    # computed when it is actually needed.
    found = company in companies or any(
        edit_distance(company, candidate) < max_distance
        for candidate in companies
    )
    if found:
        count += 1
        years[year]['correct'] += 1
    else:
        missed.append([id_str, company, '  '.join(companies)])
print(count)
print(error)

3385
47680


In [56]:
# Fraction of all gold-standard headers that were matched.
overall_accuracy = count / len(headers)
print(overall_accuracy)

0.0643047112462006


In [57]:
# Matched fraction for each year, printed as "<year>: <ratio>".
for year_key, tally in years.items():
    ratio = tally['correct'] / tally['total']
    print('{}: {}'.format(year_key, ratio))

1930: 0.5626662234042553
1931: 0.0
1932: 0.0
1933: 0.0
1934: 0.0
1935: 0.0
1936: 0.0
1937: 0.0
1938: 0.0
1939: 0.0


In [27]:
# Preview the first 100 unmatched headers.
missed_preview = missed[:100]
print(missed_preview)

[['193000060001', 'AMERICAN WINDOW GLASS CO', 'None'], ['193000060004', 'BARTLETT CO', 'None'], ['193000060004', 'PITTSBURGH STEEL CO', 'None'], ['193000060006', 'SHUBERT THEATRE CORP', 'None'], ['193000060007', 'SOUTHERN ICE AND UTILITIES CO', 'None'], ['193000060009', 'A G SPALDING BROS', 'None'], ['193000060011', 'ACME STEEL CO', 'None'], ['193000060012', 'AMERICAN BOSCH MAGNETO CORP', 'None'], ['193000060013', 'AMERICAN BROWN BOVERI ELECTRIC CORP', 'CHICOPEE REALTY CORP'], ['193000060015', 'AMERICAN CHICLE CO', 'None'], ['193000060016', 'AMERICAN HIDE LEATHER CO', 'None'], ['193000060017', 'AMERICAN LA FRANCE FOAMITE CORP', 'None'], ['193000060018', 'THE AMERICAN LAUNDRY MACHINERY CO', 'None'], ['193000060019', 'AMERICAN LOCOMOTIVE CO', 'None'], ['193000060021', 'MCINTOSH SEYMOUR CORP', 'THE AMERICAN METAL COMPANY LTD'], ['193000060023', 'SAN TOY MINING CO', 'None'], ['193000060024', 'THE AMERICAN REPUBLICS CORP', 'None'], ['193000060025', 'PENNSYLVANIA CAR CO', 'None'], ['19300006

In [13]:
# Write the unmatched headers to disk, one row per entry.
# csv.writer quotes any field that contains a comma or quote character, so
# such company names no longer break the column layout; lineterminator='\n'
# keeps the line endings that were written before.
with open('missed_headers.csv', 'w', newline='') as file:
    writer = csv.writer(file, lineterminator='\n')
    writer.writerows(missed)