forked from proycon/valkuil
-
Notifications
You must be signed in to change notification settings - Fork 0
/
valkuilharvestscore.py
executable file
·98 lines (82 loc) · 2.64 KB
/
valkuilharvestscore.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#! /usr/bin/env python
# -*- coding: utf8 -*-
import sys
import codecs
from collections import defaultdict
# Summarise a harvest log: count log lines per "mode", overall and per class,
# and print the ratio of accepted corrections to all accepted + discarded ones.
#
# Usage: valkuilharvestscore.py LOGFILE [BEGINDATE | -TAILLINES]
#   BEGINDATE   only lines whose first field compares >= this string are used
#               (only 15-field lines carry that leading timestamp field)
#   -TAILLINES  a negative integer N: only the last |N| lines are used


def _log(message):
    """Write one progress/diagnostic line to standard error."""
    sys.stderr.write(message + "\n")


def _parse_threshold(argv):
    """Interpret the optional second command-line argument.

    Returns a ``(begindate, taillines)`` pair.  A negative integer selects the
    last N lines of the log (``taillines`` is that negative number); any other
    value is kept verbatim as the date threshold string.
    """
    if len(argv) < 3:
        return None, 0
    raw = argv[2]
    try:
        number = int(raw)
    except ValueError:
        # Not an integer, so it can only be meant as a date threshold; it must
        # not be silently dropped.
        return raw, 0
    if number < 0:
        return None, number
    return raw, 0


def _count_lines(path):
    """Count the lines of *path* with the same reader the tally pass uses.

    Using the identical decoder keeps the computed tail offset consistent with
    the line indices produced while processing.
    """
    with codecs.open(path, 'r', 'utf-8') as handle:
        return sum(1 for _ in handle)


def _bump(table, key, mode):
    """Increment the *mode* counter stored under *key*, creating it on demand."""
    if key not in table:
        table[key] = defaultdict(int)
    table[key][mode] += 1


def _tally(lines, begindate, beginline):
    """Count modes over *lines*; return ``(total, byannotator, byclass)``.

    *lines* is any iterable of text lines.  Lines are split on single spaces
    and must have exactly 12 or 15 fields; anything else is reported on stderr
    and skipped.  When *begindate* is set it takes precedence over
    *beginline* (a 0-based index of the first line to include).
    """
    total = defaultdict(int)
    byannotator = {}
    byclass = {}
    for i, line in enumerate(lines):
        fields = line.split(' ')
        if len(fields) not in (12, 15):
            _log("Skipping line " + str(i+1) + ", old format...")
            continue
        _log("Processing " + str(i+1))

        if begindate:
            if len(fields) != 15:
                _log("Skipping line " + str(i+1) + ", old format, no timestamp...")
                continue
            # Plain string comparison against the first field.
            if fields[0] < begindate:
                _log("Skipping line " + str(i+1) + ", prior to date threshold...")
                continue
        elif beginline and i < beginline:
            _log("Skipping line " + str(i+1) + "...")
            continue

        # The fields of interest are addressed from the end of the line, so
        # the same offsets work for both the 12- and the 15-field layout.
        mode = fields[-5]
        cls = fields[-4]
        annotator = fields[-3]

        # Annotator values of 30 characters or more are not counted.
        # NOTE(review): presumably a guard against malformed lines - confirm.
        if annotator and len(annotator) < 30:
            _bump(byannotator, annotator, mode)
        if cls:
            _bump(byclass, cls, mode)
        total[mode] += 1
    return total, byannotator, byclass


def _accepted_ratio(counts):
    """Return accepted / (accepted + discarded), or None if that sum is zero."""
    accepted = counts.get('accepted-correction', 0)
    decided = counts.get('discarded', 0) + accepted
    if not decided:
        return None
    return accepted / float(decided)


def _print_counts(counts):
    """Print every ``mode: count`` pair, then the accepted ratio if defined."""
    for key, value in counts.items():
        print(key + ':\t' + str(value))
    ratio = _accepted_ratio(counts)
    if ratio is not None:
        print('Accepted Ratio:\t' + str(ratio))


def main(argv=None):
    """Run the report for ``argv`` (defaults to ``sys.argv``).

    Returns the process exit status: 0 on success, 2 when the log file
    argument is missing.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        _log("Usage: valkuilharvestscore.py LOGFILE [BEGINDATE | -TAILLINES]")
        return 2
    path = argv[1]

    begindate, taillines = _parse_threshold(argv)
    beginline = 0
    if taillines < 0:
        # taillines is negative, so this is the index of the first line kept.
        beginline = _count_lines(path) + taillines

    with codecs.open(path, 'r', 'utf-8') as f:
        total, byannotator, byclass = _tally(f, begindate, beginline)

    print("TOTALS\n---------------------\n")
    _print_counts(total)

    # Per-annotator ("module") report, currently disabled:
    #print("\nBY MODULE\n---------------------\n")
    #for annotator in sorted(byannotator):
    #    print(annotator)
    #    for key, value in byannotator[annotator].items():
    #        print(key + ':\t' + str(value))
    #    print("")

    print("\nBY CLASS\n---------------------\n")
    for cls in sorted(byclass):
        print(cls)
        _print_counts(byclass[cls])
        print("")
    return 0


if __name__ == "__main__":
    sys.exit(main())