/
tablevals.py
79 lines (70 loc) · 2.44 KB
/
tablevals.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/env python
from corpus import filtered_corpus
from gensim import corpora, models, similarities
import nltk
from collections import Counter
import re
# Scale words that multiply the number seen immediately before them.
_SCALE_WORDS = {'billion': 1.0E9, 'mln': 1.0E6, 'million': 1.0E6}


def _parse_number(token):
    """Return the float formed by the digits and '.' in *token*, or None.

    Every character other than '0'-'9' and '.' is ignored, so "1,234" parses
    as 1234.0.  A token with a second '.' (e.g. "1.2.3") or with no digits at
    all yields None.
    """
    chars = []
    for ch in token:
        if '0' <= ch <= '9':
            chars.append(ch)
        elif ch == '.':
            if '.' in chars:
                # Second decimal point: not a plain number, reject the token.
                return None
            chars.append(ch)
    if not any(ch != '.' for ch in chars):
        # Empty, or nothing but a lone '.'.
        return None
    return float(''.join(chars))


def preprocess_tables():
    """Yield ``(train, topic, values)`` for each item of ``filtered_corpus()``.

    ``filtered_corpus()`` is expected to yield ``(train, topic, title, text)``
    tuples; ``train`` and ``topic`` are passed through unchanged and ``title``
    is ignored.  ``values`` is the list of floats found in ``text``, in order
    of appearance.  The scale words "billion", "mln" and "million" multiply
    the value parsed just before them and are otherwise dropped.

    Each line of ``text`` is also echoed to standard output, as in the
    original implementation.
    """
    for train, topic, title, text in filtered_corpus():
        lines = text.split('\n')
        for line in lines:
            # Parenthesised single-argument form works on Python 2 and 3.
            print(line)

        values = []
        # Newlines are treated like spaces; empty tokens parse to None below.
        # NOTE(review): scale words are matched case-sensitively.  The original
        # lower-cased the joined text only after tokenising (a no-op), so
        # case-insensitive matching may have been intended -- confirm.
        for token in ' '.join(lines).split(' '):
            scale = _SCALE_WORDS.get(token)
            if scale is not None:
                if values:
                    values[-1] *= scale
                continue
            number = _parse_number(token)
            if number is not None:
                values.append(number)
        yield train, topic, values
def export_to_arff(mode, output_path):
    """Write the table values of one corpus split to an ARFF file.

    Args:
        mode: Which split to export; must be "TRAIN" or "TEST".  Only items
            from ``preprocess_tables()`` whose first field equals ``mode``
            are written.
        output_path: Path of the ARFF file to create (overwritten if present).

    Each data row is ``value_prev,value_1,topic``: ``value_1`` is a value from
    the item's list and ``value_prev`` is the previously written value of the
    same item, or ``?`` (ARFF missing value) for the first one.

    Raises:
        AssertionError: if ``mode`` is not one of the two accepted strings.
    """
    assert mode in ["TRAIN", "TEST"]

    # Get table items
    tables = [(topic, values)
              for train, topic, values in preprocess_tables()
              if train == mode]

    # ARFF attributes, listed in the same order the data rows are written.
    # (Sorting the names would declare value_1 before value_prev and so
    # mislabel both numeric columns.)
    attributes = ['value_prev', 'value_1']
    # Sorted so the nominal class declaration is deterministic across runs.
    topics = sorted(set(topic for topic, _ in tables))

    # The context manager closes the file even if writing fails part-way.
    with open(output_path, 'w') as output_f:
        output_f.write('@relation "tablevals"\n')
        for name in attributes:
            output_f.write('@attribute %s numeric\n' % (name,))
        output_f.write('@attribute topicClass {%s}\n' % (','.join(topics),))
        output_f.write('@data\n')
        for topic, values in tables:
            value_prev = None
            for entry in values:
                # Values of 1e12 or more are skipped and do not become
                # value_prev -- presumably treated as outliers; confirm.
                if entry >= 1E12:
                    continue
                prev_field = '?' if value_prev is None else str(value_prev)
                output_f.write('%s,%s,%s\n' % (prev_field, str(entry), topic))
                value_prev = entry
if __name__ == "__main__":
    # Export each corpus split to its own ARFF file, training split first.
    for split, arff_path in (("TRAIN", "tablevals_train.arff"),
                             ("TEST", "tablevals_test.arff")):
        export_to_arff(split, arff_path)