/
pre709.py
56 lines (48 loc) · 1.7 KB
/
pre709.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from corpus import filtered_corpus
from gensim import corpora, models, similarities
import nltk
from collections import Counter
import re
import csv
def preprocess_docs():
    """Yield (train, topic, tokens) triples from the filtered corpus.

    Each document's text is word-tokenized with NLTK and English stopwords
    are removed. `train` and `topic` are passed through unchanged from
    filtered_corpus(); the document title is dropped.
    """
    # Use a set: membership tests inside the comprehension are O(1)
    # instead of a linear scan of the ~180-word stopword list per token.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    for train, topic, title, text in filtered_corpus():
        tokens = [w for w in nltk.word_tokenize(text) if w not in stopwords]
        yield train, topic, tokens
def export_to_arff(mode, output_path):
    """Export the preprocessed documents for one split as a Weka ARFF file.

    Builds a bag-of-words representation: one numeric attribute per purely
    alphabetic token seen in the split, plus a nominal `topicClass`
    attribute, then one data row of term counts per document.

    :param mode: which split to export, "TRAIN" or "TEST"
    :param output_path: path of the ARFF file to (over)write
    :raises ValueError: if mode is not "TRAIN" or "TEST"
    """
    if mode not in ("TRAIN", "TEST"):
        # raise instead of assert: asserts are stripped under `python -O`
        raise ValueError("mode must be 'TRAIN' or 'TEST', got %r" % (mode,))
    # Materialize the requested split (topic, tokens) pairs.
    corpus = [(topic, text) for train, topic, text in preprocess_docs()
              if train == mode]
    # Hoisted out of the loop: compiling per word was pure waste.
    non_alpha = re.compile(r'[^a-zA-Z]')
    # Attribute vocabulary: only purely-alphabetic tokens, so the bare
    # token is a legal ARFF attribute name.
    attributes = set()
    topics = set()
    for topic, text in corpus:
        topics.add(topic)
        for word in text:
            if non_alpha.search(word) is None:
                attributes.add(word)
    vocab = sorted(attributes)
    # `with` guarantees the file is closed even if a write fails.
    with open(output_path, 'w') as output_f:
        output_f.write('@relation "docs"\n')
        for attr in vocab:
            output_f.write('@attribute %s numeric\n' % (attr,))
        # Sort the topics so the nominal declaration (and hence the file)
        # is deterministic across runs; set iteration order is not.
        output_f.write('@attribute topicClass {%s}\n'
                       % (','.join(sorted(topics)),))
        output_f.write('@data\n')
        for topic, text in corpus:
            # Counter returns 0 for absent keys, so no pre-seeding needed;
            # filter against the attribute set to skip non-vocabulary words.
            counts = Counter(w for w in text if w in attributes)
            row = [str(counts[a]) for a in vocab]
            row.append(topic)
            output_f.write(','.join(row) + '\n')
if __name__ == "__main__":
    # Emit one ARFF file per corpus split.
    for split, arff_path in (("TRAIN", "PRE709_train.arff"),
                             ("TEST", "PRE709_test.arff")):
        export_to_arff(split, arff_path)