Utility.py
# -*- coding: utf-8 -*-
"""Utility functions for corpus preprocessing and POS tagging."""
import re
import json

import requests
from nltk.tokenize import wordpunct_tokenize

from Reader import TrainingFileReader
def tokenize_file(old_file_path, new_file_path):
    """Tokenize each line of a corpus and write the tokens back out, space-separated."""
    with open(old_file_path, mode='r', encoding='latin-1') as corpus, \
            open(new_file_path, mode='w', encoding='latin-1') as writer:
        for line in corpus:
            line_tokens = wordpunct_tokenize(line)
            writer.write(' '.join(line_tokens) + '\n')
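# A minimal usage sketch (hypothetical file names): tokenize a raw corpus so
# that punctuation is split from words before further processing.
# tokenize_file("raw_corpus.txt", "tokenized_corpus.txt")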
def extract_partial_file(filepath, ratio, new_file_path, other_file_path):
    """Split a corpus into a training file and a test file, keeping roughly
    `ratio` of the lines in the training file."""
    stored = 0
    total = 0
    with open(filepath, mode='r', encoding='latin-1') as reader, \
            open(new_file_path, mode='w', encoding='latin-1') as training_set, \
            open(other_file_path, mode='w', encoding='latin-1') as test_set:
        for line in reader:
            # The first line always goes to the training set; after that,
            # lines are routed so the training share stays close to `ratio`.
            if total == 0 or stored / total < ratio:
                training_set.write(line)
                stored += 1
            else:
                test_set.write(line)
            total += 1
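# A minimal usage sketch (hypothetical file names): keep roughly 80% of the
# lines for training and route the rest to a held-out test file.
# extract_partial_file("tokenized_corpus.txt", 0.8, "train.txt", "test.txt")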
def clean_wiki_corpus(old_file_path, new_file_path):
    """Remove doc tags, blank lines and trailing reference/link sections from
    a Wikipedia dump, writing only article body text to the new file."""
    # Section headings that start the non-content tail of an article;
    # everything from one of these up to ENDOFARTICLE is dropped.
    tail_sections = (
        r" Fuentes \.",
        r" Referencias \.",
        r" Bibliografía \.",
        r" Enlaces externos \.",
        r" Véase también \.",
        r" Referencias y enlaces externos \.",
        r" Galería \.",
    )
    with open(old_file_path, mode='r', encoding='latin-1') as corpus, \
            open(new_file_path, mode='w', encoding='latin-1') as writer:
        keep_killing = False
        for line in corpus:
            if re.search(r"<doc.*>", line) or re.search(r"</doc.*>", line):
                pass  # drop the <doc>/</doc> markup lines
            elif re.search(r"ENDOFARTICLE\.", line):
                keep_killing = False  # article over, resume keeping text
            elif keep_killing:
                pass  # inside a trailing section, keep dropping lines
            elif any(re.search(pattern, line) for pattern in tail_sections):
                keep_killing = True
            elif not line.strip():
                pass  # drop blank lines
            else:
                writer.write(line)
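# A minimal usage sketch (hypothetical file names): strip extractor markup and
# trailing reference sections from a Spanish Wikipedia dump.
# clean_wiki_corpus("eswiki_extracted.txt", "eswiki_clean.txt")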
def separate_file_into_two(file_path, mt_file_name, hmn_file_name):
    """Split a labelled corpus into one file per class: lines labelled '0'
    (machine translation) and lines labelled '1' (human)."""
    with open(file_path, mode='r', encoding='latin-1') as reader, \
            open(mt_file_name, mode='w', encoding='latin-1') as zerofile, \
            open(hmn_file_name, mode='w', encoding='latin-1') as onefile:
        for line in reader:
            if line.strip():
                if line[0] == '0':
                    zerofile.write(line[2:])  # strip the "0 " label
                elif line[0] == '1':
                    onefile.write(line[2:])  # strip the "1 " label
                else:
                    print("Unexpected label: " + line)
def tag_corpus(file_name, host_address, props):
    """POS-tag every sentence in a training file via a CoreNLP server and
    write '<classification> <POS tags>' lines to tagged_corpus.txt."""
    sentences = TrainingFileReader.load_training_file(file_name)
    headers = {'Content-Type': 'application/json; charset=UTF-8'}
    with open("tagged_corpus.txt", "w") as pos_corpus:
        for sentence in sentences:
            # `props` should request at least the 'tokenize, ssplit, pos'
            # annotators (see the example in __main__ below).
            sent = sentence['sentence']
            r = requests.post(host_address, params=props, headers=headers,
                              data=sent.encode('UTF-8'))
            json_response = json.loads(r.text, strict=False)
            pos_sent = []
            for json_sent in json_response['sentences']:
                for token in json_sent['tokens']:
                    pos_sent.append(token['pos'])
            pos_corpus.write(str(sentence['classification']) + " "
                             + " ".join(pos_sent) + '\n')
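# For reference, tag_corpus relies on the server replying with JSON of roughly
# this shape (only the fields the code actually reads are shown; the exact POS
# tag set depends on the model loaded by the server):
#
#   {"sentences": [{"tokens": [{"pos": "..."}, ...]}, ...]}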
if __name__ == '__main__':
    # Example invocations, disabled by default:
    # with open("core_nlp_spanish.props", mode="r") as props_file:
    #     props = {'properties': props_file.read().replace('\n', '').replace(' ', '')}
    # print(props)
    # tag_corpus("training.txt", host_address="http://146.193.224.53:9000/", props=props)
    # separate_file_into_two("tagged_corpus.txt", "mt_pos_corpus.txt", "hmn_pos_corpus.txt")
    pass