-
Notifications
You must be signed in to change notification settings - Fork 0
/
utilities.py
85 lines (72 loc) · 2.08 KB
/
utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import pickle as pkl
from os import listdir, path, makedirs
import csv
def get_filenames(dir_):
    """Return the entry names inside directory ``dir_`` (thin os.listdir wrapper)."""
    entries = listdir(dir_)
    return entries
def get_filepaths(dir_):
    """Return '<dir_>/<name>' path strings for every entry in ``dir_``.

    Joins with a literal '/' (same as the original), not os.path.join.
    """
    return [f'{dir_}/{name}' for name in get_filenames(dir_)]
def create_dir_if_not_exist(path_):
    """Create directory ``path_`` (including parents) if it does not exist.

    Uses ``exist_ok=True`` instead of an exists() pre-check: the original
    check-then-create pattern could raise FileExistsError if another process
    created the directory between the two calls (TOCTOU race). This form is
    race-free and idempotent.
    """
    makedirs(path_, exist_ok=True)
def load_picklefile(path_):
    """Deserialize and return the object pickled at ``path_``.

    NOTE(review): pickle.load on untrusted files can execute arbitrary code;
    only use on files this codebase produced.
    """
    with open(path_, 'rb') as handle:
        obj = pkl.load(handle)
    return obj
def dump_picklefile(object_, path_):
    """Serialize ``object_`` to ``path_`` using pickle protocol 3.

    Returns True on completion (kept for compatibility with existing callers).
    """
    with open(path_, 'wb') as handle:
        pkl.dump(object_, handle, protocol=3)
    return True
def load_textfile(path_):
    """Read the text file at ``path_`` and return its lines, whitespace-stripped."""
    with open(path_, 'r') as handle:
        stripped = [line.strip() for line in handle]
    return stripped
def load_csvfile(path_, delim=','):
    """Parse the CSV file at ``path_`` and return a list of row lists.

    delim: field delimiter passed through to csv.reader (default ',').
    """
    with open(path_) as handle:
        rows = list(csv.reader(handle, delimiter=delim))
    return rows
def dump_textfile(data, path_):
    """Write each non-None item of ``data`` to ``path_``, one per line.

    None entries are silently skipped. Returns True on completion.
    """
    with open(path_, 'w') as handle:
        handle.writelines(item + '\n' for item in data if item is not None)
    return True
def load_precomputed_embeddings(dir_, low_memory_mode=False, limit=100000):
    """Load and concatenate pickled embedding collections from every file in ``dir_``.

    Each file in ``dir_`` is expected to be a pickle of a sequence of
    embeddings (extended into one flat list).

    Parameters
    ----------
    dir_ : str
        Directory whose files are all pickled embedding sequences.
    low_memory_mode : bool
        When True, stop loading as soon as more than ``limit`` embeddings
        have been collected and return the partial list.
    limit : int
        Threshold for low_memory_mode. Defaults to 100000, the value that
        was previously hard-coded, so existing callers are unaffected.

    Returns
    -------
    list
        The loaded embeddings (element type depends on what was pickled).
    """
    precomputed_embeddings = []
    for path_ in get_filepaths(dir_):
        print('loading {}'.format(path_))
        precomputed_embeddings.extend(load_picklefile(path_))
        if low_memory_mode and len(precomputed_embeddings) > limit:
            print('low memory mode is active.')
            # fixed message typo: "inconsideration" -> "in consideration"
            print('a smaller subset of the embeddings are returned in consideration for memory usage')
            return precomputed_embeddings
    return precomputed_embeddings
def sentence_contains_term(sentence, terms):
    """Return 1 if any element of ``terms`` occurs as a substring of ``sentence``, else 0."""
    return 1 if any(term in sentence for term in terms) else 0
def replace_words_in_sentence(s, words, replacement_token='UNK', emoticons=False):
    """Replace occurrences of ``words`` in ``s`` with ``replacement_token``.

    emoticons=True: plain substring replacement (no boundary handling).
    emoticons=False: only replaces words bounded by spaces on both sides,
    plus a word at the very start of ``s`` followed by a space.
    NOTE(review): a word at the very END of the sentence (no trailing space)
    is NOT replaced in this mode — preserved as-is from the original.
    """
    if emoticons:
        for token in words:
            s = s.replace(token, replacement_token)
        return s
    for token in words:
        s = s.replace(' ' + token + ' ', ' ' + replacement_token + ' ')
        prefix = token + ' '
        if s.startswith(prefix):
            s = replacement_token + ' ' + s[len(prefix):]
    return s
def csv_to_dict(csv_):
    """Build a dict from parsed CSV rows.

    The first column of each row becomes the key; the remaining columns
    (as a list) become the value. Later rows with a duplicate first column
    overwrite earlier ones.
    """
    return {row[0]: row[1:] for row in csv_}