"""grid_search.py: grid search over Word2Vec hyperparameters.

Each hyperparameter combination is scored by the Spearman correlation between
the model's cosine similarities and human word-similarity judgements.
"""
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models.word2vec import LineSentence
from sklearn.base import BaseEstimator
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm
import itertools
import json  # used by the (commented-out) result dump in the main block
import os
import pandas as pd
import scipy.stats


class EpochSaver(CallbackAny2Vec):
    '''Callback to save a model checkpoint after each epoch.'''

    def __init__(self, path_prefix):
        self.path_prefix = path_prefix
        self.epoch = 0
        os.makedirs(self.path_prefix, exist_ok=True)

    def on_epoch_end(self, model):
        # Save the checkpoint into the directory created in __init__.
        savepath = os.path.join(
            self.path_prefix, 'epoch{}.model'.format(self.epoch)
        )
        model.save(savepath)
        print(
            "Epoch saved: {}".format(self.epoch + 1),
            "Starting next epoch"
        )
        self.epoch += 1


def senseEmbeddings(word, dictionary):
    '''Return the vocabulary keys of all sense embeddings for a given word.'''
    word = word.split()
    word = "_".join(word) if len(word) > 1 else word[0]
    senses = []
    for w in dictionary:
        wordC = "_".join(w.split(":")[0].split("_")[:-1])
        if word.lower() == wordC.lower():
            senses.append(w)
    return senses


def wordSimilarity(w1, w2, dictionary, model):
    '''Take two words and output the maximum similarity over all of their sense pairs.'''
    w1_senses = senseEmbeddings(w1, dictionary)
    w2_senses = senseEmbeddings(w2, dictionary)
    score = -1.0
    if len(w1_senses) != 0 and len(w2_senses) != 0:
        combinations = itertools.product(w1_senses, w2_senses)
        for s1, s2 in combinations:
            score = max(score, model.wv.similarity(s1, s2))
    return score


class Word2VecModel(BaseEstimator):
    '''Thin estimator wrapper around gensim Word2Vec for the hyperparameter grid search.

    Note: uses the gensim 3.x API (``size=``, ``model.wv.vocab``).
    '''

    def __init__(self, window=5, min_count=3, size=400, alpha=0.01,
                 sample=1e-5, negative=9, epochs=15):
        self.w2v_model = None
        self.window = window
        self.min_count = min_count
        self.size = size
        self.alpha = alpha
        self.sample = sample
        self.epochs = epochs
        self.negative = negative

    def fit(self, data):
        '''Train the model on an iterable of tokenised sentences.'''
        # Initialize a skip-gram model with the current hyperparameters.
        self.w2v_model = Word2Vec(size=self.size,
                                  window=self.window,
                                  alpha=self.alpha,
                                  min_count=self.min_count,
                                  negative=self.negative,
                                  workers=8,
                                  sample=self.sample,
                                  sg=1)
        # Build the vocabulary and save the (still untrained) model.
        self.w2v_model.build_vocab(data)
        self.w2v_model.save("../resources/word2vec_final_lower_skip.model")
        # Train the model, checkpointing after every epoch.
        self.w2v_model.train(sentences=data,
                             total_examples=self.w2v_model.corpus_count,
                             epochs=self.epochs,
                             callbacks=[EpochSaver("./checkpoints")])
        self.w2v_model.wv.save_word2vec_format(
            '../resources/embeddings_final_lower_skip.vec', binary=False)
        return self

    def score(self, gold):
        '''Score the model as the Spearman correlation between its word
        similarities and the human judgements in the gold standard.'''
        vocab = self.w2v_model.wv.vocab  # gensim 3.x vocabulary mapping
        dictionary = list(vocab.keys())
        gold['cosine'] = gold.apply(
            lambda row: wordSimilarity(row['Word 1'], row['Word 2'],
                                       dictionary, self.w2v_model),
            axis=1)
        correlation, p_value = scipy.stats.spearmanr(gold['Human (mean)'], gold['cosine'])
        return correlation
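
# Minimal usage sketch for a single configuration (not part of the original
# script; the paths are the ones used elsewhere in this file and the column
# names match the gold standard loaded in GridSearch below):
#   sentences = LineSentence('../resources/parsed_corpora_final_lower.txt')
#   gold = pd.read_csv('../data/combined.tab', delimiter='\t')
#   model = Word2VecModel(window=5, min_count=3, size=100, negative=13, epochs=15)
#   correlation = model.fit(sentences).score(gold)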


def GridSearch(hyperparameters, corpora):
    '''Perform a grid search and return the best hyperparameters based on the correlation score.'''
    best_score = -1.0
    gold = pd.read_csv('../data/combined.tab', delimiter='\t')
    grid = ParameterGrid(hyperparameters)
    best_grid = None
    print("number of combinations: {}\n".format(len(list(grid))))
    grids = []
    for current_grid in tqdm(grid):
        # Train and score a model for this combination.
        current_model = Word2VecModel(**current_grid)
        current_model.fit(corpora)
        current_score = current_model.score(gold)
        del current_model
        # Record the score alongside the hyperparameters.
        current_grid.update({"correlation": current_score})
        grids.append(current_grid)
        print(current_grid)
        if current_score > best_score:
            print("better: ", current_score)
            best_score = current_score
            best_grid = current_grid
    return best_grid, grids


hyperparameters = {'window': [4, 5],
                   'min_count': [1, 3],
                   'size': [100, 300],
                   'negative': [13],
                   'alpha': [0.09, 0.001],
                   'epochs': [50]}

final_hyperparameters = {'window': [5],
                         'min_count': [3],
                         'size': [100],
                         'negative': [13],
                         'alpha': [0.09],
                         'epochs': [50]}


if __name__ == '__main__':
    path_to_corpus = '../resources/parsed_corpora_final_lower.txt'
    # Load the corpus lazily, one tokenised sentence per line.
    sentences = LineSentence(path_to_corpus)
    # Search and train (the full `hyperparameters` grid above can be used instead).
    best_grid, grids = GridSearch(final_hyperparameters, sentences)
    # Result saving, commented out for the final hyperparameter run.
    # with open("../resources/gridsearch7.json", 'w') as f1:
    #     for grid in grids:
    #         json.dump(grid, f1)
    print("GridSearch complete: ", best_grid)