### Hypernymy discovery using projection learning

SemEval2018-Task9 dataset

In [1]:
# 1. Load candidates and queries
# queries: data |hypos; candidates: vocabulary |hypers (total)

from preprocess import *
se = Dataset()
candidates, queries = se.load_vocab('./data/SemEval2018-Task9', lower_queries=True)


# ```````````````````````````````````````````````````````````````
# Check
print("# of candidates: {}".format(len(candidates)))
print("# of queries: {}".format(len(queries)))

vocab = candidates.union(queries)
print("\nSize of vocab: {}".format(len(vocab)))

unigrams, bigrams, trigrams = is_ngram(vocab)
print("# of unigrams: {}".format(len(unigrams)))
print("# of bigrams: {}".format(len(bigrams)))
print("# of trigrams: {}".format(len(trigrams)))
# ```````````````````````````````````````````````````````````````


# 2. Preprocess the corpus
# 2.1 Count n-gram frequencies in corpus
term_to_freq_in = n_gram_count("./data/2B_music_bioreviews_tokenized.txt", vocab)

nb_missing_q = sum(1 for w in queries if term_to_freq_in[w] == 0)
nb_missing_c = sum(1 for w in candidates if term_to_freq_in[w] == 0)
print("# of zero-frequency queries: {}".format(nb_missing_q))
print("# of zero-frequency candidates: {}".format(nb_missing_c))


Loading candidates from vocabulary ...
Loading queries from './data/SemEval2018-Task9/vocabulary/2B.music.vocabulary.txt' ...
Loading queries from './data/SemEval2018-Task9/training/data/2B.music.training.data.txt' ...
Loading queries from './data/SemEval2018-Task9/trial/data/2B.music.trial.data.txt' ...
# of candidates: 69118
# of queries: 1014

Size of vocab: 69336
# of unigrams: 58394
# of bigrams: 9749
# of trigrams: 1193
Counting lines in corpus...


3481it [00:00, 34599.02it/s]

Counting n-gram frequencies in corpus...


4298453it [02:54, 24681.47it/s]

4298453/4298453 lines processed. Vocab coverage: 69336/69336.
# of zero-frequency queries: 0
# of zero-frequency candidates: 0





**preprocess the corpus**

In [2]:
# 2.2 Replace multi-word terms with single tokens and write into files
# n-gram --> sigle token
replace_OOV = True

corpus = "./data/2B_music_bioreviews_tokenized.txt"
preprocessed = "./f_out_preprocessed_corpus.txt"
term_to_freq_out = preprocess_corpus(corpus, preprocessed, queries, candidates, term_to_freq_in, replace_OOV)

1571it [00:00, 15616.86it/s]

Processing corpus...


4298453it [03:15, 22013.99it/s]

# of missing queries in output: 0
# of missing candidates in output: 21
Examples: amyotrophic, australian rules, electroconvulsive, esophageal, giardiniera, gluteus, holies, hyoscyamus, impinge, infarction, inkjet, longlegs, matrix printer, maundy, pancreatic, propping, reductio, rheumatoid, signal-to-noise, strep, whole shebang
Finished...

Wrote corpus to './f_out_preprocessed_corpus.txt'





In [3]:
# 2.3 Write frequencies
vocab_out_name = "f_out"
write_freq(vocab_out_name, term_to_freq_out)

Wrote vocab to './f_out.vocab.txt'


**train word embeddings on corpus using word2vec**

In [7]:
# 3. Train embeddings
from embed import *

preprocessed_corpus_path = "./f_out_preprocessed_corpus.txt"
save_embed_path = "./replace_OOV_" + str(replace_OOV) + "_word2vec_model"
#-cbow 0 -negative 10 -size 200 -window 7 -sample 1e-5 -min-count 1 -iter 10 -threads 8 -binary 0
train_embed(preprocessed_corpus_path, save_embed_path, size=200, window=7, min_count=1, workers=3, sg=0, negative=10, sample=1e-5)


In [8]:
# load embedding
embeddings = load_embed(save_embed_path)

In [8]:
# try
# i. Query for the word embeddings of a given word
print('Embedding for art_rock: ',embeddings.wv['art_rock'])
print('Embedding for art_rock: ',embeddings.wv['art_rock'])

# ii. Inspect the model vocabulary
# print the first 20 words
print('\nFirst 20 vocabulary words:',list(embeddings.wv.vocab)[:20])
print('\nVocabulary size:',len(list(embeddings.wv.vocab)))

Embedding for art_rock:  [-0.23230837  0.01129624  0.13712774  0.2948994   0.32275087  0.10951022
 -1.2290376  -0.7920031   0.80765796 -0.5433384  -0.31529322  0.22816107
  0.06184823 -0.37818426  0.22129554 -0.4265703   0.6036683  -0.7077877
 -0.03365989  0.40926868 -0.35232663  0.3912602   0.42833325 -0.6010476
 -0.4191413  -0.87336016 -0.00455265  0.9714704  -0.11817481  1.188886
 -1.0646467   0.9814277   1.0018854   0.38089174  0.7176756   0.00531878
 -0.61265355 -0.3516089   0.14747235 -0.4422404   0.11448751 -0.3273031
 -1.0525167  -0.6365699   0.10880286 -1.2576685  -0.10639022  1.1753112
  0.8095967  -0.05291764  0.16562843  0.2761667   0.05437643 -0.1625104
 -0.5321007   0.62873125 -0.00801373 -0.53174585 -0.27858248  0.36114895
 -1.2442654  -0.08463544 -0.42626226 -1.0270053   0.7380261   0.5169968
 -0.45904094  0.4419561  -1.3959762   1.1092414   0.039956    0.2952897
 -0.38187805  0.02782012 -0.20733292  0.9550967   0.43262872 -0.865123
  0.15561125  0.1213872  -0.17276989 

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  
  if __name__ == '__main__':


In [9]:
# 4. vocabulary (hypo-hyper pairs) to embeddings
write_embed_path = "./embeddings.txt"
write_embed(save_embed_path, write_embed_path)

**preprocess data**

In [10]:
# 5. Load data
# candidates
print("# of candidates: {}".format(len(candidates)))
# Load queries
print("\nLoading queries...")
path_q_train = "{}/training/data/{}.training.data.txt".format("./data/SemEval2018-Task9", "2B.music")
path_q_dev = "{}/trial/data/{}.trial.data.txt".format("./data/SemEval2018-Task9", "2B.music")
path_q_test = "{}/test/data/{}.test.data.txt".format("./data/SemEval2018-Task9", "2B.music")
q_train, _ = se.load_queries(path_q_train, normalize=True)
q_dev, _ = se.load_queries(path_q_dev, normalize=True)
q_test, _ = se.load_queries(path_q_test, normalize=True)
print("# of training queries: {}".format(len(q_train)))
print("# of dev queries: {}".format(len(q_dev)))
print("# of test queries: {}".format(len(q_test)))

# of candidates: 69118

Loading queries...
# of training queries: 500
# of dev queries: 15
# of test queries: 500


In [11]:
# Load gold hypernyms (train and dev only)
print("Loading gold hypernyms...")
path_h_train = "{}/training/gold/{}.training.gold.txt".format("./data/SemEval2018-Task9", "2B.music")
path_h_dev = "{}/trial/gold/{}.trial.gold.txt".format("./data/SemEval2018-Task9", "2B.music")
h_train = se.load_hypernyms(path_h_train)
h_dev = se.load_hypernyms(path_h_dev, normalize=True)
print("# of training pairs: {}".format(sum(len(x) for x in h_train)))
print("# of dev pairs: {}".format(sum(len(x) for x in h_dev)))

Loading gold hypernyms...
# of training pairs: 5455
# of dev pairs: 355


In [12]:
print("Loading pre-trained word embeddings...")
embed_vocab_list, word2vec = get_embeddings("./embeddings.txt", np.float32)
embed_vocab_set = set(embed_vocab_list)
print("# of embeddings: {}".format(len(embed_vocab_list)))

Loading pre-trained word embeddings...
# of embeddings: 69316


In [13]:
# 6. Preparing embeddings for data
print("Making embedding array for candidates...")
candidate_embeds = make_embedding_matrix(word2vec, candidates, seed=91500)
candidate_embeds = normalize_numpy_matrix(candidate_embeds)
print("# of embeddings: {}".format(candidate_embeds.shape[0]))
print("Making embedding array for training queries...")
train_query_embeds = make_embedding_matrix(word2vec, q_train, seed=91500)
train_query_embeds = normalize_numpy_matrix(train_query_embeds)
print("# of embeddings: {}".format(train_query_embeds.shape[0]))
print("Making embedding array for dev queries...")
dev_query_embeds = make_embedding_matrix(word2vec, q_dev, seed=91500)
dev_query_embeds = normalize_numpy_matrix(dev_query_embeds)
print("# of embeddings: {}".format(dev_query_embeds.shape[0]))
print("Making embedding array for test queries...")
test_query_embeds = make_embedding_matrix(word2vec, q_test, seed=91500)
test_query_embeds = normalize_numpy_matrix(test_query_embeds)
print("# of embeddings: {}".format(test_query_embeds.shape[0]))

Making embedding array for candidates...
# of embeddings: 69118
Making embedding array for training queries...
# of embeddings: 500
Making embedding array for dev queries...
# of embeddings: 15
Making embedding array for test queries...
# of embeddings: 500


In [14]:
print("# of embed vocabulary: ",len(embed_vocab_set))
# print(candidates)  #--vocabulary
# print(h_train)  #--gold
cand_in_embed_vocab=[x for x in candidates if x in embed_vocab_set]
flat_h_train=[x for list in h_train for x in list]
flat_h_dev=[x for list in h_dev for x in list]
h_train_in_cand=[x for x in flat_h_train if x in candidates]  #--gold in vocabulary
h_dev_in_cand=[x for x in flat_h_dev if x in candidates]  #--gold in vocabulary
print("# of candidates: ",len(candidates))
print("# of cand in embed vocab: ",len(cand_in_embed_vocab))
print("-- Assign a random vector to words that don't have one. --")
print("# of h_train: ",len(flat_h_train))
print("# of h_dev: ",len(flat_h_dev))
print("# of h_train in cand: ",len(h_train_in_cand))
print("# of h_dev in cand: ",len(h_dev_in_cand))
# why some candidate not in embed vocab?
# why some gold not in candidate?

# of embed vocabulary:  69316
# of candidates:  69118
# of cand in embed vocab:  58327
-- Assign a random vector to words that don't have one. --
# of h_train:  5455
# of h_dev:  355
# of h_train in cand:  2651
# of h_dev in cand:  231


In [15]:
# Make array of (query IDs, hypernym IDs) pairs for model training
print("Making array of (query ID, hypernym ID) pairs...")
candidate_to_id = {w:i for i,w in enumerate(candidates)}
train_query_to_id = {w:i for i,w in enumerate(q_train)}
dev_query_to_id = {w:i for i,w in enumerate(q_dev)}

# train_pairs = make_pairs(q_train, h_train, train_query_to_id, candidate_to_id)
# dev_pairs = make_pairs(q_dev, h_dev, dev_query_to_id, candidate_to_id)
###### temporarily use hypers already in candidates
train_pairs = make_pairs(q_train, h_train_in_cand, train_query_to_id, candidate_to_id)
dev_pairs = make_pairs(q_dev, h_dev_in_cand, dev_query_to_id, candidate_to_id)

print("# of train pairs: {}".format(train_pairs.shape[0]))
print("# of dev pairs: {}".format(dev_pairs.shape[0]))
    
# Check for queries that are also candidates. Make list of query
# candidate IDs (None for queries that are not candidates)
train_q_cand_ids = [candidate_to_id[q] if q in candidate_to_id else None for q in q_train]
dev_q_cand_ids = [candidate_to_id[q] if q in candidate_to_id else None for q in q_dev]
test_q_cand_ids = [candidate_to_id[q] if q in candidate_to_id else None for q in q_test]
nb_cand_q_train = sum(1 for i in train_q_cand_ids if i is not None)
nb_cand_q_dev = sum(1 for i in dev_q_cand_ids if i is not None)
nb_cand_q_test = sum(1 for i in test_q_cand_ids if i is not None)
print("# of training queries that are also candidates: {}".format(nb_cand_q_train))
print("# of dev queries that are also candidates: {}".format(nb_cand_q_dev))
print("# of test queries that are also candidates: {}".format(nb_cand_q_test))

Making array of (query ID, hypernym ID) pairs...
# of train pairs: 500
# of dev pairs: 15
# of training queries that are also candidates: 248
# of dev queries that are also candidates: 7
# of test queries that are also candidates: 250


In [16]:
# 7. Pickle and dump data
import joblib
    
data = {}
data["candidates"] = candidates
data["candidate_embeds"] = candidate_embeds
data["train_queries"] = q_train
data["train_query_embeds"] = train_query_embeds
data["train_query_cand_ids"] = train_q_cand_ids
data["dev_queries"] = q_dev
data["dev_query_embeds"] = dev_query_embeds
data["dev_query_cand_ids"] = dev_q_cand_ids
data["test_queries"] = q_test
data["test_query_embeds"] = test_query_embeds
data["test_query_cand_ids"] = test_q_cand_ids
data["train_pairs"] = train_pairs
data["dev_pairs"] = dev_pairs
print("\nData:")
for k,v in data.items():
    print("- {} ({}.{})".format(k, type(v).__module__, type(v).__name__))
joblib.dump(data, "./dumped.data")
print("\nWrote data to '{}'\n".format("./dumped.data"))


Data:
- candidates (builtins.set)
- candidate_embeds (numpy.ndarray)
- train_queries (builtins.list)
- train_query_embeds (numpy.ndarray)
- train_query_cand_ids (builtins.list)
- dev_queries (builtins.list)
- dev_query_embeds (numpy.ndarray)
- dev_query_cand_ids (builtins.list)
- test_queries (builtins.list)
- test_query_embeds (numpy.ndarray)
- test_query_cand_ids (builtins.list)
- train_pairs (numpy.ndarray)
- dev_pairs (numpy.ndarray)

Wrote data to './dumped.data'



**Train model on training and dev data in pickle file, write a model and a log file in dir-model**

In [1]:
from train import *

data_path = "./dumped.data"
log_path = "./modLog.txt"
save_filename = "projectMod"
model = train_model(data_path, log_path, 9510, use_gpu = False)
save_model(save_filename, model)

Loading data...
Data:
- candidates (builtins.set)
- candidate_embeds (numpy.ndarray)
- train_queries (builtins.list)
- train_query_embeds (numpy.ndarray)
- train_query_cand_ids (builtins.list)
- dev_queries (builtins.list)
- dev_query_embeds (numpy.ndarray)
- dev_query_cand_ids (builtins.list)
- test_queries (builtins.list)
- test_query_embeds (numpy.ndarray)
- test_query_cand_ids (builtins.list)
- train_pairs (numpy.ndarray)
- dev_pairs (numpy.ndarray)

Initializing model...
Model parameters:
- projector.pmats (on CPU, grad=yes) 
- projector.cand_embed.weight (on CPU, grad=yes) 
- output.weight (on CPU, grad=yes) 
- output.bias (on CPU, grad=yes) 

Evaluating untrained model on dev set...
MAP: 0.0000
AP: 0.0000
MRR: 0.0000

Starting training...

Epoch	Updates	PosLoss	NegLoss	DevLoss	DevMAP	DevAP	DevMRR	TimeElapsed
1	16	0.6992	7.0832	0.6754	0.0000	0.0000	0.0000	8.3s
2	17	0.7179	6.5686	0.7035	0.0000	0.0000	0.0000	15.9s
3	16	0.7436	6.1078	0.7308	0.0000	0.0000	0.0000	21.4s
4	15	0.7531	5.6

135	17	0.0534	1.0342	1.6502	0.0000	0.0000	0.0000	889.1s
136	16	0.0601	1.0182	1.6485	0.0000	0.0000	0.0000	896.9s
137	16	0.0414	1.0311	1.6488	0.0000	0.0000	0.0000	904.5s
138	17	0.0470	0.9462	1.6495	0.0000	0.0000	0.0000	912.0s
139	18	0.0381	1.0575	1.6508	0.0000	0.0000	0.0000	919.7s
140	16	0.0375	1.0448	1.6503	0.0000	0.0000	0.0000	926.8s
141	15	0.0432	1.0302	1.6521	0.0000	0.0000	0.0000	933.5s
142	17	0.0364	1.0478	1.6517	0.0000	0.0000	0.0000	940.9s
143	16	0.0342	1.0406	1.6504	0.0000	0.0000	0.0000	948.0s
144	16	0.0370	1.0710	1.6499	0.0000	0.0000	0.0000	955.4s
145	17	0.0435	0.9777	1.6498	0.0000	0.0000	0.0000	963.2s
146	16	0.0371	0.9517	1.6546	0.0000	0.0000	0.0000	970.6s
147	15	0.0328	0.9573	1.6583	0.0000	0.0000	0.0000	977.9s
148	16	0.0338	1.0063	1.6589	0.0000	0.0000	0.0000	985.2s
149	17	0.0314	0.9072	1.6620	0.0000	0.0000	0.0000	992.8s
150	15	0.0311	0.9923	1.6634	0.0000	0.0000	0.0000	1000.4s
151	15	0.0331	1.1654	1.6612	0.0000	0.0000	0.0000	1007.3s
152	17	0.0260	1.1328	1.6571	0.0000	0.0000	0.00

**Load trained model, make predictions on test queries**

In [25]:
# Load model
print("Loading model from {}".format("./projectMod.pt"))
model = torch.load("./projectMod.pt")
model_vocab_size = model.get_nb_candidates()
print("Size of model's vocab (nb_candidates): {}".format(model_vocab_size))

# Load data
print("Loading test data from {}".format("./dumped.data"))
data = joblib.load("./dumped.data")
candidates = data["candidates"]
test_q_cand_ids = data["test_query_cand_ids"]
test_q_embed = make_embedder(data["test_query_embeds"], grad=False, 
                                 cuda=model.use_cuda, sparse=False)

# Make list of test query IDs
print("# of test queries: {}".format(test_q_embed.weight.shape[0]))

# Write predictions on test set
print("Writing predictions on test set to '{}'".format("./Pred_on_test.txt"))
test_eval = Evaluator(model, test_q_embed, test_q_cand_ids)
test_eval.write_predictions("./Pred_on_test.txt", list(candidates))

print("Done.\n")

Loading model from ./projectMod.pt
Size of model's vocab (nb_candidates): 69118
Loading test data from ./dumped.data
# of test queries: 500
Writing predictions on test set to './Pred_on_test.txt'
Done.

