# Basic imports

In [1]:
import gensim
import numpy as np
from collections import defaultdict
import operator
import json
from chemdataextractor.doc import Paragraph
# Path of the saved gensim Word2Vec model (the value here looks like a placeholder to be replaced).
MODEL_FILE_NAME = '/path/to/model/file'
# Load the trained model once; the cells below query its word vectors through full_model.wv.
full_model = gensim.models.Word2Vec.load(MODEL_FILE_NAME)

In [2]:
# Number of distinct tokens in the model vocabulary
len(full_model.wv.vocab)

365743

# Predict Polymer Applications

### We will predict applications for a given polymer name. The word vector of the polymer is compared against the word vectors of the terms tagged as applications.

In [4]:
# Import tagged data: a JSON object whose keys are the tag strings.
# The context manager closes the file as soon as it has been parsed.
with open('../data/Mat-intelligence_data/apl.json', 'r', encoding='utf-8') as data:
    tags = json.load(data)
list_tags = list(tags)
# Keep only the tags that contain a space, i.e. multi-word tags
list_tags_reduced = [tag for tag in list_tags if ' ' in tag]

In [8]:
# Score every multi-word tag that is in the model vocabulary against the
# query word, rank the tags by that similarity and keep the best top_k.

query_word = 'polyethylene'
top_k = 20
dict_list = {
    tag: full_model.wv.similarity(tag, query_word)
    for tag in list_tags_reduced
    if tag in full_model.wv.vocab
}

sorted_dict = sorted(dict_list.items(), key=lambda pair: pair[1], reverse=True)
keys_only = [name for name, _ in sorted_dict]
keys_only_subset = keys_only[:top_k]

# Discard names that open a parenthesis but never close it.
incomplete_entities = [name for name in keys_only_subset if '(' in name and ')' not in name]
keys_only_subset = [name for name in keys_only_subset if name not in incomplete_entities]

# Let ChemDataExtractor look for chemical entity mentions in the remaining
# names, joined into a single ' ; '-separated string.
doc = ' ; '.join(keys_only_subset)
chem_entities = Paragraph(doc).cems
chem_entities_string = list({str(c) for c in chem_entities})

# Whatever was recognised as a chemical entity is dropped from the
# candidate list; the names that survive are displayed.
keys_only_subset = [name for name in keys_only_subset if name not in chem_entities_string]
keys_only_subset

[]

# Polymer Named Entity Normalization

### Predict chemical named entities (CNEs) that are likely to refer to the same entity as a given CNE, based on word-vector similarity. At present this does not work well for entity normalization: it also returns any CNE that tends to co-occur with the query.

### Normalization: Label all names that represent the same entity

In [121]:
query_word = 'poly(methyl methacrylate)'
dict_list = {}
# query_word is a vocabulary word rather than a vector, so use the
# word-based neighbour lookup.
keys = full_model.wv.similar_by_word(query_word, topn=5)
keys_only_subset = [item[0] for item in keys]
# Drop neighbours that open a parenthesis but never close it
# (presumably truncated tokens).
incomplete_entities = [tag for tag in keys_only_subset if '(' in tag and ')' not in tag]
keys_only_subset = [tag for tag in keys_only_subset if tag not in incomplete_entities]
# Let ChemDataExtractor look for chemical entity mentions in the
# ' ; '-joined neighbour names.
doc = ' ; '.join(keys_only_subset)
chem_entities = Paragraph(doc).cems
chem_entities_string = list({str(c) for c in chem_entities})
# polymer_NER: the extracted mentions that match a neighbour name exactly.
polymer_NER = [tag for tag in chem_entities_string if tag in keys_only_subset]
# The cell displays the neighbour list, not polymer_NER.
keys_only_subset

['polymethylmethacrylate',
 'poly(methylmethacrylate)',
 'polymethyl methacrylate',
 'Poly(methyl methacrylate)',
 'polystyrene']

# Word2Vec Analogies

In [8]:
# Analogy query: vectors of the `positive` terms are added and the
# `negative` term subtracted; the nearest vocabulary entries are displayed.
full_model.wv.most_similar(
    positive=['waxd', 'scanning electron microscopy'],
    negative=['sem'],
)

[('waxs', 0.6477597951889038),
 ('wide-angle x-ray diffraction', 0.6331572532653809),
 ('saxs', 0.5924482345581055),
 ('wxrd', 0.5798591375350952),
 ('waxrd', 0.5793081521987915),
 ('wide‐angle', 0.5599203705787659),
 ('differential scanning calorimetry', 0.5474841594696045),
 ('x-ray diffraction', 0.5427091717720032),
 ('small-angle x-ray scattering', 0.5336911082267761),
 ('diffractometry', 0.5099477767944336)]

In [None]:
# Evaluate performance of analogies
def word_analogy_performance(file, model_file, best_of=4):
    """Score a word-embedding model on a sectioned analogy file.

    The file is read line by line. A line starting with ':' opens a new
    section, named by the rest of the line with every ': ' removed. Any
    other non-empty line is one analogy made of ';'-separated terms
    ``a;b;c;d``; spaces inside a term are replaced by '_' before lookup.
    An analogy is counted only when all of its terms are in the model
    vocabulary, and it is correct when ``a`` is among the ``best_of``
    nearest neighbours returned for positive terms ``b`` and ``c`` and
    negative term ``d``.

    file: contains analogy ground truth pairs
    model_file: word embedding model file; its path must contain
        'word2vec' or 'fastText', which selects the gensim loader
    best_of: mark output as correct if found in the top best_of predictions

    Returns a dict mapping each section name to
    ``{'Accuracy': correct / counted, 'Dataset size': counted}``.
    Sections without a name or without any counted analogy are omitted.

    Raises ValueError when model_file names neither supported model type.
    """
    if 'word2vec' in model_file:
        model = gensim.models.Word2Vec.load(model_file)
    elif 'fastText' in model_file:
        model = gensim.models.FastText.load(model_file)
    else:
        raise ValueError(
            "model_file must contain 'word2vec' or 'fastText' to select "
            "a loader, got {!r}".format(model_file)
        )

    vocab = model.wv.vocab
    dict_performance = {}

    def record_section(name, hits, total):
        # Unnamed sections and sections with no counted analogy are skipped,
        # which also avoids dividing by zero.
        if total != 0 and name != '':
            dict_performance[name] = {
                'Accuracy': hits / total,
                'Dataset size': total,
            }

    header = ''
    # Both counters start at zero so analogy lines that precede the first
    # header cannot hit an unbound counter.
    success_count = 0
    total_count = 0
    with open(file, mode='r', newline='\n') as f:
        for raw_line in f:
            # Strip '\r' as well as '\n' so CRLF files do not leave a stray
            # carriage return on the last term or on the section name.
            line = raw_line.rstrip('\r\n')
            if not line:
                continue
            if line.startswith(':'):
                # A new header closes the section collected so far.
                record_section(header, success_count, total_count)
                header = line.replace(': ', '')
                success_count = 0
                total_count = 0
                continue
            analogy_set = [token.replace(' ', '_') for token in line.split(';')]
            # Four terms are indexed below; shorter lines cannot be scored.
            if len(analogy_set) < 4:
                continue
            if not all(token in vocab for token in analogy_set):
                continue
            total_count += 1
            # Ask for exactly best_of neighbours; the library default of 10
            # would silently cap larger best_of values.
            predictions = model.wv.most_similar(
                positive=[analogy_set[1], analogy_set[2]],
                negative=[analogy_set[3]],
                topn=best_of,
            )
            if analogy_set[0] in [item[0] for item in predictions]:
                success_count += 1
        # The last section has no following header to close it.
        record_section(header, success_count, total_count)
    return dict_performance
