In [8]:
%matplotlib inline

In [9]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN)

In [10]:
import json
import pickle

In [11]:
import os
import gensim
import pandas as pd

In [12]:
df = pd.read_csv('SpaceNASA-Sol#_Updated.csv', encoding='iso-8859-1')

In [13]:
def getTextTokens():
    allTokens = []
    descriptions = df['Description']
    with open('SpaceNASA-Sol#_Updated.csv') as f:
        for i, line in enumerate(f):
#            if(i>100000):
#                continue    
            jLineText = line.strip()
            for i in range(0, 16):
                tokens = gensim.utils.simple_preprocess(str(descriptions.iloc[i]))
#             tokens = gensim.utils.simple_preprocess(jLineText)
            if(i%25000==0):
                print(i)   
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])
    return allTokens

In [14]:
trainingCorpus = list(getTextTokens())
print(trainingCorpus[:5])

[TaggedDocument(words=['nasa', 'technology', 'transfer', 'program', 'solicits', 'inquiries', 'from', 'companies', 'interested', 'in', 'obtaining', 'license', 'rights', 'to', 'commercialize', 'manufacture', 'and', 'market', 'the', 'following', 'technology', 'license', 'rights', 'may', 'be', 'issued', 'on', 'an', 'exclusive', 'or', 'nonexclusive', 'basis', 'and', 'may', 'include', 'specific', 'fields', 'of', 'use', 'nasa', 'provides', 'no', 'funding', 'in', 'conjunction', 'with', 'these', 'potential', 'licenses', 'the', 'technology', 'nasa', 'langley', 'research', 'center', 'has', 'developed', 'metallic', 'material', 'that', 'can', 'be', 'embedded', 'into', 'structural', 'alloys', 'to', 'enhance', 'nondestructive', 'evaluation', 'nde', 'of', 'structure', 'current', 'nde', 'tools', 'such', 'as', 'eddy', 'current', 'probes', 'and', 'others', 'can', 'have', 'some', 'difficulties', 'detecting', 'small', 'flaws', 'in', 'certain', 'materials', 'and', 'structures', 'also', 'using', 'them', 'can

In [None]:
#with open('sam-20211210-tokens-v2.dat', 'wb') as f:
#    pickle.dump(trainingCorpus, f, protocol=2)

In [15]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=150, min_count=2, epochs=50, workers=4)

In [16]:
model.build_vocab(trainingCorpus)

In [17]:
model.train(trainingCorpus, total_examples=model.corpus_count, epochs=model.epochs)

In [18]:
vector = model.infer_vector(['Space', 'Aeronautics', 'NASA', 'proposals'])
print(vector[:5])

[ 0.00065047  0.00251295 -0.00155167 -0.00314516 -0.00069632]


In [19]:
ranks = []
second_ranks = []
for doc_id in range(0, 16):
    inferred_vector = model.infer_vector(trainingCorpus[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

In [20]:
import collections

counter = collections.Counter(ranks)
print(counter)

Counter({3: 1, 4: 1, 8: 1, 14: 1, 10: 1, 12: 1, 1: 1, 15: 1, 5: 1, 11: 1, 9: 1, 2: 1, 13: 1, 7: 1, 6: 1, 0: 1})


In [21]:
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(trainingCorpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(trainingCorpus[sims[index][0]].words)))

Document (15): «nasa technology transfer program solicits inquiries from companies interested in obtaining license rights to commercialize manufacture and market the following technology license rights may be issued on an exclusive or nonexclusive basis and may include specific fields of use nasa provides no funding in conjunction with these potential licenses the technology nasa langley research center has developed metallic material that can be embedded into structural alloys to enhance nondestructive evaluation nde of structure current nde tools such as eddy current probes and others can have some difficulties detecting small flaws in certain materials and structures also using them can be costly time consuming and labor intensive often resulting in significant downtime in the case of examination of machinery and vehicles this innovation is to embed particles that react to strain with easily detected acoustic emissions and change in magnetic properties to express interest in this op

In [None]:
model.save("SpaceNASA-Sol#_5.model")

In [None]:
import gc

gc.collect()