In [1]:
import logging
import gensim
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
data_path = "D:MachineLearning/PURE_patents/"
os.listdir(data_path)

['just_text.csv', 'patent_raw.csv', 'train_corpus.pkl', 'updated-patent-data']

In [18]:
patents = pd.read_csv(data_path + 'patent_raw.csv', header = None)
patents.head(10)

Unnamed: 0,0,1
0,3930271,Golf glove. A golf glove is disclosed having a...
1,3930272,Crib leg lock. A lock for a height-adjustable ...
2,3930273,Bed safety side rail arrangement. A bed safety...
3,3930274,Assembly for use in recreational activities. T...
4,3930275,Method of fabricating a slipper. A novel slipp...
5,3930276,Wheel spinning and vehicle conveying apparatus...
6,3930277,Mobile floor sweeper. A Mobile Floor Sweeper i...
7,3930278,Paintbrush and guard attachment for edging. A ...
8,3930279,Rubber windshield wiper blades having increase...
9,3930280,Bottle insert for product container. Leaks are...


In [19]:
print('First Patent:\n' + patents.iloc[0][1])

First Patent:
Golf glove. A golf glove is disclosed having an extra finger pocket between the index and middle finger pockets for securing one finger of one hand of a golf player between the fingers of the player's other hand.


In [20]:
just_text = patents[1] 
just_text.head()
just_text.to_csv(data_path+'just_text.csv', index = False)

In [21]:
just_text = pd.read_csv(data_path + 'just_text.csv')
print(just_text.size)

4966215


In [22]:
just_text.head()

Unnamed: 0,1
0,Golf glove. A golf glove is disclosed having a...
1,Crib leg lock. A lock for a height-adjustable ...
2,Bed safety side rail arrangement. A bed safety...
3,Assembly for use in recreational activities. T...
4,Method of fabricating a slipper. A novel slipp...


In [18]:
import smart_open

def read_corpus(fname, tokens_only=False):
    with smart_open.open(fname, encoding="iso-8859-1") as f:
        for i, line in tqdm(enumerate(f)):
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

train_corpus = list(read_corpus(data_path + "just_text.csv"))
#test_corpus = list(read_corpus(lee_test_file, tokens_only=True))

7018019it [10:10, 11486.84it/s]


In [19]:
import pickle

with open(data_path+'train_corpus.pkl', 'wb') as f:
    pickle.dump(train_corpus, f)

In [None]:
print(train_corpus[:25])

In [102]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=300, min_count=2, epochs=40, workers = 16, window = 10)
model.build_vocab(train_corpus)

2020-10-26 19:37:15,414 : INFO : collecting all words and their counts
2020-10-26 19:37:15,415 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-10-26 19:37:15,673 : INFO : PROGRESS: at example #10000, processed 1013175 words (3928429/s), 21667 word types, 10000 tags
2020-10-26 19:37:15,935 : INFO : PROGRESS: at example #20000, processed 2047658 words (3959154/s), 28922 word types, 20000 tags
2020-10-26 19:37:16,186 : INFO : PROGRESS: at example #30000, processed 3033079 words (3942710/s), 34401 word types, 30000 tags
2020-10-26 19:37:16,436 : INFO : PROGRESS: at example #40000, processed 4042397 words (4050182/s), 38901 word types, 40000 tags
2020-10-26 19:37:16,685 : INFO : PROGRESS: at example #50000, processed 5070054 words (4135548/s), 42468 word types, 50000 tags
2020-10-26 19:37:16,934 : INFO : PROGRESS: at example #60000, processed 6102593 words (4164736/s), 45719 word types, 60000 tags
2020-10-26 19:37:17,190 : INFO : PROGRESS: at example #70

In [103]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

2020-10-26 20:00:54,093 : INFO : training model with 16 workers on 245139 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=10
2020-10-26 20:00:55,149 : INFO : EPOCH 1 - PROGRESS: at 0.10% examples, 516704 words/s, in_qsize 32, out_qsize 0
2020-10-26 20:00:56,159 : INFO : EPOCH 1 - PROGRESS: at 0.20% examples, 550833 words/s, in_qsize 31, out_qsize 0
2020-10-26 20:00:57,167 : INFO : EPOCH 1 - PROGRESS: at 0.31% examples, 555024 words/s, in_qsize 32, out_qsize 0
2020-10-26 20:00:58,170 : INFO : EPOCH 1 - PROGRESS: at 0.42% examples, 571605 words/s, in_qsize 31, out_qsize 0
2020-10-26 20:00:59,177 : INFO : EPOCH 1 - PROGRESS: at 0.53% examples, 568970 words/s, in_qsize 31, out_qsize 0
2020-10-26 20:01:00,212 : INFO : EPOCH 1 - PROGRESS: at 0.64% examples, 570721 words/s, in_qsize 31, out_qsize 0
2020-10-26 20:01:01,219 : INFO : EPOCH 1 - PROGRESS: at 0.76% examples, 581727 words/s, in_qsize 31, out_qsize 0
2020-10-26 20:01:02,264 : INFO : EPOCH 1 - PROGRESS: at 

In [104]:
model.save("patents_d2v_large.model")

2020-10-27 04:55:28,199 : INFO : saving Doc2Vec object under patents_d2v_large.model, separately None
2020-10-27 04:55:28,206 : INFO : storing np array 'syn1neg' to patents_d2v_large.model.trainables.syn1neg.npy
2020-10-27 04:55:28,439 : INFO : storing np array 'vectors' to patents_d2v_large.model.wv.vectors.npy
2020-10-27 04:55:28,661 : INFO : storing np array 'vectors_docs' to patents_d2v_large.model.docvecs.vectors_docs.npy
2020-10-27 04:55:38,100 : INFO : saved patents_d2v_large.model


In [4]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=300, min_count=2, epochs=40, workers = 16, window = 10)
model = gensim.models.doc2vec.Doc2Vec.load("patents_d2v_large.model")

2020-11-02 01:39:50,728 : INFO : loading Doc2Vec object from patents_d2v_large.model
2020-11-02 01:39:51,376 : INFO : loading vocabulary recursively from patents_d2v_large.model.vocabulary.* with mmap=None
2020-11-02 01:39:51,377 : INFO : loading trainables recursively from patents_d2v_large.model.trainables.* with mmap=None
2020-11-02 01:39:51,377 : INFO : loading syn1neg from patents_d2v_large.model.trainables.syn1neg.npy with mmap=None
2020-11-02 01:39:51,609 : INFO : loading wv recursively from patents_d2v_large.model.wv.* with mmap=None
2020-11-02 01:39:51,611 : INFO : loading vectors from patents_d2v_large.model.wv.vectors.npy with mmap=None
2020-11-02 01:39:51,812 : INFO : loading docvecs recursively from patents_d2v_large.model.docvecs.* with mmap=None
2020-11-02 01:39:51,812 : INFO : loading vectors_docs from patents_d2v_large.model.docvecs.vectors_docs.npy with mmap=None
2020-11-02 01:39:55,781 : INFO : loaded patents_d2v_large.model


In [110]:
import numpy.linalg as la

'''Function to compare the document vectors of two input strings. 
   This will return the cosine similarity metric and the ratio of
   the first magnitude to the second in a tuple'''
def vec_compare(s1, s2):
    doc = [s1, s2]
    str_vec = [doc[i].split(" ") for i in range(len(doc))]
    vector = [model.infer_vector(i) for i in str_vec]
    normed_vec = [i/la.norm(i) for i in vector]

    return(cos_sim(vector[0], vector[1]), la.norm(vector[0])/ la.norm(vector[1]))
    #print(vector)
    
def cos_sim(v1, v2):
    v1 = v1.copy()
    v2 = v2.copy()
    v1 /= la.norm(v1)
    v2 /= la.norm(v2)
    return (np.dot(v1, v2))

In [114]:
import numpy.linalg as la

p1 = "A bed safety side rail arrangement which includes two opposite articulated side rail assemblies selectively pivotally movable from an upper safety position to a lowered position, as for patient transfer and handling and/or bedmaking tasks."
       #"according to all known laws of aviation, there is no way a bee should be able to fly. Its wings are too small to get"
        #+ "its fat little body off the ground."
p2 = "The upper edge of the bracket is slotted to receive a latch pivotally connected to a crib corner post"
      
compr = vec_compare(p1, p2)
print("Similarity: " + str(compr[0]), "Magnitude Ratio: " + str(compr[1]))

Similarity: 0.23298876 Magnitude Ratio: 1.7229836


In [None]:
ranks = []
second_ranks = []
for doc_id in tqdm(range(len(train_corpus))):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

  0%|                                                                                                                                                                      | 0/7018019 [00:00<?, ?it/s]2020-10-27 05:17:59,528 : INFO : precomputing L2-norms of doc weight vectors
  0%|                                                                                                                                                                      | 0/7018019 [00:12<?, ?it/s]

In [None]:
import collections

counter = collections.Counter(ranks)
print(counter)

In [23]:
import pickle

train_corpus = pickle.load( open( data_path+'train_corpus.pkl', "rb" ) )
print(type(train_corpus))

<class 'list'>


In [82]:
indexes = np.load("indexes.npy")
def get_train_from_indexes_vec(idx):
    return ' '.join(train_corpus[indexes[idx]][0])
    
print(indexes[82])
print(len(train_corpus))
pat1 = ' '.join(train_corpus[indexes[82]][0])
pat2 = ' '.join(train_corpus[indexes[367]][0])
compr = vec_compare(pat1, pat2)
print("Similarity: " + str(compr[0]), "Magnitude Ratio: " + str(compr[1]))

print(' '.join(train_corpus[indexes[82]][0]))
print('\n')
print(' '.join(train_corpus[indexes[367]][0]))

6261761
7018019
Similarity: 0.18482207 Magnitude Ratio: 0.9125746
light emitting device light emitting device includes leadframe light emitting unit transparent encapsulant and fluorescent colloid layer the light emitting unit is disposed on the leadframe the transparent encapsulant covers the light emitting unit wherein the transparent encapsulant has concave on which at least one reflective surface is disposed the fluorescent colloid layer is disposed outside the transparent encapsulant wherein chamber is formed between the fluorescent colloid layer and the transparent encapsulant the light generated by the light emitting unit is reflected by the reflective surface and guided to side wall of the fluorescent colloid layer


dielectric resonator dielectric resonator comprises shield electrode defining resonant space and cylindrical dielectric resonator element disposed and supported fixedly in the resonant space to which an input and output are coupled into the hollow portion of the di

In [106]:
def find_close(tol, self_idx = 0):
    i = 0
    max_sim = 0
    max_i = 0
    for i in tqdm(range(50000)):
        if i == self_idx:
            continue
            
        compr =  vec_compare(' '.join(train_corpus[indexes[self_idx]][0]), ' '.join(train_corpus[indexes[i]][0]))[0]
        if compr > tol:
            return (i)
        if compr > max_sim:
            max_sim = compr
            max_i = i
            
        if compr > .3:
            print(compr)
            print(i)
        i+=1
    print("Did not meet threshold, maximum sim found: " + str(max_sim) + "at index: " + str(max_i))
    return max_i

In [107]:
close_idx = find_close(.8, self_idx = 27458)
print(close_idx)

  7%|█████▎                                                                       | 3434/50000 [00:54<11:50, 65.57it/s]

0.30146784
3425


 18%|█████████████▋                                                               | 8850/50000 [02:19<11:04, 61.90it/s]

0.35522297
8841


 18%|█████████████▋                                                               | 8878/50000 [02:20<10:40, 64.21it/s]

0.31497282
8865


 18%|██████████████                                                               | 9168/50000 [02:24<10:33, 64.49it/s]

0.31636614
9159


 22%|████████████████▋                                                           | 10965/50000 [02:53<09:54, 65.67it/s]

0.32836926
10954


 28%|████████████████████▉                                                       | 13812/50000 [03:38<09:43, 62.02it/s]

0.30048054
13808


 30%|██████████████████████▌                                                     | 14807/50000 [03:54<09:13, 63.56it/s]

0.34412456
14800


 43%|████████████████████████████████▌                                           | 21453/50000 [05:39<07:31, 63.22it/s]


KeyboardInterrupt: 

In [None]:
print(get_train_from_indexes_vec(27458))
print(get_train_from_indexes_vec(3425))

In [101]:
print(get_train_from_indexes_vec(close_idx))

stepwise concentrations adding an adapter other than said adapters to


In [120]:
all_vecs = np.load("patents_d2v_large.model.docvecs.vectors_docs.npy")
print(all_vecs.shape)
#print(all_vecs[0])

(7018019, 300)


In [117]:
def find_close_off_RAM(tol, self_idx = 0):
    i = 0
    max_sim = 0
    max_i = 0
    for i in tqdm(range(indexes.shape[0])):
        if i == self_idx:
            continue
            
        compr = cos_sim(all_vecs[indexes[self_idx]], all_vecs[indexes[i]])
        if compr > tol:
            return (i)
        if compr > max_sim:
            max_sim = compr
            max_i = i
            
        if compr > .3:
            print(compr)
            print(i)
        i+=1
    print("Did not meet threshold, maximum sim found: " + str(max_sim) + "at index: " + str(max_i))
    return max_i

In [118]:
close_vec = find_close_off_RAM(.8, 44)

100%|█████████████████████████████████████████████████████████████████████████| 50000/50000 [00:00<00:00, 55126.92it/s]

Did not meet threshold, maximum sim found: 0.24288322at index: 36384





In [141]:
vector = model.infer_vector(train_corpus[indexes[44]][0])
vec2 = model.infer_vector(train_corpus[indexes[40]][0])
print(all_vecs[indexes[40]])
print(vec2)
print(cos_sim(all_vecs[indexes[40]], vector))

[-2.3571055e-04  5.8255892e-04 -1.5801504e-03 -2.4654405e-04
  5.5769377e-04 -4.1453983e-04  6.7296188e-04  1.4946079e-04
 -3.4556090e-04  1.5992140e-03  5.3310383e-04  1.0136786e-03
 -1.0314050e-04 -3.1466261e-05  7.5737160e-04 -5.5627280e-04
 -1.1700977e-03  7.0290617e-04 -4.6567959e-04 -6.3623168e-04
 -1.5493665e-03  1.6339640e-03 -4.9109472e-04  5.6337216e-04
 -7.0038217e-04 -3.1151288e-04  6.9158658e-04  5.7933497e-04
 -1.6066076e-03 -1.4610352e-03  1.5473214e-03 -9.5585396e-04
 -1.1478139e-03  7.7993271e-04  4.6342015e-04  1.9849020e-04
  6.7649246e-04  1.6030745e-03  8.6329720e-04 -7.2920567e-04
 -1.3239853e-04  9.1131317e-04  1.1159446e-03  1.2949927e-03
 -1.3508885e-03 -4.2738300e-04 -8.5351162e-04 -9.9985907e-04
 -1.3378721e-03  1.4273233e-03 -1.2504666e-03  2.3554176e-06
 -5.5157853e-04 -1.4101882e-03 -1.5331325e-03  6.7049521e-04
  3.3059160e-04 -2.4177814e-04  1.4971307e-03 -6.0015940e-04
 -5.4116208e-05  1.4640335e-03 -1.1006839e-03 -1.0826100e-03
  2.1722673e-04 -1.12980

In [130]:
train_corpus[indexes[44]][0]

[]