In [1]:
import logging
import gensim
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
data_path = "D:MachineLearning/PURE_patents/"
os.listdir(data_path)

['corpus.pkl', 'just_text.csv', 'patent_raw.csv']

In [2]:
patents = pd.read_csv(data_path + 'patent_raw.csv', header = None)
patents.head(10)

Unnamed: 0,0,1
0,3930271,Golf glove. A golf glove is disclosed having a...
1,3930272,Crib leg lock. A lock for a height-adjustable ...
2,3930273,Bed safety side rail arrangement. A bed safety...
3,3930274,Assembly for use in recreational activities. T...
4,3930275,Method of fabricating a slipper. A novel slipp...
5,3930276,Wheel spinning and vehicle conveying apparatus...
6,3930277,Mobile floor sweeper. A Mobile Floor Sweeper i...
7,3930278,Paintbrush and guard attachment for edging. A ...
8,3930279,Rubber windshield wiper blades having increase...
9,3930280,Bottle insert for product container. Leaks are...


In [3]:
print('First Patent:\n' + patents.iloc[0][1])

First Patent:
Golf glove. A golf glove is disclosed having an extra finger pocket between the index and middle finger pockets for securing one finger of one hand of a golf player between the fingers of the player's other hand.


In [4]:
just_text = patents[1] 
just_text.head()
just_text.to_csv(data_path+'just_text.csv', index = False)

In [5]:
just_text = pd.read_csv(data_path + 'just_text.csv')
print(just_text.size)

4966215


In [9]:
just_text.head()

Unnamed: 0,1
0,Golf glove. A golf glove is disclosed having a...
1,Crib leg lock. A lock for a height-adjustable ...
2,Bed safety side rail arrangement. A bed safety...
3,Assembly for use in recreational activities. T...
4,Method of fabricating a slipper. A novel slipp...


In [18]:
import smart_open

def read_corpus(fname, tokens_only=False):
    with smart_open.open(fname, encoding="iso-8859-1") as f:
        for i, line in tqdm(enumerate(f)):
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

train_corpus = list(read_corpus(data_path + "just_text.csv"))
#test_corpus = list(read_corpus(lee_test_file, tokens_only=True))

7018019it [10:10, 11486.84it/s]


In [19]:
import pickle

with open(data_path+'train_corpus.pkl', 'wb') as f:
    pickle.dump(train_corpus, f)

In [None]:
print(train_corpus[:25])

In [30]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=300, min_count=2, epochs=40, workers = 16, window = 10)
model.build_vocab(train_corpus)

2020-10-25 23:08:36,617 : INFO : PROGRESS: at example #6650000, processed 523507297 words (6374307/s), 379493 word types, 6650000 tags
2020-10-25 23:08:36,778 : INFO : PROGRESS: at example #6660000, processed 524471659 words (6042110/s), 379779 word types, 6660000 tags
2020-10-25 23:08:36,944 : INFO : PROGRESS: at example #6670000, processed 525512163 words (6307575/s), 380057 word types, 6670000 tags
2020-10-25 23:08:37,103 : INFO : PROGRESS: at example #6680000, processed 526487437 words (6167592/s), 380377 word types, 6680000 tags
2020-10-25 23:08:37,264 : INFO : PROGRESS: at example #6690000, processed 527496421 words (6275677/s), 380674 word types, 6690000 tags
2020-10-25 23:08:37,422 : INFO : PROGRESS: at example #6700000, processed 528494651 words (6362347/s), 380986 word types, 6700000 tags
2020-10-25 23:08:37,580 : INFO : PROGRESS: at example #6710000, processed 529488234 words (6332426/s), 381317 word types, 6710000 tags
2020-10-25 23:08:37,740 : INFO : PROGRESS: at example #

In [None]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [67]:
model.save("patents_d2v.model")

2020-10-26 12:32:21,380 : INFO : saving Doc2Vec object under patents_d2v.model, separately None
2020-10-26 12:32:21,383 : INFO : storing np array 'syn1neg' to patents_d2v.model.trainables.syn1neg.npy
2020-10-26 12:32:21,415 : INFO : storing np array 'vectors' to patents_d2v.model.wv.vectors.npy
2020-10-26 12:32:21,451 : INFO : storing np array 'vectors_docs' to patents_d2v.model.docvecs.vectors_docs.npy
2020-10-26 12:32:22,938 : INFO : saved patents_d2v.model


In [101]:
import numpy.linalg as la

doc = ["Circular knitting machine. To permit selective needle projection, under control of a patterning arrangement, the needle jacks are made for rocking movement about an axis transverse to the direction of needle projection during knitting, and the camming system for the needle jacks is formed with two tracks, located one above the other (by a deeper cut, or camming elements of different heights) or, longitudinally staggered, one adjacent to the other, the cam tracks being arranged for selected projection of the needles to tuck or knit position.",
       #"A method and apparatus scedhule uplink transmissions with reduced latency."
       #"according to all known laws of aviation, there is no way a bee should be able to fly. Its wings are too small to get"
        #+ "its fat little body off the ground."
       "Self balancing table. In a circular knitting machine, a self-balancing, circular bobbin table which has formed therein a circular groove within which a plurality of balls freely roll to dynamically balance the table as it is rotated during the knitting operations."
      ]
str_vec = [doc[i].split(" ") for i in range(len(doc))]
vector = [model.infer_vector(i) for i in str_vec]
normed_vec = [i/la.norm(i) for i in vector]

print(np.dot(normed_vec[0], normed_vec[1]))
print(vector)

0.252743
[array([-0.28087735,  0.38621008,  0.8508384 ,  1.038248  ,  0.07309564,
        1.1977931 , -0.5088015 , -1.8267745 ,  0.13977815, -1.8833544 ,
       -0.39452165, -0.62033504,  0.24143456, -1.688273  , -1.0880688 ,
       -0.10534336, -2.47248   ,  0.14199567,  0.16780388,  0.04640283,
       -1.1132685 , -2.8003633 , -0.31105733, -0.33777684, -2.2041044 ,
        1.3713781 ,  0.2356057 , -0.2390185 , -1.7446188 , -1.2072209 ,
       -2.6661847 , -0.13144748,  0.14559008,  0.8166889 , -1.8442206 ,
       -0.94594324, -1.9050679 , -0.09199888,  1.3757935 ,  0.5763035 ,
        1.9986854 , -0.7205064 , -2.7098856 , -3.057     , -0.3758834 ,
        1.6166793 ,  0.7956759 ,  0.496455  ,  0.88827294, -0.35504362],
      dtype=float32), array([ 0.7498945 , -0.2488806 , -0.04200224, -2.1140752 , -0.5353309 ,
        0.18104102,  0.03363936,  1.0553697 ,  1.4649472 , -0.9062824 ,
        0.60248774,  0.8979739 , -0.18848997, -1.6236618 , -0.22950743,
        0.6535997 ,  0.2749011 

In [94]:
import numpy.linalg as la

doc = ["Mary had a little lamb", "A little lamb had Mary"]
str_vec = [doc[i].split(" ") for i in range(len(doc))]
#str_vec = train_corpus[:2]
print(str_vec)
vector = [model.infer_vector(i) for i in str_vec]
normed_vec = [i/la.norm(i) for i in vector]

print(np.dot(normed_vec[0], normed_vec[1]))
print(vector)

[['Mary', 'had', 'a', 'little', 'lamb'], ['A', 'little', 'lamb', 'had', 'Mary']]
0.95729846
[array([-0.44410354, -0.13347703, -0.04725597,  0.1758879 , -0.3121682 ,
       -0.18816602,  0.4953201 , -0.00496292, -0.29904738,  0.1341791 ,
        0.64246714,  0.29677597,  0.03050934, -0.2745325 ,  0.16800937,
        0.27837908,  0.6460266 , -0.64909667,  0.00759146, -0.80458236,
       -0.92419046, -0.07583129, -0.0360688 ,  0.11015264,  0.59309864,
        0.27324286,  0.8246222 ,  0.19211707,  0.26320645,  0.21116605,
       -0.16574436, -0.53247976, -0.71851504, -0.25713804, -0.42713666,
        0.15142445,  0.18009858, -0.08587842,  0.48084018,  0.27250993,
       -0.15549062,  0.03779576, -0.3705872 , -0.2828869 , -0.06823739,
        0.11681806,  0.22616307,  0.3048242 ,  0.06752926,  0.05729926],
      dtype=float32), array([-0.27283373, -0.1134159 , -0.08214434,  0.12623751, -0.5206901 ,
       -0.11811545,  0.519865  ,  0.06701396, -0.2987128 ,  0.20219547,
        0.5933177 , 

In [93]:
ranks = []
second_ranks = []
for doc_id in tqdm(range(len(train_corpus))):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

  0%|                                                                                                                                                                      | 0/7018019 [00:04<?, ?it/s]


KeyboardInterrupt: 

In [None]:
import collections

counter = collections.Counter(ranks)
print(counter)

0
