<a href="https://colab.research.google.com/github/SDS-AAU/UNISTRA-DS-2022/blob/master/static/workshops/2021/UNISTRAw2_sci_pat_link.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!unzip /content/sci_pat_match.zip

Archive:  /content/sci_pat_match.zip
  inflating: patent_nlp_match.csv    
  inflating: __MACOSX/._patent_nlp_match.csv  
  inflating: scopus_neuron.csv       
  inflating: __MACOSX/._scopus_neuron.csv  


In [2]:
!pip install umap-learn -q
!pip install -q hdbscan

[K     |████████████████████████████████| 86 kB 2.6 MB/s 
[K     |████████████████████████████████| 1.1 MB 29.5 MB/s 
[?25h  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Building wheel for pynndescent (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 5.2 MB 4.3 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
  Building wheel for hdbscan (PEP 517) ... [?25l[?25hdone


In [3]:
import pandas as pd
import numpy as np

import umap
import hdbscan

import itertools
import random

from collections import Counter

# progress bar
import tqdm

#spacy instantiating English module
import spacy
nlp = spacy.load('en')

In [4]:
data_sci = pd.read_csv('/content/scopus_neuron.csv')
data_pat = pd.read_csv('/content/patent_nlp_match.csv')

In [5]:
data_sci['text'] = data_sci['Title'].str.cat(data_sci['Abstract'].astype(str), sep=' ')

In [6]:
# run progress bare and clean up using spacy but without some heavy parts of the pipeline

%%time
clean_sci = []


pbar = tqdm.tqdm(total=len(data_sci['text']),position=0, leave=True)

for text in nlp.pipe(data_sci['text'], disable=["tagger", "parser", "ner"]):

  txt = [token.lemma_.lower() for token in text 
         if token.is_alpha 
         and not token.is_stop 
         and not token.is_punct]

  clean_sci.append(txt)

  pbar.update(1)

 99%|█████████▉| 1609/1629 [00:03<00:00, 641.58it/s]

CPU times: user 3.3 s, sys: 48 ms, total: 3.35 s
Wall time: 3.65 s


In [7]:
data_sci['text_cl'] = clean_sci

In [9]:
# run progress bare and clean up using spacy but without some heavy parts of the pipeline

%%time
clean_pat = []


pbar = tqdm.tqdm(total=len(data_pat['text']),position=0, leave=True)

for text in nlp.pipe(data_pat['text'], disable=["tagger", "parser", "ner"]):

  txt = [token.lemma_.lower() for token in text 
         if token.is_alpha 
         and not token.is_stop 
         and not token.is_punct]

  clean_pat.append(txt)

  pbar.update(1)

100%|██████████| 1629/1629 [02:38<00:00, 10.28it/s] 
 99%|█████████▉| 4953/5002 [00:12<00:00, 520.77it/s]

CPU times: user 11.7 s, sys: 137 ms, total: 11.9 s
Wall time: 12.3 s


In [10]:
data_pat['text_cl'] = clean_pat

In [11]:
all_sent = clean_pat + clean_sci

In [12]:
# update gensim
!pip install --upgrade gensim -q

[K     |████████████████████████████████| 24.1 MB 5.6 MB/s 
[?25h

In [13]:
# get tooling for Word2Vec model
from gensim.models import Word2Vec

In [14]:
# enable logging
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [15]:
# train word2vec model
w2v_model = Word2Vec(sentences=all_sent, vector_size=300, window=5, min_count=2, workers=2, epochs=5)

2022-03-03 08:53:46,418 : INFO : collecting all words and their counts
2022-03-03 08:53:46,422 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-03-03 08:53:46,587 : INFO : collected 17960 word types from a corpus of 675841 raw words and 6631 sentences
2022-03-03 08:53:46,589 : INFO : Creating a fresh vocabulary
2022-03-03 08:53:46,665 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 retains 12480 unique words (69.48775055679288%% of original 17960, drops 5480)', 'datetime': '2022-03-03T08:53:46.664997', 'gensim': '4.1.2', 'python': '3.7.12 (default, Jan 15 2022, 18:48:18) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.144+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'prepare_vocab'}
2022-03-03 08:53:46,667 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 leaves 670361 word corpus (99.18915839672349%% of original 675841, drops 5480)', 'datetime': '2022-03-03T08:53:46.666973', 'gensim': '4.1.2', 'python': '3.7.12 (default, Jan 15 2022

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
# function that does absolutely nothing...
# to be able to use TfidfVectorizer on already tokenized text
def dummy_fun(doc):
    return doc

In [23]:
# we turn of any preprocessing and align vocabulary with the one
# used by our embeddings
# that will allow us to use TFIDF vectors to weight the embeddings

tfidf_sci = TfidfVectorizer(vocabulary=w2v_model.wv.key_to_index.keys(),
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None)  

In [24]:
# create TFIDF matrix (we could also just use that one for search)
sci_tfidf = tfidf_sci.fit_transform(data_sci['text_cl'])

In [25]:
# for the whole matrix

sci_w2v_tfidf = sci_tfidf @ w2v_model.wv.vectors

In [27]:
sci_w2v_tfidf.shape

(1629, 300)

In [None]:
umap_reducer_sci = umap.UMAP(random_state=42, n_components=2)
embeddings_sci = umap_reducer_sci.fit_transform(sci_w2v_tfidf)

In [29]:
embeddings_sci

array([[6.946269 , 4.632085 ],
       [6.2202373, 7.006687 ],
       [6.143842 , 3.969535 ],
       ...,
       [2.011928 , 3.8625894],
       [1.7433486, 5.774202 ],
       [1.5920503, 3.8336015]], dtype=float32)

In [30]:
clusterer_sci = hdbscan.HDBSCAN(min_cluster_size=15, min_samples=5)
clusterer_sci.fit(embeddings_sci)
data_sci['cluster'] = clusterer_sci.labels_

In [31]:
data_sci['cluster'].unique()

array([-1, 17,  5, 12,  6, 13, 21, 18, 23, 10, 22,  7,  0, 15,  8,  2,  4,
       14,  3, 16, 19, 11,  9, 20,  1])

In [32]:
import altair as alt

In [33]:
df_plot = pd.DataFrame(embeddings_sci, columns=['x','y'])

In [35]:
df_plot['Title'] = data_sci['Title']
df_plot['doctype'] = data_sci['Document Type']
df_plot['abstract'] = data_sci['Abstract']
df_plot['year'] = data_sci['Year']
df_plot['cluster'] = clusterer_sci.labels_

In [86]:
df_plot = df_plot[df_plot['cluster']!= -1]

In [87]:
alt.Chart(df_plot).mark_circle(size=60).encode(
    x='x',
    y='y',
    color=alt.Color('cluster', scale=alt.Scale(scheme='category20')),
    tooltip=['Title', 'abstract', 'cluster','year']
).properties(
    width=800,
    height=600
).interactive()

In [68]:
# we turn of any preprocessing and align vocabulary with the one
# used by our embeddings
# that will allow us to use TFIDF vectors to weight the embeddings

tfidf_pat = TfidfVectorizer(vocabulary=w2v_model.wv.key_to_index.keys(),
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None)  

In [69]:
# create TFIDF matrix (we could also just use that one for search)
pat_tfidf = tfidf_pat.fit_transform(data_pat['text_cl'])

In [70]:
# for the whole matrix

pat_w2v_tfidf = pat_tfidf @ w2v_model.wv.vectors

In [71]:
umap_reducer_pat = umap.UMAP(random_state=42, n_components=2)
embeddings_pat = umap_reducer_pat.fit_transform(pat_w2v_tfidf)

In [72]:
clusterer_pat = hdbscan.HDBSCAN(min_cluster_size=8, min_samples=5)
clusterer_pat.fit(embeddings_pat)
data_pat['cluster'] = clusterer_pat.labels_

In [73]:
data_pat['cluster'].unique()

array([ 58,  -1,  61,  51,  49,  66,  59,  57,  64,  65,  20,  17,  63,
        55,  48,  62,  21,  98,   5,  23,  78,  47,  27,  86,  26,  96,
        90, 132,  99,  28,  24, 103,  33,  37,  38,  95,  94,   4,  16,
        91,  84,  35, 105, 120,  97,  46,  75, 115,  70, 125,  45,  10,
        13,  12,  15,  31,   6,  42,  83,  43,  22,  44,  32,  36,  34,
        68,  52,   8,  67,  25,  87, 112,  85,   7,  80,  93, 111,  89,
         1,  30, 133, 114,  53,  18, 129,  69,  19,   0, 102, 116, 101,
       135,  82, 134, 130,  54,  71,  77, 127, 117, 131, 119, 123, 124,
        81,  92, 122, 106, 128,  88, 109, 121, 113,   9,  73, 126, 110,
       104,  41, 107, 108,  29,   2, 118,  50,  74,  56,  14,   3,  79,
        72, 100,  11,  76,  40,  39,  60])

In [78]:
df_plot_p = pd.DataFrame(embeddings_pat, columns=['x','y'])

In [79]:
df_plot_p['title'] = data_pat['title']
df_plot_p['abstract'] = data_pat['abstract']
df_plot_p['section_id'] = data_pat['section_id']
df_plot_p['subsection_id'] = data_pat['subsection_id']
df_plot_p['cluster'] = clusterer_pat.labels_

In [80]:
df_plot_p = df_plot_p[df_plot_p['cluster']!= -1]

In [81]:
df_plot_p = df_plot_p.sample(1500)

In [158]:
#df_plot_p = df_plot_p[df_plot_p.cluster.isin([43,13])]

In [159]:
alt.Chart(df_plot_p).mark_circle(size=60).encode(
    x='x',
    y='y',
    color=alt.Color('cluster', scale=alt.Scale(scheme='category20')),
    tooltip=['title', 'abstract', 'section_id','subsection_id', 'cluster']
).properties(
    width=800,
    height=600
).interactive()

In [None]:
w2v_dict = list(w2v_model.wv.key_to_index.keys())

In [112]:
tf_ix = data_sci[data_sci['cluster'] == 9].index
tf_w_ix = np.flip(np.argsort(np.mean(sci_tfidf[tf_ix], axis=0))).tolist()[0][:100]
topic_kws = [w2v_dict[i] for i in tf_w_ix]

In [104]:
tf_ix = data_pat[data_pat['cluster'] == 31].index
tf_w_ix = np.flip(np.argsort(np.mean(pat_tfidf[tf_ix], axis=0))).tolist()[0][:100]
topic_kws = [w2v_dict[i] for i in tf_w_ix]

In [107]:
# slightly more complex function that includes preprocessing with Spacy
# TFIDF transformation and embeddings

def get_tfidf_vector(word2vec_model, model_tfidf, query):
  
    if len(query) >= 1:
      words = model_tfidf.transform([query])
      return words @ word2vec_model.wv.vectors
    else:
        return []

In [106]:
!pip install -q annoy

[?25l[K     |▌                               | 10 kB 11.5 MB/s eta 0:00:01[K     |█                               | 20 kB 14.5 MB/s eta 0:00:01[K     |█▌                              | 30 kB 12.5 MB/s eta 0:00:01[K     |██                              | 40 kB 8.9 MB/s eta 0:00:01[K     |██▌                             | 51 kB 3.8 MB/s eta 0:00:01[K     |███                             | 61 kB 4.4 MB/s eta 0:00:01[K     |███▌                            | 71 kB 4.7 MB/s eta 0:00:01[K     |████                            | 81 kB 4.6 MB/s eta 0:00:01[K     |████▋                           | 92 kB 5.1 MB/s eta 0:00:01[K     |█████                           | 102 kB 4.5 MB/s eta 0:00:01[K     |█████▋                          | 112 kB 4.5 MB/s eta 0:00:01[K     |██████                          | 122 kB 4.5 MB/s eta 0:00:01[K     |██████▋                         | 133 kB 4.5 MB/s eta 0:00:01[K     |███████                         | 143 kB 4.5 MB/s eta 0:00:01[K  

In [108]:
from annoy import AnnoyIndex

# instatiate a search tree (with shape n/300)
t = AnnoyIndex(pat_w2v_tfidf.shape[1], 'angular') 

In [109]:
# we will build that on disk (can reuse later if we store it somwhere)

t.on_disk_build('patents_search_tree.annoy')

True

In [110]:
# now we add all our vectors - line by line to the tree
# along with an index (here i - running index)
for i in tqdm.tqdm(range(pat_w2v_tfidf.shape[0]),position=0, leave=True):
    t.add_item(i, pat_w2v_tfidf[i])

100%|██████████| 5002/5002 [00:00<00:00, 8732.76it/s]


In [111]:
# now we build the search tree (that creates partitions within the data-a bit like clustering)
# thereafter search will be performed within the nearest partitions (that reduces search time A LOT)
t.build(50, n_jobs=-1)

True

In [114]:
v = get_tfidf_vector(w2v_model,tfidf_sci,['circuit',
 'circadian',
 'neuron',
 'sleep',
 'cell',
 'remodel',
 'neural',
 'progenitor'])

In [116]:
r = t.get_nns_by_vector(v[0], n=10, include_distances=True)

In [118]:
r

([416, 2463, 2500, 4782, 317, 4264, 2465, 275, 3017, 4696],
 [0.5064176917076111,
  0.5072077512741089,
  0.5102561712265015,
  0.5119566321372986,
  0.5186595320701599,
  0.5299279093742371,
  0.5476865172386169,
  0.548832893371582,
  0.5672420859336853,
  0.5680707097053528])

In [129]:
data_pat.loc[r[0]]['title']

4696    Approximate functional matching in electronic ...
317     Methods to enhance T-cell mediated immune resp...
4782    Automatic selection of lead configuration for ...
4264         Electrochemical molecular recognition probes
2367                Cysteine variants of interferon gamma
3017    Methods for producing a non human model for ao...
2463    Methods for eliminating at least a substantial...
2500    Molecules with effects on cellular development...
416     Compositions and methods for the treatment of ...
4830    Method and apparatus for detecting space-time ...
Name: title, dtype: object

In [123]:
topic_search_strings = [random.sample(topic_kws, 25) for _ in range(10)]

In [125]:
topic_search_vecs = [get_tfidf_vector(w2v_model,tfidf_sci, l) for l in topic_search_strings]

In [126]:
search_results_ix = []
search_results_dist = []

for v in topic_search_vecs:
  r = t.get_nns_by_vector(v[0], n=10, include_distances=True)
  search_results_ix.extend(r[0])
  search_results_dist.extend(r[1])

In [128]:
pd.DataFrame(zip(search_results_ix,search_results_dist))

Unnamed: 0,0,1
0,4696,0.401806
1,4782,0.423270
2,317,0.433664
3,275,0.443718
4,416,0.444183
...,...,...
95,3017,0.472337
96,2463,0.478844
97,2500,0.484417
98,416,0.488867


In [133]:
search_results_ix = []
search_results_dist = []
search_results_clusternr = []


for j in data_sci['cluster'].unique():
  tf_ix = data_sci[data_sci['cluster'] == j].index
  tf_w_ix = np.flip(np.argsort(np.mean(sci_tfidf[tf_ix], axis=0))).tolist()[0][:100]
  topic_kws = [w2v_dict[i] for i in tf_w_ix]
  
  topic_search_strings = [random.sample(topic_kws, 25) for _ in range(10)]
  topic_search_vecs = [get_tfidf_vector(w2v_model,tfidf_sci, l) for l in topic_search_strings]

  for v in topic_search_vecs:
    r = t.get_nns_by_vector(v[0], n=10, include_distances=True)
    search_results_ix.extend(r[0])
    search_results_dist.extend(r[1])
    search_results_clusternr.extend(len(r[1])*[j])
  


In [138]:
s_t_link_df = pd.DataFrame(zip(search_results_ix,search_results_dist,search_results_clusternr), columns=['pat_id','cos_distance','sci_cluster'])

In [141]:
s_t_link_df.drop_duplicates(subset=['pat_id','sci_cluster'], inplace=True)

In [143]:
s_t_link_df = s_t_link_df[s_t_link_df.sci_cluster != -1]

In [146]:
s_t_link_df.groupby('sci_cluster').cos_distance.mean().sort_values()

sci_cluster
16    0.311112
17    0.362655
7     0.362742
0     0.371039
18    0.382606
3     0.391652
23    0.396331
13    0.397485
15    0.402209
21    0.413689
1     0.419554
14    0.427418
8     0.430849
6     0.434500
19    0.440671
5     0.450633
9     0.451295
11    0.456745
20    0.464432
12    0.468907
4     0.485935
10    0.492660
2     0.504170
22    0.509843
Name: cos_distance, dtype: float64

In [152]:
pat_ix_select = list(s_t_link_df[s_t_link_df.sci_cluster == 16].pat_id)

In [157]:
data_pat.loc[pat_ix_select].cluster.value_counts()

-1     26
 43     6
 13     6
 42     4
 22     3
 12     1
 36     1
Name: cluster, dtype: int64

In [160]:
data_pat

Unnamed: 0,patent_id,claim_len,section_id,subsection_id,group_id,subgroup_id,text,number,country,date,abstract,title,kind,num_claims,uuid,text_cl,cluster
0,8688141,257,H,H04,H04L,H04L51/20,A method comprising: receiving one or more net...,8688141,US,4/1/2014,"In certain embodiments, a method for proximity...",System and method for providing communication ...,B2,26,0bcf7529-ebc1-11ea-a344-121df0c29c1e,"[method, comprise, receive, network, identifie...",58
1,8688140,231,"G, G","G01, G01","G01S, G01S","G01S3/48, G01S3/043",A method for locating a radio frequency tag us...,8688140,US,4/1/2014,Determination of the location and bearing of a...,Radio frequency tag location system and method,B2,28,03637655-ebba-11ea-a344-121df0c29c1e,"[method, locate, radio, frequency, tag, spatia...",-1
2,8688138,147,G,G01,G01S,G01S5/0257,A method to determine a position of a mobile d...,8688138,US,4/1/2014,Methods and apparatuses for location determina...,Method and apparatus for location determinatio...,B2,26,1f40508d-ebd2-11ea-a344-121df0c29c1e,"[method, determine, position, mobile, device, ...",61
3,8688137,228,"H, H, H","H04, H04, H04","H04L, H04W, H04L","H04L5/0064, H04W72/085, H04L5/0007",A mobile communication system in which a mobil...,8688137,US,4/1/2014,A mobile communication system in which a mobil...,Communication apparatus and communication method,B2,15,0fe3badb-ebb9-11ea-a344-121df0c29c1e,"[mobile, communication, system, mobile, statio...",51
4,8688135,204,"H, H, H","H04, H04, H04","H04W, H04L, H04L","H04W72/082, H04L5/0062, H04L5/001",A method for implementing a carrier aggregatio...,8688135,US,4/1/2014,The present invention discloses a method and b...,Method and base station for implementing carri...,B2,19,0ccad016-ebd1-11ea-a344-121df0c29c1e,"[method, implement, carrier, aggregation, comp...",49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4997,8450885,144,"H, H","H02, H02","H02K, H02K","H02K9/22, H02K41/031","A coolant-cooled linear motor, comprising: an ...",8450885,US,5/28/2013,In a coolant-cooled linear motor includes an a...,Coolant-cooled linear motor,B2,20,0e9d30ca-ebb8-11ea-a344-121df0c29c1e,"[coolant, cool, linear, motor, comprise, armat...",-1
4998,8450884,251,"Y, F, F, H, Y, F, H","Y02, F02, F01, H02, Y10, F01, H02","Y02B, F02C, F01K, H02K, Y10T, F01B, H02P","Y02B10/30, F02C6/16, F01K13/02, H02K7/1815, Y1...",An apparatus comprising: a first reversible ga...,8450884,US,5/28/2013,A compressed-air energy storage system accordi...,Compressed air energy storage system utilizing...,B2,38,0bb934e8-ebcf-11ea-a344-121df0c29c1e,"[apparatus, comprise, reversible, gas, compres...",16
4999,8450882,326,B,B60,B60Q,B60Q1/00,An energization control apparatus comprising: ...,8450882,US,5/28/2013,An energization control apparatus includes a c...,Energization control apparatus,B2,2,30791ba2-ebbf-11ea-a344-121df0c29c1e,"[energization, control, apparatus, comprise, c...",4
5000,8450881,252,"Y, H, H, Y","Y10, H01, H01, Y10","Y10T, H01H, H01H, Y10T","Y10T307/74, H01H2085/0283, H01H9/106, Y10T307/826",An apparatus for protecting an electric line i...,8450881,US,5/28/2013,The present invention relates to an apparatus ...,Apparatus and method for protecting an electri...,B2,10,057af1d3-ebc6-11ea-a344-121df0c29c1e,"[apparatus, protect, electric, line, vehicle, ...",37
