# Code to reproduce the findings in the paper:
S. Sarica & J. Luo. Knowledge Burden and the Future of Innovation with Artificial Intelligence

Note: The full notebook takes a very long time to run (possibly about a week).

In [None]:
import numpy as np
import pickle
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import gensim
import base64
import itertools
import random
import shutil
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec
import os

### Load the word2vec model that is trained on patent database
Refer to S. Sarica, J. Luo, K. L. Wood, TechNet: Technology semantic network based on patent data. Expert Syst. Appl. 142 (2020), doi:10.1016/j.eswa.2019.112995.

In [None]:
# model_path = "../../Full_patent_w2v_model_th02_min2_s600_w10.txt"
# You can download this model (~ 28GB) from the following dropbox folder:
# https://www.dropbox.com/sh/yu2z4sf3bcmkqrb/AABun5qFT8XZncHipEB1TDj0a?dl=0
# Download only the word_embeddings_*.txt files into a sub-folder, then run
# this cell: it concatenates the parts (in index order) into a single file
# to be imported and afterwards deletes the downloaded parts.

download_folder = './600/'
num_files = 404

with open("./data/word_embeddings.txt", 'w', encoding='utf-8') as merged:
    for i in range(num_files):
        part_path = download_folder + 'word_embeddings_' + str(i) + '.txt'
        with open(part_path, 'r', encoding='utf-8') as part:
            # Stream the part into the merged file in chunks instead of
            # holding every line of it in memory at once.
            shutil.copyfileobj(part, merged)
        print(f'{i+1}/{num_files} done!')
# delete the download folder (only reached if every part was merged)
shutil.rmtree(download_folder)

In [None]:
# Convert the merged embeddings file to word2vec text format, then load it.
working_dir = os.getcwd()
model_path = datapath(working_dir + "/data/word_embeddings.txt")
tmp_file = get_tmpfile(
    working_dir + '/Full_patent_w2v_model_th02_min2_s600_w10'
)
glove2word2vec(model_path, tmp_file)
# The merged source file is no longer needed once the converted copy exists.
os.remove("./data/word_embeddings.txt")
model = gensim.models.KeyedVectors.load_word2vec_format(
    tmp_file,
    binary=False,
)

In [None]:
# Load the {term: index} dictionary and build the reverse {index: term} map.
with open('./data/vocab_index_tn01.pkl', 'rb') as f:
    vocab_index = pickle.load(f)
index_vocab = {idx: term for term, idx in vocab_index.items()}

In [None]:
# Load a dictionary that maps each vocabulary index to a list holding the
# number of occurrences of the corresponding term in each year,
# from 1976 to 2017.
with open(
    './data/vocab_years_dict_TNv0.1_encoded.pkl', 'rb'
) as f:
    vocab_years_dict = pickle.load(f)

### Forming arrays for collecting count statistics for concepts / year later

In [None]:
num_years = len(range(1976, 2018))
# number of distinct terms appearing in each year (1976..2017)
year_terms = np.zeros(num_years)
# number of terms appearing for the first time in each year (1977..2017)
year_new_terms = np.zeros(num_years - 1)
# indices of the terms first appearing in each year (1976..2017)
year_new_terms_ = [[] for _ in range(num_years)]

for key, counts in vocab_years_dict.items():
    temp = np.array(counts)
    # 1 where the term occurs at least once in that year, 0 otherwise
    key_years = (temp > 0).astype(int)
    year_terms = year_terms + key_years

    if temp[0] != 0:
        # already present in the first year of the data
        year_new_terms_[0].append(key)
        continue

    later_years = np.flatnonzero(key_years[1:] == 1)
    if later_years.size > 0:
        first_seen = later_years[0]  # offset relative to 1977
        year_new_terms[first_seen] += 1
        year_new_terms_[first_seen + 1].append(key)

# cumulative vocabulary size: terms of the first year plus each year's new terms
year_cumsum = np.cumsum([year_terms[0]] + year_new_terms.tolist())

The year_cumsum and year_new_terms arrays hold the information needed to
generate Fig. 1A in the manuscript.

### Semantic similarity measurements
We use four different sample sizes: 5000, 2000, 1000 and 500.<br>
For each year we create 100 different samples.<br>
We then calculate the mean and standard deviation over these samples.<br>

In [None]:
def simInter(matrix, M):
    """Return the mean positive similarity between the first N-M and last M items.

    ``matrix`` is a square (N x N) similarity matrix. The block made of the
    first ``N - M`` rows and the last ``M`` columns is extracted, and the mean
    of its strictly positive entries is returned.
    """
    size = len(matrix)
    split = size - M
    cross_block = np.asarray(matrix)[:split, split:]
    positive = cross_block[cross_block > 0]
    return np.mean(positive)

In [None]:
# calculate semantic similarity for different sample sizes

Ns = [5000, 2000, 1000, 500] #sample sizes (largest first; smaller ones are sub-sampled from it)
iter_ = 100 #number of samples per year

# share of new terms in the cumulative vocabulary, starting with 1981
prop_new_cum = [year_new_terms[i]/year_cumsum[i+1] for i in range(4, 2017-1976)]
# all terms first seen in 1976-1980
prior_terms = list(itertools.chain.from_iterable(year_new_terms_[:5]))

# indexed as [sample size][year offset from 1981][sample number]
mean_sim = [[[0 for x in range(iter_)] for y in range(1981,2017)] for z in Ns]
mean_sim_inter = [[[0 for x in range(iter_)] for y in range(1981,2017)] for z in Ns]


for i in tqdm(range(2017-1981)):
    #calculate number of new terms for this year
    Ms = [int(np.ceil(prop_new_cum[i]*x)) for x in Ns]

    for j in range(iter_):
        #select an initial random terms by using the largest sample size
        priors = random.sample(prior_terms, Ns[0]-Ms[0])
        news = random.sample(year_new_terms_[5+i], Ms[0])
        total = priors+news  # priors first, new terms last (simInter relies on this)
        n_priors = len(priors)

        # Pairwise cosine similarity of all sampled terms in one matrix
        # product instead of ~Ns[0]**2/2 separate model.similarity() calls.
        # The values agree with model.similarity() up to float rounding.
        vectors = np.array([model[index_vocab[term]] for term in total])
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        norms[norms == 0] = 1  # leave all-zero vectors untouched
        vectors = vectors / norms
        matrix = np.dot(vectors, vectors.T).astype(float)
        np.fill_diagonal(matrix, 0)  # self-similarity is excluded, as before

        mean_sim[0][i][j] = np.mean(matrix[matrix>0])
        mean_sim_inter[0][i][j] = simInter(matrix, Ms[0])

        # NOTE(review): the original cell also computed information-content
        # statistics here through individual_IC, r, mean_ic and mean_ic_news.
        # None of these names is defined in the visible notebook, so the cell
        # stopped with a NameError at this point; that leftover code was
        # removed. Restore it only together with its definitions.

        for w in range(1,len(Ns)):
            # Sub-sample positions (not terms) of the big sample. New terms
            # are stored after the priors in `total`/`matrix`, so their
            # positions must be shifted by the number of priors.
            rand_priors_inds = random.sample(range(n_priors), Ns[w]-Ms[w])
            rand_news_inds = [n_priors + pos
                              for pos in random.sample(range(len(news)), Ms[w])]
            total_ind = rand_priors_inds + rand_news_inds
            _matrix = matrix[np.ix_(total_ind, total_ind)]
            mean_sim[w][i][j] = np.mean(_matrix[_matrix>0])
            mean_sim_inter[w][i][j] = simInter(_matrix, Ms[w])

    # this year's new terms become prior terms for the following years
    prior_terms += year_new_terms_[5+i]
        

The mean_sim and mean_sim_inter arrays hold the information needed to generate
Fig. 1B in the manuscript and Fig. S3 in the supplementary material.

### Mean additional Information Content measurement
We use four different sample sizes: 5000, 2000, 1000 and 500.<br>
For each year we create 100 different samples.<br>
We then calculate the mean and standard deviation over these samples.<br>

In [None]:
V = len(model.wv.vocab)  # vocabulary size of the embedding model
cum_terms = {}  # indices of all terms introduced before the current year
most_prox_years = [[] for x in range(len(Ns))]  # [sample size][year] -> similarities
K = 1000000  # number of nearest neighbours inspected; reduced as cum_terms grows


def _closest_known_similarity(word, topn):
    """Return the similarity of the nearest neighbour of `word` in cum_terms.

    Scans the `topn` most similar words in decreasing similarity order and
    returns the similarity of the first one whose index is in `cum_terms`.
    Returns None when none of them is; neighbours missing from `vocab_index`
    are skipped.
    """
    for neighbour, similarity in model.wv.most_similar(word, topn=topn):
        if vocab_index.get(neighbour) in cum_terms:
            return similarity
    return None


for i in tqdm(range(2018-1976)):
    if 5 <= i < 41:
        # similarities collected for this year's sample of new terms
        most_prox = []
        temp = random.sample(year_new_terms_[i], Ns[0])
        for term in tqdm(temp):
            word = index_vocab[term]
            similarity = _closest_known_similarity(word, K)
            if similarity is None:
                # nothing known among the top K neighbours: widen the search
                similarity = _closest_known_similarity(word, min(K*10, V))
            if similarity is not None:
                most_prox.append(similarity)
        most_prox_years[0].append(most_prox)
        for k in range(1,len(Ns)):
            # guard against fewer collected values than the requested size
            most_prox_years[k].append(
                random.sample(most_prox, min(Ns[k], len(most_prox))))

    if i%10 == 0 and i>0:
        K //= 10
    # terms first seen this year count as known terms from the next year on
    cum_terms.update({x:1 for x in year_new_terms_[i]})
    

The log2 of the values in the most_prox_years array is used to produce
Fig. 1C in the manuscript and Fig. S4 in the supplementary materials.