In [1]:
import numpy as np
import pandas as pd
import torch
from torch import nn, optim
import pickle
from torch.utils.data import Dataset, DataLoader
from torch.nn import functional as F
from tqdm.notebook import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Load the pickled company-relationship data and scale every column by its maximum.
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only run this on a file that comes from a trusted source.
with open('top4681_companies_relationship_file.pkl', 'rb') as f:
    # The context manager closes the handle (the original left it open).
    adj_matrix = pickle.load(f)
adj_matrix = pd.DataFrame.from_dict(adj_matrix)
# axis=0 hands each column to the lambda, so each column is divided by its own max.
adj_matrix = adj_matrix.apply(lambda col: col / col.max(), axis=0)
adj_matrix.values

array([[0.53327085, 0.        , 0.        , ..., 0.        , 0.        ,
        0.005     ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00109341, 0.        , 0.23592773, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.18235294, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.75675676,
        0.        ],
       [0.00124961, 0.        , 0.        , ..., 0.        , 0.        ,
        0.63      ]])

In [3]:
def one_hot(labels, class_size):
    """Return a length-``class_size`` vector with ones at the ``labels`` positions.

    Args:
        labels: index (or indices) used to select the entries set to 1.
        class_size: length of the returned vector.

    Returns:
        A 1-D tensor allocated on the module-level ``device``; every entry is 0
        except those selected by ``labels``.
    """
    encoded = torch.zeros(class_size, device=device)
    encoded[labels] = 1
    return encoded

def loss_function(recon_x, x, mu, logvar):
    """Reconstruction error plus KL divergence for the (C)VAE.

    Args:
        recon_x: decoder output, compared against ``x`` reshaped to ``(-1, 1)``.
        x: original inputs; flattened into a single column before comparison.
        mu: latent means produced by the encoder.
        logvar: latent log-variances produced by the encoder.

    Returns:
        Scalar tensor: ``F.mse_loss`` (called with its default reduction) between
        the reconstruction and the target, plus the KL term summed over every
        element of ``mu`` / ``logvar``.
    """
    target = x.view(-1, 1)
    reconstruction_term = F.mse_loss(recon_x, target)
    # KL divergence to a standard normal, Appendix B of
    # Kingma and Welling, "Auto-Encoding Variational Bayes", ICLR 2014
    # (https://arxiv.org/abs/1312.6114):
    #   -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    kl_inner = 1 + logvar - mu.pow(2) - logvar.exp()
    kl_term = -0.5 * torch.sum(kl_inner)
    return reconstruction_term + kl_term

In [1]:
# Flatten the adjacency matrix into training pairs:
#   datas  -> one float32 scalar tensor per (row, column) entry, in row-major order
#   labels -> the 64-d embedding vector of the row, repeated once per column
# Read the NumPy array once; the original called `adj_matrix.values` for every element.
adj_values = adj_matrix.values
n_rows, n_cols = adj_values.shape

# One embedding row per matrix row (replaces the hard-coded 4681).
embedding = nn.Embedding(n_rows, 64).to(device)
datas = []
labels = []
for i in tqdm(range(n_rows)):
    # The vector is detached and copied to NumPy, so no gradient reaches `embedding`.
    label = embedding(torch.tensor(i, device=device)).detach().cpu().numpy()
    # Convert the whole row at once, then split it into 0-d tensors (one per column).
    row = torch.tensor(adj_values[i]).to(torch.float32)
    datas.extend(row.unbind(0))
    # Same array object for every column of this row, as in the original loop.
    labels.extend([label] * n_cols)


NameError: name 'nn' is not defined

In [12]:
class patentDataset(Dataset):
    """Index-aligned pairing of two sequences for use with a ``DataLoader``.

    Attributes are stored as given: ``datas`` holds the samples and ``labels``
    holds the entry returned alongside the sample at the same position.
    """

    def __init__(self, datas, labels):
        """Keep references to the sample and label sequences."""
        self.datas, self.labels = datas, labels

    def __len__(self):
        """Number of items, taken from ``datas``."""
        n_items = len(self.datas)
        return n_items

    def __getitem__(self, index):
        """Return the ``(sample, label)`` pair stored at ``index``."""
        sample = self.datas[index]
        target = self.labels[index]
        return sample, target

In [13]:
class CVAE(nn.Module):
    """Conditional variational autoencoder with one hidden layer on each side.

    The encoder maps ``[x, c]`` to the mean and log-variance of the latent
    distribution; the decoder maps ``[z, c]`` back to ``feature_size`` outputs.

    Args:
        feature_size: width of the input ``x`` (and of the reconstruction).
        latent_size: width of the latent vector ``z``.
        class_size: width of the conditioning vector ``c``.
    """

    def __init__(self, feature_size, latent_size, class_size):
        super(CVAE, self).__init__()
        self.feature_size = feature_size
        self.class_size = class_size

        # encode
        self.fc1  = nn.Linear(feature_size + class_size, 400)
        self.fc21 = nn.Linear(400, latent_size)  # latent mean head
        self.fc22 = nn.Linear(400, latent_size)  # latent log-variance head

        # decode
        self.fc3 = nn.Linear(latent_size + class_size, 400)
        self.fc4 = nn.Linear(400, feature_size)

        self.relu = nn.ReLU()
        # Not used by any method of this class; kept so the attribute stays available.
        self.sigmoid = nn.Sigmoid()

    def encode(self, x, c): # Q(z|x, c)
        '''
        Encode an input together with its condition.

        x: (bs, feature_size)
        c: (bs, class_size)

        Returns the pair (z_mu, z_logvar), each (bs, latent_size). The second
        value is consumed as a log-variance by ``reparametrize``.
        '''
        inputs = torch.cat([x, c], 1) # (bs, feature_size+class_size)
        h1 = self.relu(self.fc1(inputs))
        z_mu = self.fc21(h1)
        z_logvar = self.fc22(h1)
        return z_mu, z_logvar

    def reparametrize(self, mu, logvar):
        '''
        Sample z = mu + eps * std with eps ~ N(0, I) while training;
        return mu unchanged in eval mode.
        '''
        if not self.training:
            return mu
        std = torch.exp(0.5 * logvar)
        # randn_like replaces the legacy `std.data.new(std.size()).normal_()` idiom.
        eps = torch.randn_like(std)
        return eps.mul(std) + mu

    def decode(self, z, c): # P(x|z, c)
        '''
        Decode a latent vector together with its condition.

        z: (bs, latent_size)
        c: (bs, class_size)

        Returns the raw output of the last linear layer, (bs, feature_size);
        no output activation is applied.
        '''
        inputs = torch.cat([z, c], 1) # (bs, latent_size+class_size)
        h3 = self.relu(self.fc3(inputs))
        return self.fc4(h3)

    def forward(self, x, c):
        '''
        Encode, sample and decode.

        x is reshaped to (-1, feature_size); the original hard-coded a width
        of 1, which only worked for feature_size == 1.

        Returns (reconstruction, mu, logvar).
        '''
        mu, logvar = self.encode(x.view(-1, self.feature_size), c)
        z = self.reparametrize(mu, logvar)
        return self.decode(z, c), mu, logvar

In [31]:
def train(epoch):
    """Run one optimisation pass over ``train_loader``.

    Uses the module-level ``model``, ``optimizer``, ``train_loader``,
    ``loss_function`` and ``device``.

    Args:
        epoch: epoch number, used only in the progress message.

    Returns:
        The mean of the per-batch losses for this pass, or ``nan`` when the
        loader yields no batches. (The original accumulated the running loss
        but never exposed it.)
    """
    model.train()
    train_loss = 0.0
    n_batches = 0
    for batch_idx, (datas, labels) in enumerate(tqdm(train_loader)):
        optimizer.zero_grad()

        labels = labels.to(device)
        datas = datas.to(torch.float32).to(device)

        recon_batch, mu, logvar = model(datas, labels)

        loss = loss_function(recon_batch, datas, mu, logvar)
        loss.backward()
        optimizer.step()

        # item() reads the Python float directly; `.data` is not needed.
        batch_loss = loss.item()
        train_loss += batch_loss
        n_batches += 1
        if batch_idx % 1000 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(datas), len(train_loader.dataset),
                100. * batch_idx / len(train_loader),
                batch_loss))

    return train_loss / n_batches if n_batches else float('nan')


In [33]:
# Wrap the prepared lists in a dataset and batch them 128 at a time.
# No `shuffle` argument is passed, so the DataLoader default applies.
train_dataset = patentDataset(datas=datas, labels=labels)
train_loader = DataLoader(dataset=train_dataset, batch_size=128)

# CVAE over a single scalar feature, 20 latent dimensions, 64-d condition vector.
model = CVAE(feature_size=1, latent_size=20, class_size=64)
model = model.to(device)
optimizer = optim.Adam(params=model.parameters(), lr=1e-6)

# Three passes over the full dataset.
for epoch in range(3):
    train(epoch)
    

HBox(children=(FloatProgress(value=0.0, max=171186.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=171186.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=171186.0), HTML(value='')))




In [34]:
# One scalar feature per company: decode N_SAMPLES latent vectors drawn from a
# standard normal, conditioned on the company's label vector, and average the
# decoder outputs.
N_COMPANIES = 4681
N_SAMPLES = 10000
LATENT_SIZE = 20  # must match the latent_size the model was built with

CVAE_feature = []
# Inference only: skip autograd bookkeeping for the decoder calls.
with torch.no_grad():
    for i in tqdm(range(N_COMPANIES)):
        samples = torch.randn(N_SAMPLES, LATENT_SIZE).to(device)

        # `labels` holds N_COMPANIES consecutive entries per company, so
        # index i * N_COMPANIES is the first entry belonging to company i.
        company_label = torch.as_tensor(labels[i * N_COMPANIES]).to(device)
        # Broadcast the single label vector across all samples as a view,
        # instead of building a 10000-element Python list and converting it.
        samples_label = company_label.unsqueeze(0).expand(N_SAMPLES, -1)

        gen_ans = model.decode(samples, samples_label)
        # Tensor mean replaces the builtin sum over 10000 row tensors.
        feature = gen_ans.mean().item()
        CVAE_feature.append(feature)
  

HBox(children=(FloatProgress(value=0.0, max=4681.0), HTML(value='')))




In [44]:
# Persist the per-company features as a one-column CSV (row index included).
pd.DataFrame(
    data=CVAE_feature,
).to_csv(
    'new_CVAE_embedding.csv',
)

## Write the CVAE feature into the patent and test datasets

In [48]:
# Read the patent dataset and replace every missing value with 0 in one chain.
patent_data = pd.read_csv('dataset_year_2013.csv').fillna(0)
patent_data

Unnamed: 0,PatentNumber,Title,Abstract,IssuedYear,AppliedYear,AssigneeCNT,AssigneeCountryCNT,InventorCNT,InventorCountryCNT,FirstAssignee,...,AVG_Back_distance,AVG_For_distance_5Y,Industry_Originality,Industry_Generality_5Y,Ass_Originality,Ass_Generality_5Y,Mean_longevity,Max_longevity,InvInv_Cultural_Distance,AssAss_Cultural_Distance
0,6020086,Accumulator device for an electric and/or elec...,An accumulator device for an electric or elec...,2000,1997,1,1,1.0,1,U.S. Philips Corporation,...,8244.8516,7319.2052,0.21875,0.000000,0.843750,0.625000,12.0387,6881.0,0.0,0.0
1,6020136,Identification of functional transcription fac...,The present invention relates to the identifi...,2000,1998,1,1,1.0,1,The Trustees of the University of Pennsylvania,...,0.0000,0.0000,0.00000,0.000000,0.000000,0.000000,7.1918,2625.0,0.0,0.0
2,6020250,Stacked devices,Chips having subsurface structures within or ...,2000,1998,1,1,1.0,1,International Business Machines Corporation,...,285.1092,5454.8186,0.00000,0.000000,0.000000,0.740741,10.6997,6251.0,0.0,0.0
3,6012550,Bypass device for automatic transmission fluid...,A bypass device for ATF (automatic transmissi...,2000,1997,1,1,1.0,1,Hyundai Motor Company,...,9199.9030,10547.4188,0.53125,0.444444,0.800000,0.000000,11.4721,6839.0,0.0,0.0
4,6010537,Zoom lens system having an image blur compensa...,"A zoom lens system has, from the object side,...",2000,1998,1,1,2.0,1,"Minolta Co., Ltd.",...,668.5856,685.9623,0.00000,0.000000,0.500000,0.244898,10.0201,6979.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89148,6166862,Designing method of zoom optical system,A designing method of a zoom optical system w...,2000,1998,1,1,2.0,1,"Minolta Co., Ltd.",...,286.6882,373.7317,0.00000,0.000000,0.716049,0.000000,10.0973,5768.0,0.0,0.0
89149,6164175,Method and arrangement for operating rotating ...,A method and an arrangement for operating a r...,2000,1997,1,1,3.0,1,SMS Schloemann-Siemag Aktiengesellschaft,...,8208.9129,0.0000,0.00000,0.000000,0.500000,0.000000,10.9315,4382.0,0.0,0.0
89150,6162420,Use of glyceryl triacetate for treating onycho...,A nail varnish comprising glyceryl triacetate...,2000,1999,1,1,3.0,1,Hoechst Aktiengesellschaft,...,1156.4565,1169.9326,0.00000,0.444444,0.775510,0.444444,8.6589,5817.0,0.0,0.0
89151,6166941,Relaxed layout for storage nodes for dynamic r...,A memory cell structure (10) includes a plura...,2000,1999,1,1,2.0,1,Texas Instruments Incorporated,...,10220.5573,2069.7645,0.37500,0.000000,0.750000,0.000000,7.4707,5061.0,0.0,0.0


In [52]:
# Distinct values of the 'Han_name' column, in order of first appearance.
patent_data.loc[:, 'Han_name'].unique()

array([' U.S. Philips Corporation',
       ' The Trustees of the University of Pennsylvania',
       ' International Business Machines Corporation', ...,
       ' Ferring B.V.', ' Nucor Corporation', ' Paul Wurth S.A.'],
      dtype=object)

In [2]:
# Map every patent's 'Han_name' to the CVAE feature of the company whose
# adjacency-matrix column carries the same name.
# A single dict lookup per patent replaces the original scan over all 4681
# column names for every row.
feature_by_company = dict(zip(adj_matrix.columns[:4681], CVAE_feature))

# Names without a matching column get NaN, so the list always has exactly one
# entry per patent row. The original skipped such rows, which left the list
# shorter than (and misaligned with) `patent_data`.
CVAE_embeddings = [
    feature_by_company.get(name, float('nan'))
    for name in patent_data['Han_name']
]

# Preview only; the full list has one entry per patent.
CVAE_embeddings[:10]

NameError: name 'patent_data' is not defined

In [59]:
# Attach the per-patent CVAE feature as a new column, then save the enriched
# table without the row index.
patent_data['embedd_CVAE'] = CVAE_embeddings
patent_data.to_csv(
    path_or_buf='dataset_year_2013_with_CVAE_embedding.csv',
    index=False,
)
patent_data

Unnamed: 0,PatentNumber,Title,Abstract,IssuedYear,AppliedYear,AssigneeCNT,AssigneeCountryCNT,InventorCNT,InventorCountryCNT,FirstAssignee,...,AVG_For_distance_5Y,Industry_Originality,Industry_Generality_5Y,Ass_Originality,Ass_Generality_5Y,Mean_longevity,Max_longevity,InvInv_Cultural_Distance,AssAss_Cultural_Distance,embedd_CVAE
0,6020086,Accumulator device for an electric and/or elec...,An accumulator device for an electric or elec...,2000,1997,1,1,1.0,1,U.S. Philips Corporation,...,7319.2052,0.21875,0.000000,0.843750,0.625000,12.0387,6881.0,0.0,0.0,0.054269
1,6020136,Identification of functional transcription fac...,The present invention relates to the identifi...,2000,1998,1,1,1.0,1,The Trustees of the University of Pennsylvania,...,0.0000,0.00000,0.000000,0.000000,0.000000,7.1918,2625.0,0.0,0.0,0.045799
2,6020250,Stacked devices,Chips having subsurface structures within or ...,2000,1998,1,1,1.0,1,International Business Machines Corporation,...,5454.8186,0.00000,0.000000,0.000000,0.740741,10.6997,6251.0,0.0,0.0,0.082125
3,6012550,Bypass device for automatic transmission fluid...,A bypass device for ATF (automatic transmissi...,2000,1997,1,1,1.0,1,Hyundai Motor Company,...,10547.4188,0.53125,0.444444,0.800000,0.000000,11.4721,6839.0,0.0,0.0,0.032267
4,6010537,Zoom lens system having an image blur compensa...,"A zoom lens system has, from the object side,...",2000,1998,1,1,2.0,1,"Minolta Co., Ltd.",...,685.9623,0.00000,0.000000,0.500000,0.244898,10.0201,6979.0,0.0,0.0,0.034544
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89148,6166862,Designing method of zoom optical system,A designing method of a zoom optical system w...,2000,1998,1,1,2.0,1,"Minolta Co., Ltd.",...,373.7317,0.00000,0.000000,0.716049,0.000000,10.0973,5768.0,0.0,0.0,0.034544
89149,6164175,Method and arrangement for operating rotating ...,A method and an arrangement for operating a r...,2000,1997,1,1,3.0,1,SMS Schloemann-Siemag Aktiengesellschaft,...,0.0000,0.00000,0.000000,0.500000,0.000000,10.9315,4382.0,0.0,0.0,-0.003184
89150,6162420,Use of glyceryl triacetate for treating onycho...,A nail varnish comprising glyceryl triacetate...,2000,1999,1,1,3.0,1,Hoechst Aktiengesellschaft,...,1169.9326,0.00000,0.444444,0.775510,0.444444,8.6589,5817.0,0.0,0.0,0.017485
89151,6166941,Relaxed layout for storage nodes for dynamic r...,A memory cell structure (10) includes a plura...,2000,1999,1,1,2.0,1,Texas Instruments Incorporated,...,2069.7645,0.37500,0.000000,0.750000,0.000000,7.4707,5061.0,0.0,0.0,0.027007


In [22]:
# Load the test split and look up the CVAE feature for each row's company.
raw_train_data = pd.read_csv('test_data_with_company_ID.csv')
company_ID = raw_train_data['company_ID']

# NOTE(review): the `- 1` assumes company_ID values are 1-based - TODO confirm.
# An ID of 0 would index position -1, i.e. the last entry of CVAE_feature.
raw_train_data['embedd_CVAE'] = list(
    map(lambda cid: CVAE_feature[cid - 1], company_ID)
)

In [23]:
# The ID column was only needed for the lookup; drop it before saving.
raw_train_data = raw_train_data.drop(columns=['company_ID'])
raw_train_data.to_csv(
    path_or_buf='test_data_with_CVAE_embedding.csv',
    index=False,
)