In [1]:
from dataloader import get_dataset, TrainDataset, EvalDataset ,TestDataset 
from evaluation import Eval_MR
from torch.utils.data import DataLoader
from model import TransE
import torch
import numpy as np 
import pickle
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd 
import math 

# When True, the model and every training batch are moved to CUDA.
GPU = True
# Number of passes over the training loader for each negative-sampling seed.
EPOCHS_PER_SEED = 5
# Initial learning rate handed to the Adam optimizer.
LR = 0.01
# The learning rate is decayed once every this many global epochs.
LR_DECAY_EPOCH = 5


def adjust_learning_rate(optim, decay):
    """Scale the learning rate of every parameter group of ``optim`` in place.

    Args:
        optim: Object exposing ``param_groups``, an iterable of dicts that each
            hold an ``'lr'`` entry (e.g. a ``torch.optim`` optimizer).
        decay: Multiplicative factor applied to each group's ``'lr'``.
    """
    groups = optim.param_groups
    for group in groups:
        group['lr'] *= decay


class Train:
    """Training driver: loads a dataset, builds a TransE model and fits it.

    Intended call order is ``Train(name)``, ``prepareData()``,
    ``prepareModel()`` and then ``fit()``.
    """

    def __init__(self, data_name):
        """Load the dataset called ``data_name`` and cache its entity/relation counts."""
        self.dataset = get_dataset(data_name)
        self.n_entities = self.dataset.n_entities
        self.n_relations = self.dataset.n_relations

    def prepareData(self):
        """Build the train/valid/test datasets and the valid/test loaders.

        The validation and test loaders deliver their whole split as a single,
        unshuffled batch. ``trainloader`` stays ``None`` here because ``fit``
        rebuilds it each time the negative samples are regenerated.
        """
        print("Perpare dataloader")
        self.train = TrainDataset(self.dataset)
        self.trainloader = None
        self.valid = EvalDataset(self.dataset)
        self.validloader = DataLoader(self.valid, batch_size=self.valid.n_triples, shuffle=False)
        self.test = TestDataset(self.dataset)
        self.testloader = DataLoader(self.test, batch_size=self.test.n_triples, shuffle=False)

    def prepareModel(self):
        """Create a fresh 100-dimensional TransE model (on CUDA when ``GPU`` is set)."""
        print("Perpare model")
        self.model = TransE(self.n_entities, self.n_relations, embDim=100)
        if GPU:
            self.model.cuda()

    def saveModel(self, path='emb_weight_20220125.pkl'):
        """Pickle ``self.model.get_emb_weights()`` to ``path``.

        Args:
            path: Destination file; defaults to the file name the rest of the
                notebook loads from.
        """
        # A context manager guarantees the handle is flushed and closed even
        # when pickling fails part-way through.
        with open(path, 'wb') as f:
            pickle.dump(self.model.get_emb_weights(), f)

    def fit(self, n_seeds=100):
        """Train the model, saving the weights whenever validation MR improves.

        For each seed the negative samples are regenerated and the training
        loader rebuilt, then ``EPOCHS_PER_SEED`` epochs are run. After every
        epoch ``Eval_MR`` is evaluated on the validation loader; a lower value
        than the best so far triggers ``saveModel``. The learning rate is
        multiplied by 0.96 every ``LR_DECAY_EPOCH`` global epochs.

        Args:
            n_seeds: Number of negative-sampling seeds to iterate over
                (seeds ``0 .. n_seeds - 1``).
        """
        optim = torch.optim.Adam(self.model.parameters(), lr=LR)
        minLoss = float("inf")
        bestMR = float("inf")
        GlobalEpoch = 0
        # Defined up front so the progress print cannot hit an unbound name if
        # the training loader happens to yield no batches.
        lossVal = float("nan")
        for seed in range(n_seeds):
            print(f"# Using seed: {seed}")
            self.train.regenerate_neg_samples(seed=seed)
            self.trainloader = DataLoader(self.train, batch_size=1024, shuffle=True, num_workers=4)
            for epoch in range(EPOCHS_PER_SEED):
                GlobalEpoch += 1
                for sample in self.trainloader:
                    pos_triples = torch.LongTensor(sample['pos_triples'])
                    neg_triples = torch.LongTensor(sample['neg_triples'])
                    if GPU:
                        pos_triples = pos_triples.cuda()
                        neg_triples = neg_triples.cuda()

                    # NOTE(review): presumably re-normalizes the embeddings
                    # before each step -- confirm against model.TransE.
                    self.model.normal_emb()

                    loss = self.model(pos_triples, neg_triples)
                    # .item() already copies the scalar to the host, so no
                    # explicit .cpu() is needed on either device.
                    lossVal = loss.item()

                    optim.zero_grad()
                    loss.backward()
                    optim.step()

                    minLoss = min(minLoss, lossVal)
                MR = Eval_MR(self.validloader, "L2", **self.model.get_emb_weights())
                if MR < bestMR:
                    bestMR = MR
                    print('save model ')
                    self.saveModel()
                print(f"Epoch: {epoch + 1}, Total_Train: {GlobalEpoch}, Loss: {lossVal}, minLoss: {minLoss},"
                      f"MR: {MR}, bestMR: {bestMR}")
                if GlobalEpoch % LR_DECAY_EPOCH == 0:
                    adjust_learning_rate(optim, 0.96)


# if __name__ == '__main__':
#     train = Train('FB15k-237')
#     train.prepareData()
#     train.prepareModel()
#     train.fit()


In [2]:
train = Train('FB15k-237')

Reading train triples...
Finished. Read 272115 train triples.
Reading valid triples...
Finished. Read 17535 valid triples.
Reading test triples...
Finished. Read 20466 test triples.


In [3]:
train.prepareData()

Perpare dataloader
|Train|: 544230
|Valid|: 17535
|Test|: 20466


In [4]:
train.prepareModel() # Each init gives different model weights, so predicting straight after it yields different results every time

Perpare model


In [5]:
train.fit()

# Using seed: 0
save model 
Epoch: 1, Total_Train: 1, Loss: 0.47142496705055237, minLoss: 0.456105500459671,MR: 901.8328485885372, bestMR: 901.8328485885372
save model 
Epoch: 2, Total_Train: 2, Loss: 0.2819611132144928, minLoss: 0.2741110920906067,MR: 642.4573139435415, bestMR: 642.4573139435415
save model 
Epoch: 3, Total_Train: 3, Loss: 0.24167214334011078, minLoss: 0.18671663105487823,MR: 537.0740233818078, bestMR: 537.0740233818078
save model 
Epoch: 4, Total_Train: 4, Loss: 0.20377230644226074, minLoss: 0.1531406044960022,MR: 481.182378100941, bestMR: 481.182378100941
save model 
Epoch: 5, Total_Train: 5, Loss: 0.19179368019104004, minLoss: 0.14133676886558533,MR: 458.9081836327345, bestMR: 458.9081836327345
# Using seed: 1
save model 
Epoch: 1, Total_Train: 6, Loss: 0.18574243783950806, minLoss: 0.14133676886558533,MR: 418.5479897348161, bestMR: 418.5479897348161
save model 
Epoch: 2, Total_Train: 7, Loss: 0.1855708807706833, minLoss: 0.1350994110107422,MR: 414.57787282577704, b

KeyboardInterrupt: 

# predict 從這裡開始

In [6]:
from torch.utils.data import dataloader
def Eval_MR2(evalloader: DataLoader, **kwargs):
    """Gather head, relation and tail embeddings for every triple in a loader.

    Despite the name, no rank is computed here; the function only performs
    the embedding lookups and hands them back for downstream evaluation.

    Args:
        evalloader: Iterable of integer tensors of shape ``(batch, 3)`` whose
            columns are ``(head, relation, tail)`` ids.
        **kwargs: Must contain ``e_emb`` (entity embedding table) and
            ``r_emb`` (relation embedding table), both indexable along axis 0.

    Returns:
        ``(h, r, t, [h0, r0, t0])``: the three embedding arrays followed by
        the id columns they were looked up from. Results from all batches are
        concatenated in loader order.

    Raises:
        ValueError: If ``evalloader`` yields no batches.
    """
    head_vecs, rel_vecs, tail_vecs = [], [], []
    head_ids, rel_ids, tail_ids = [], [], []
    for triples in evalloader:
        triples = triples.numpy()
        h0, r0, t0 = triples[:, 0], triples[:, 1], triples[:, 2]
        head_vecs.append(np.take(kwargs['e_emb'], indices=h0, axis=0))
        rel_vecs.append(np.take(kwargs['r_emb'], indices=r0, axis=0))
        tail_vecs.append(np.take(kwargs['e_emb'], indices=t0, axis=0))
        head_ids.append(h0)
        rel_ids.append(r0)
        tail_ids.append(t0)

    if not head_vecs:
        raise ValueError("evalloader yielded no batches")

    # Concatenate so that a loader with several batches returns all of them,
    # not just the last one.
    h = np.concatenate(head_vecs, axis=0)
    r = np.concatenate(rel_vecs, axis=0)
    t = np.concatenate(tail_vecs, axis=0)
    ids = [np.concatenate(head_ids), np.concatenate(rel_ids), np.concatenate(tail_ids)]
    return h, r, t, ids

In [7]:
# Load the pickled embedding weights from disk.
# Despite its name, `get_emb_weights` holds the loaded object (unpacked with
# ** further down), not a function.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('emb_weight_20220125.pkl', 'rb') as f:
    get_emb_weights = pickle.load(f)

In [8]:
#h1 , r1 ,t1  = Eval_MR2(train.testloader,  **train.model.get_emb_weights())

In [9]:
h , r ,t ,index_ = Eval_MR2(train.testloader,  **get_emb_weights )

In [10]:
h

array([[ 0.07764608,  0.01966429,  0.06293502, ...,  0.00037094,
        -0.06863769, -0.165984  ],
       [-0.08491039,  0.10034832,  0.09443039, ..., -0.12913279,
        -0.12995197,  0.10308704],
       [-0.12658772,  0.11487907,  0.08221328, ...,  0.05396236,
        -0.04265753,  0.11657944],
       ...,
       [ 0.01783859,  0.00921735, -0.12230428, ...,  0.19053403,
         0.11407951, -0.04722907],
       [ 0.01779311, -0.08040216,  0.03185505, ..., -0.07723701,
         0.17749844, -0.05336093],
       [-0.10716069,  0.08135753,  0.09835683, ...,  0.08480228,
        -0.08312382,  0.08235199]], dtype=float32)

In [12]:
r

array([[ 0.03153617, -0.14209162, -0.2708895 , ...,  0.26972923,
        -0.14447203,  0.02785578],
       [-0.03264477, -0.24892859, -0.26211855, ...,  0.21051855,
        -0.17860016,  0.09617757],
       [-0.27369118, -0.19407567,  0.04544272, ..., -0.04968257,
        -0.09847382,  0.07431498],
       ...,
       [-0.10441644,  0.24610111,  0.2785821 , ..., -0.20137322,
         0.12099136, -0.04000435],
       [-0.29845864, -0.25763604, -0.10449464, ...,  0.1126167 ,
         0.05744866, -0.22404554],
       [-0.24515048, -0.00726994, -0.21764009, ...,  0.20149611,
        -0.0949312 ,  0.28120804]], dtype=float32)

# None 值 predict 

In [12]:
# Load the pickled embedding weights from disk into `get_emb_weights`.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('emb_weight_20220125.pkl', 'rb') as f:
    get_emb_weights = pickle.load(f)
def get_e_emb(indices, **kwargs):
    """Return the entity embedding(s) stored at the given encoded id(s).

    Args:
        indices: A single id or an array-like of ids selecting rows.
        **kwargs: Must contain ``e_emb``, the entity embedding table.

    Returns:
        The rows of ``kwargs['e_emb']`` selected along axis 0.
    """
    entity_table = kwargs['e_emb']
    return np.take(entity_table, indices, axis=0)
def get_r_emb(indices, **kwargs):
    """Return the relation embedding(s) stored at the given encoded id(s).

    Args:
        indices: A single id or an array-like of ids selecting rows.
        **kwargs: Must contain ``r_emb``, the relation embedding table.

    Returns:
        The rows of ``kwargs['r_emb']`` selected along axis 0.
    """
    relation_table = kwargs['r_emb']
    return np.take(relation_table, indices, axis=0)

In [15]:
# The None-value entity has embedding weights too
# (index 14541 is presumably the id reserved for it -- TODO confirm).
get_e_emb(indices=14541 ,  **get_emb_weights )

array([ 0.11687996,  0.17215681, -0.06266562, -0.08467472,  0.09452362,
       -0.06867414, -0.11942425,  0.14816137, -0.09255349,  0.14969264,
        0.02969402,  0.08838042, -0.07064307,  0.1032944 , -0.08379799,
       -0.09309568, -0.08961347,  0.04722863,  0.10178427, -0.04194168,
       -0.08710565, -0.08487288,  0.07398315,  0.0674083 , -0.02656736,
       -0.11092649,  0.01499554, -0.00128763,  0.10285509, -0.14714028,
       -0.11825156,  0.05726233,  0.11264145, -0.02677056, -0.11430357,
        0.10821625,  0.08950803, -0.12698469, -0.07415128, -0.06277288,
       -0.05494906, -0.00508549, -0.10830335,  0.12483893,  0.1147564 ,
        0.11925494, -0.10688307, -0.1036757 , -0.09014288,  0.11465847,
       -0.13576616, -0.10710268,  0.13876052, -0.09632992,  0.01957146,
       -0.08852017, -0.10227995, -0.14775316,  0.1485116 ,  0.00161598,
       -0.0022591 , -0.08747811, -0.13465168, -0.05597105,  0.05682161,
        0.1384316 ,  0.13115364, -0.13857238,  0.14383332,  0.12

In [16]:
# The None-value relation has embedding weights too
# (index 237 is presumably the id reserved for it -- TODO confirm).
get_r_emb(indices=237 ,  **get_emb_weights )

array([-9.81507301e-02,  1.65457201e+00, -7.29150295e-01, -9.72482383e-01,
        2.50903666e-01,  5.15618622e-01,  1.44718575e+00, -2.80199870e-02,
       -5.34180105e-01,  1.22995472e+00,  7.82899797e-01, -1.07642543e+00,
       -5.79468489e-01,  9.34254825e-02, -3.48847717e-01, -5.45396924e-01,
        8.11276674e-01, -2.31555760e-01,  2.92052571e-02,  7.11219311e-02,
        8.83849204e-01, -1.52462626e+00, -4.19762358e-02,  5.02303600e-01,
        4.77711946e-01,  1.19224489e+00, -9.69938993e-01,  1.65196776e+00,
       -7.17873045e-04, -1.09269857e-01, -4.43713933e-01, -1.52361952e-02,
        7.61415601e-01,  2.26736903e+00,  1.36820400e+00,  4.95398253e-01,
        4.06732969e-02,  5.34399927e-01, -3.97180796e-01,  1.50779915e+00,
       -1.47803366e+00,  1.60024881e-01,  2.75863390e-02, -8.14473867e-01,
       -1.23271143e+00,  1.47423863e+00, -2.29464129e-01,  1.29794729e+00,
       -8.18897188e-01,  1.58943796e+00, -4.32124257e-01,  2.78536499e-01,
       -1.52678952e-01, -

# 成效計算

In [13]:
# Prediction vector for each test triple: head embedding plus relation
# embedding; it is compared against the tail embeddings `t` in the cells below.
pred = h+r

In [14]:
from tqdm import tqdm

In [15]:
pred.shape

(20466, 100)

In [16]:
tt = t.tolist()

In [17]:
# First put the t vectors into a DataFrame (one list per row).
df_t = pd.DataFrame()
df_t['t'] = tt

In [18]:
# Use a lambda to compute everything in one pass: for each row's t vector,
# the cosine similarity of every row of `pred` against it.
# NOTE: the stored values are similarities (larger = closer), despite the
# column being named 'cosine_distance'.
df_t['cosine_distance'] = df_t.t.apply(lambda x : cosine_similarity(  pred  , [x ]))

In [19]:
def sort_fun(h):
    """Convert a column of scores into 0-based ranks, highest score first.

    Args:
        h: Array-like of scores; any shape is flattened to one dimension
            (e.g. ``[[1], [2], [3]]`` becomes ``[1, 2, 3]``).

    Returns:
        A list with one rank per input score, in input order. Rank 0 is the
        largest score. Equal scores all receive the last (largest) rank among
        the positions they occupy in the descending order.
    """
    flat = np.array(h).reshape(-1,).tolist()
    descending = np.sort(flat)[::-1]
    rank_of = {}
    for position, score in enumerate(descending):
        # A repeated score overwrites its earlier entry, keeping the last position.
        rank_of[score] = position
    return [rank_of[score] for score in flat]

In [20]:
df_t['rank'] =  df_t['cosine_distance'].apply( lambda x : sort_fun(x))

In [21]:
# Evaluation
rank_mean = []  
for i in range(len(df_t)):
    x = df_t.iloc[i]['rank']
    rank_mean.append(x[i]) # row i's list ranks every prediction against t[i]; the matching prediction sits on the diagonal, hence x[i]

In [22]:
# Mean rank
np.mean( rank_mean )

453.14961399394116

In [23]:
# Total number of rows
len(df_t)

20466

In [24]:
# On average about 98% of the entities are ruled out
# (this is the mean rank divided by the number of rows in df_t).
np.mean( rank_mean ) / len(df_t)

0.022141581842760733

In [25]:
df_t = df_t.reset_index()

In [26]:
df_t.columns = ['ent_index', 't', 'cosine_distance', 'rank']

In [27]:
df_t.head(3)

Unnamed: 0,ent_index,t,cosine_distance,rank
0,0,"[-0.13216862082481384, -0.046637676656246185, ...","[[0.9303413390262458], [-0.0343790433947064], ...","[51, 12794, 16458, 12450, 12450, 9841, 11012, ..."
1,1,"[0.07238393276929855, -0.15301825106143951, -0...","[[0.0014534614623641431], [0.9651918395564838]...","[7982, 161, 12537, 3998, 3998, 10219, 9432, 15..."
2,2,"[-0.1519051343202591, -0.08208147436380386, 0....","[[-0.2995563647446651], [-0.12159386912620143]...","[19809, 14328, 756, 5722, 5722, 14556, 17147, ..."


# Predict：第 N 筆資料 h + r => return 前 N 筆數據

In [161]:
def moder_predict_by_row(df , row_num , top_n):
    """Return the index labels of the ``top_n`` rows that rank ``row_num`` best.

    Args:
        df: DataFrame with a ``'rank'`` column whose cells are indexable
            sequences of ranks.
        row_num: Position read from every row's rank sequence.
        top_n: Number of index labels to return.

    Returns:
        Index labels of ``df``, ordered by ascending rank at ``row_num`` and
        truncated to ``top_n`` entries.
    """
    rank_at_position = df['rank'].apply(lambda ranks: ranks[row_num])
    ordered_labels = rank_at_position.sort_values().index
    return ordered_labels[:top_n].tolist()

In [165]:
moder_predict_by_row(df =df_t  , row_num = 1657, top_n = 3)

[6592, 11535, 13060]

# 輸入pred 數值，回傳最接近的entity

In [150]:
def model_predict(pred_value ,entity_emb_w ,top_n, return_scores=False):
    """Find the entities whose embeddings are most cosine-similar to a vector.

    Args:
        pred_value: 1-D query vector (for example one row of ``h + r``).
        entity_emb_w: 2-D array-like with one embedding per row; the row
            position is the id that gets returned.
        top_n: Number of best matches to return.
        return_scores: When True, return the similarity values of the best
            matches instead of their row positions.

    Returns:
        A list of the ``top_n`` row positions ordered from most to least
        similar, or the matching similarity values when ``return_scores`` is
        set. An empty embedding table yields an empty list.
    """
    # Work in float64 so the similarities do not depend on the input dtype.
    embeddings = np.asarray(entity_emb_w, dtype=np.float64)
    if embeddings.size == 0:
        return []
    query = np.asarray(pred_value, dtype=np.float64).reshape(1, -1)

    # One vectorised call instead of one cosine_similarity call per entity.
    similarities = cosine_similarity(embeddings, query)[:, 0]

    # Negate to sort descending; a stable sort keeps ties in row order.
    best = np.argsort(-similarities, kind='stable')[:top_n]
    if return_scores:
        return similarities[best].tolist()
    return best.tolist()

In [151]:
# Entity embedding table searched by model_predict. It was never assigned in
# the notebook, so it is taken from the loaded weights here
# (assumes 'e_emb' is the intended table -- TODO confirm).
e_emb_w = get_emb_weights['e_emb']
pred = h + r
# Rank every entity embedding against the first prediction and list the 4 most similar.
model_predict(pred[0]  ,e_emb_w ,top_n = 4 )

[10543, 4353, 5287, 13397]

In [143]:
model_predict(t[0]  ,e_emb_w ,top_n = 4 ) # Rank the entities nearest to the ground-truth vector t; the correct answer turns out to be only the 4th entry of the pred result

[13397, 4353, 13443, 10543]

In [149]:
model_predict(t[0]  ,e_emb_w ,top_n = 4 ) # Rank the entities nearest to the ground-truth vector t and inspect the similarities; the top few are all very close. NOTE(review): the recorded output lists similarity scores rather than ids, so it presumably came from a version of model_predict that returned the scores -- confirm

[1.0000000000000004,
 0.9926496947435993,
 0.9924586673716429,
 0.9917024273408873]