In [2]:
import copy
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from tqdm import tqdm

from transformers import DebertaModel, DebertaTokenizer

# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Location of the Web of Science metadata workbook.
# NOTE(review): this is an absolute, machine-specific path; anyone re-running
# the notebook elsewhere has to edit it.
datapath = "D:/paper_code/B-003/downstream_datasets/WOS_dataset/WebOfScience/Meta-data/Data.xlsx"

# Read the workbook's first sheet (pandas' default) into a DataFrame.
raw_df = pd.read_excel(io=datapath)

In [7]:
# Preview the loaded frame; pandas elides the middle rows of a long frame.
raw_df

Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract
0,0,12,12,CS,Symbolic computation,(2+1)-dimensional non-linear optical waves; e...,(2 + 1)-dimensional non-linear optical waves t...
1,5,2,74,Medical,Alzheimer's Disease,Aging; Tau; Amyloid; PET; Alzheimer's disease...,(beta-amyloid (A beta) and tau pathology becom...
2,4,7,68,Civil,Green Building,LED lighting system; PV system; Distributed l...,(D)ecreasing of energy consumption and environ...
3,1,10,26,ECE,Electric motor,NdFeB magnets; Electric motor; Electric vehic...,(Hybrid) electric vehicles are assumed to play...
4,5,43,115,Medical,Parkinson's Disease,Parkinson's disease; dyskinesia; adenosine A(...,"(L)-3,4-Dihydroxyphenylalanine ((L)-DOPA) rema..."
...,...,...,...,...,...,...,...
46980,5,50,122,Medical,Sports Injuries,Karate; Verletzungsrisiko; Sportverletzung; P...,Zusammenfassung Hintergrund: Karate erfreut si...
46981,0,15,15,CS,Data structures,Z-Wave; Wireless; Embedded systems; Internet ...,Z-Wave is an implementation of home automation...
46982,5,38,110,Medical,Cancer,Antifouling biosensor; Peptide; Electrochemis...,Zwitterionic peptides were anchored to a condu...
46983,0,10,10,CS,Distributed computing,High Performance Computing; Parallel Computin...,ZY-3 has been acquiring high quality imagery s...


In [5]:
# Pull out the "Y" label column as a Series.
field_Y = raw_df["Y"]

In [10]:
# Spot-check a single label: the "Y" value of the row whose index label is 4.
field_Y.loc[4]

115

In [3]:
# Directory holding the locally stored DeBERTa-base checkpoint.
modelpath = "../ptm/deberta-base"

# Load the tokenizer and the bare encoder from the same checkpoint directory.
tokenizer = DebertaTokenizer.from_pretrained(modelpath)
model = DebertaModel.from_pretrained(modelpath, return_dict=True)

# Move the encoder's weights onto the selected device.
model = model.to(DEVICE)

Some weights of the model checkpoint at ../ptm/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Display raw_df again (columns: Y1, Y2, Y, Domain, area, keywords, Abstract).
raw_df

Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract
0,0,12,12,CS,Symbolic computation,(2+1)-dimensional non-linear optical waves; e...,(2 + 1)-dimensional non-linear optical waves t...
1,5,2,74,Medical,Alzheimer's Disease,Aging; Tau; Amyloid; PET; Alzheimer's disease...,(beta-amyloid (A beta) and tau pathology becom...
2,4,7,68,Civil,Green Building,LED lighting system; PV system; Distributed l...,(D)ecreasing of energy consumption and environ...
3,1,10,26,ECE,Electric motor,NdFeB magnets; Electric motor; Electric vehic...,(Hybrid) electric vehicles are assumed to play...
4,5,43,115,Medical,Parkinson's Disease,Parkinson's disease; dyskinesia; adenosine A(...,"(L)-3,4-Dihydroxyphenylalanine ((L)-DOPA) rema..."
...,...,...,...,...,...,...,...
46980,5,50,122,Medical,Sports Injuries,Karate; Verletzungsrisiko; Sportverletzung; P...,Zusammenfassung Hintergrund: Karate erfreut si...
46981,0,15,15,CS,Data structures,Z-Wave; Wireless; Embedded systems; Internet ...,Z-Wave is an implementation of home automation...
46982,5,38,110,Medical,Cancer,Antifouling biosensor; Peptide; Electrochemis...,Zwitterionic peptides were anchored to a condu...
46983,0,10,10,CS,Distributed computing,High Performance Computing; Parallel Computin...,ZY-3 has been acquiring high quality imagery s...


In [None]:
# Peek at the first abstracts. `.loc` slices by label and includes the end
# label, so `:50` selects index labels 0 through 50. The column must be an
# existing label; "Abstract" is the text column embedded in the next cell.
raw_df.loc[:50, "Abstract"]

In [5]:
# Embed every abstract with DeBERTa and mean-pool the final hidden states over
# the token axis, producing one feature vector per abstract.
model.eval()  # keep dropout off even if the model was switched to train mode earlier

pooled_feas = []  # one (1, hidden_size) CPU tensor per abstract
with torch.no_grad():  # inference only: skip building an autograd graph per sample
    for text in tqdm(raw_df["Abstract"]):
        inputs = tokenizer(text, return_tensors="pt").to(DEVICE)
        outputs = model(**inputs)
        # last_hidden_state is (1, seq_len, hidden_size); average over seq_len.
        mean_pool_fea = torch.mean(outputs.last_hidden_state, dim=1)
        # Move each result off the GPU right away so device memory stays flat.
        pooled_feas.append(mean_pool_fea.cpu())

# Concatenate once at the end: re-concatenating inside the loop copies the
# whole accumulated tensor on every iteration (quadratic in the row count).
if pooled_feas:
    all_feas = torch.cat(pooled_feas, dim=0)
else:
    # No abstracts: still define all_feas, as an empty (0, hidden_size) tensor.
    all_feas = torch.empty((0, model.config.hidden_size))
        

100%|████████████████████████████████████████████████████████████████████████████| 46985/46985 [48:05<00:00, 16.28it/s]


In [7]:
# Convert the pooled feature tensor to a NumPy array for saving.
# all_feas is already on the CPU (moved there in the extraction loop).
all_feas_np = all_feas.numpy()

In [9]:
# Sanity check: one row per abstract, one column per hidden dimension.
all_feas_np.shape

(46985, 768)

In [10]:
# Persist the pooled features so the long extraction loop need not be re-run.
feas_path = Path("draw_wos_data/wos_feas_1st_try.npy")
# np.save does not create missing directories, so make sure the target exists.
feas_path.parent.mkdir(parents=True, exist_ok=True)
np.save(feas_path, all_feas_np)