In [35]:
import os 
import sys
# Notebooks have no __file__, so the quoted literal '__file__' is used on purpose:
# its dirname is '' and abspath('') is the current working directory, which makes
# the resulting entry the parent of the working directory.
parent_dir = os.path.dirname(os.path.abspath(os.path.dirname('__file__')))
# Re-running this cell must not stack duplicate entries on sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [34]:
from transformers import BertModel, BertTokenizer

In [32]:
import torch
from kobert_transformers import get_kobert_model
from kobert_transformers import get_tokenizer

In [2]:
import torch

# Check which device is currently set up
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print ('Available devices ', torch.cuda.device_count())
# current_device() / get_device_name() are only valid when CUDA is usable,
# so they are skipped on the CPU fallback instead of raising.
if device.type == 'cuda':
    print ('Current cuda device ', torch.cuda.current_device())
    print(torch.cuda.get_device_name(device))
else:
    print('CUDA is not available; running on CPU')

Available devices  2
Current cuda device  0
GeForce RTX 2080 Ti


In [7]:
# Change the GPU allocation
GPU_NUM = 1 # index of the GPU to use
# Only build a CUDA device and call set_device() when CUDA is present:
# set_device() cannot be given the CPU fallback device.
if torch.cuda.is_available():
    device = torch.device(f'cuda:{GPU_NUM}')
    torch.cuda.set_device(device) # change allocation of current GPU
    print ('Current cuda device ', torch.cuda.current_device()) # check
else:
    device = torch.device('cpu')
    print('CUDA is not available; staying on CPU')

# Additional Infos
if device.type == 'cuda':
    print(torch.cuda.get_device_name(GPU_NUM))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(GPU_NUM)/1024**3,1), 'GB')
    # memory_reserved() is the current name of the deprecated memory_cached().
    print('Cached:   ', round(torch.cuda.memory_reserved(GPU_NUM)/1024**3,1), 'GB')

Current cuda device  1
GeForce GTX 1080
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


# Korean Tokenizer

In [35]:
# Load three tokenizers through the same BertTokenizer class so that their
# vocabularies and tokenization results can be compared in the following cells:
# 'monologg/kobert', 'bert-base-uncased' and 'bert-base-multilingual-uncased'.
kor_tokenizer = BertTokenizer.from_pretrained('monologg/kobert')
eng_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
multi_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

In [49]:
# Id that kor_tokenizer maps the token '[unused51]' to.
kor_tokenizer.convert_tokens_to_ids('[unused51]')

0

In [52]:
# Id that kor_tokenizer maps the token '[unused58]' to.
kor_tokenizer.convert_tokens_to_ids('[unused58]')

8010

In [53]:
# Write '[unused50]'..'[unused65]' straight into the vocab dict with the
# consecutive ids 8002..8017.
# NOTE(review): only the vocab dict is touched here; confirm nothing else on the
# tokenizer (or the model's embedding size) needs to be updated to match.
first_unused, first_id = 50, 8002
for offset in range(16):
    kor_tokenizer.vocab[f'[unused{first_unused + offset}]'] = first_id + offset

In [44]:
# Read the id stored for '[unused50]' directly from the vocab dict.
kor_tokenizer.vocab['[unused50]']

8002

In [23]:
# Id stored for '[unused1]' in the English tokenizer's vocab dict.
eng_tokenizer.vocab['[unused1]']

2

In [37]:
# Id stored for '[unused2]' in the multilingual tokenizer's vocab dict.
multi_tokenizer.vocab['[unused2]']

2

In [31]:
# Size reported by the tokenizer loaded from 'monologg/kobert'.
len(kor_tokenizer)

8002

In [29]:
# Keep only the vocab entries whose id is greater than 8000.
{token: token_id for token, token_id in kor_tokenizer.vocab.items() if token_id > 8000}

{'힙': 8001}

In [22]:
# Plain dict lookup: raises KeyError when '[unused53]' is not a key of the vocab dict.
kor_tokenizer.vocab['[unused53]']

KeyError: '[unused53]'

In [10]:
# Id of the tokenizer's padding token.
kor_tokenizer.convert_tokens_to_ids(kor_tokenizer.pad_token)

1

In [None]:
# Marker tokens: each role below is bound to one of the '[unusedN]' tokens.
table_marker = '[unused50]'
field_marker = '[unused51]'
value_marker = '[unused49]'
asterisk_marker = '[unused52]'
# (a literal '*' is the alternative value for asterisk_marker)
primary_key_marker = '[unused53]'
foreign_key_marker = '[unused54]'
foreign_key_ref_table_marker = '[unused55]'
foreign_key_ref_field_marker = '[unused56]'
# `bt` is not defined anywhere in this notebook, so the lookup could not run.
# The KoBERT tokenizer is the one whose '[unusedN]' ids are probed in this
# section, so it is used here.
# NOTE(review): confirm kor_tokenizer is the tokenizer `bt` was meant to be.
table_marker_id = kor_tokenizer.convert_tokens_to_ids(table_marker)

In [18]:
# Inspect the English tokenizer's added_tokens_encoder attribute.
eng_tokenizer.added_tokens_encoder

{}

In [11]:
# Inspect the KoBERT tokenizer's unique_added_tokens_encoder attribute.
kor_tokenizer.unique_added_tokens_encoder

{'[CLS]', '[MASK]', '[PAD]', '[SEP]', '[UNK]'}

In [62]:
# Tokenize a Korean sentence with kor_tokenizer.
kor_tokenizer.tokenize("[CLS] 나는 바나나 주스랑 멜론 아이스크림을 좋아해 [SEP]")

['[CLS]',
 '▁나는',
 '▁바',
 '나',
 '나',
 '▁주',
 '스',
 '랑',
 '▁',
 '멜',
 '론',
 '▁아이',
 '스크',
 '림',
 '을',
 '▁좋아',
 '해',
 '[SEP]']

In [61]:
# Tokenize an English sentence with kor_tokenizer.
kor_tokenizer.tokenize("[CLS] i love banana [SEP]")

['[CLS]', '▁', 'i', '▁', 'lo', 'v', 'e', '▁', 'b', 'an', 'an', 'a', '[SEP]']

In [26]:
# Tokenize an English sentence with eng_tokenizer.
eng_tokenizer.tokenize("[CLS] i love banana juice and melon icecream [SEP]")

['[CLS]',
 'i',
 'love',
 'banana',
 'juice',
 'and',
 'mel',
 '##on',
 'ice',
 '##cre',
 '##am',
 '[SEP]']

In [27]:
# Tokenize a Korean sentence with multi_tokenizer.
multi_tokenizer.tokenize("[CLS] 한국어 모델을 공유합니다. [SEP]")

['[CLS]',
 '한국',
 '##어',
 'ᄆ',
 '##ᅩ',
 '##데',
 '##ᆯ을',
 '고',
 '##ᆼ',
 '##유',
 '##합',
 '##니다',
 '.',
 '[SEP]']

In [28]:
# Tokenize an English sentence with multi_tokenizer.
multi_tokenizer.tokenize("[CLS] i love banana juice and melon icecream [SEP]")

['[CLS]',
 'i',
 'love',
 'banana',
 'juice',
 'and',
 'melo',
 '##n',
 'ice',
 '##cre',
 '##am',
 '[SEP]']

In [67]:
# Display the tokenizer's repr (name, vocab size, special tokens).
kor_tokenizer

PreTrainedTokenizer(name_or_path='monologg/kobert', vocab_size=8002, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

# SQLite Execution

In [5]:
import sqlite3

In [38]:
# Open the SQLite database file dev.db.
# NOTE(review): absolute, machine-specific path; consider deriving it from a configurable data directory.
con = sqlite3.connect('/repo/TabularSemanticParsing/data/wikisql1.1/dev.db')

In [39]:
# Cursor on the connection opened above; the queries in the next cells run through it.
cur = con.cursor()

In [42]:
# Query sqlite_master for the names of all tables in the database.
cur.execute("SELECT name FROM sqlite_master WHERE type='table'")

<sqlite3.Cursor at 0x7fefb8b54b90>

In [43]:
# Print every row still pending on the cursor in one list.
print(cur.fetchall())

41438_1',), ('table_2_1604842_1',), ('table_2_1604940_12',), ('table_2_16050349_10',), ('table_2_16050349_14',), ('table_2_16050349_6',), ('table_2_160510_5',), ('table_2_16056990_12',), ('table_2_16066729_1',), ('table_2_16067721_1',), ('table_2_16070554_1',), ('table_2_16078390_1',), ('table_2_1608306_4',), ('table_2_16090138_1',), ('table_2_160994_2',), ('table_2_16100029_2',), ('table_2_16100029_3',), ('table_2_1610301_1',), ('table_2_1615758_2',), ('table_2_16181680_3',), ('table_2_16183862_1',), ('table_2_16185580_1',), ('table_2_1618638_2',), ('table_2_16194551_5',), ('table_2_161972_2',), ('table_2_161972_6',), ('table_2_16215078_1',), ('table_2_162342_2',), ('table_2_16234974_1',), ('table_2_16236714_2',), ('table_2_1625631_1',), ('table_2_1625862_8',), ('table_2_16270492_2',), ('table_2_16275569_1',), ('table_2_16279834_3',), ('table_2_1628307_5',), ('table_2_16285899_1',), ('table_2_1629086_5',), ('table_2_16292316_4',), ('table_2_16293403_29',), ('table_2_16295105_1',), ('t

In [46]:
# Run a full-table SELECT and print each returned row on its own line.
rows = cur.execute('SELECT * FROM table_2_17231267_1')
for record in rows:
    print(record)

('johnny miller', 'united states', '1973', 282.0, 2.0, 't4')
('hale irwin', 'united states', '1974 , 1979', 284.0, 4.0, '6')
('lee trevino', 'united states', '1968 , 1971', 286.0, 6.0, 't9')
('tom watson', 'united states', '1982', 287.0, 7.0, 't11')
('david graham', 'australia', '1981', 287.0, 7.0, 't11')
('jack nicklaus', 'united states', '1962 , 1967 , 1972 , 1980', 289.0, 9.0, 't21')
('hubert green', 'united states', '1977', 291.0, 11.0, 't30')
('gary player', 'south africa', '1965', 294.0, 14.0, 't43')


# Pickle file

In [24]:
import pickle
import sys

In [25]:
# Show the current working directory; the relative './data/...' paths below resolve against it.
pwd

'/repo/SPARTA/code'

In [180]:
# Load the pickled dataset stored under ko_wikisql1.1 with the '.bert.pkl' suffix.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open('./data/ko_wikisql1.1/wikisql.bridge.question-split.ppl-0.85.2.dn.no_from.bert.pkl', 'rb') as f:
    engko_data = pickle.load(f)

In [181]:
# Load the pickled dataset stored under ko_wikisql1.1 with the '.kobert.pkl' suffix.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open('./data/ko_wikisql1.1/wikisql.bridge.question-split.ppl-0.85.2.dn.no_from.kobert.pkl', 'rb') as f:
    ko_data = pickle.load(f)

In [182]:
# Load the pickled dataset stored under wikisql1.1 with the '.bert.pkl' suffix.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open('./data/wikisql1.1/wikisql.bridge.question-split.ppl-0.85.2.dn.no_from.bert.pkl', 'rb') as f:
    eng_data = pickle.load(f)

In [183]:
# Load the pickled dataset stored under multi_wikisql1.1 with the '.bert.pkl' suffix.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open('./data/multi_wikisql1.1/wikisql.bridge.question-split.ppl-0.85.2.dn.no_from.bert.pkl', 'rb') as f:
    multi_data = pickle.load(f)

In [186]:
# Pretty-print training examples 50..54 from each of the four loaded datasets,
# one labelled section per dataset, so the same example can be compared across them.
labelled_splits = (
    ("English BERT", eng_data),
    ("Eng+Kor BERT", engko_data),
    ("Korean BERT", ko_data),
    ("Multi-Lingual BERT", multi_data),
)
banner = '#'*20
rule = '-'*20
for i in range(50, 55):
    print(banner)
    for label, split_data in labelled_splits:
        print(rule)
        print(label)
        split_data['train'][i].pretty_print()
    

####################
--------------------
English BERT
NL: b"what's the\xc2\xa0singles w-l\xc2\xa0for kim doo-hwan"
NL tokens: [b'what', b"'", b's', b'the', b'singles', b'w', b'-', b'l', b'for', b'kim', b'doo', b'-', b'h', b'##wan']
NL tokens (original): [b'what', b"'", b's', b'the', b'singles', b'w', b'-', b'l', b'for', b'kim', b'doo', b'-', b'h', b'##wan']
['[CLS]', 'what', "'", 's', 'the', 'singles', 'w', '-', 'l', 'for', 'kim', 'doo', '-', 'h', '##wan', '[SEP]', '[unused52]', '[unused50]', 'squad', 'members', '[unused51]', 'player', '[unused49]', 'kim', 'doo', '-', 'h', '##wan', '[unused51]', 'years', 'played', '[unused51]', 'total', 'w', '-', 'l', '[unused51]', 'singles', 'w', '-', 'l', '[unused51]', 'doubles', 'w', '-', 'l', '[SEP]']
Target 0: b''
b"Target form: [{'sel': 3, 'conds': [[0, 0, 'Kim Doo-Hwan']], 'agg': 0}]"

--------------------
Eng+Kor BERT
NL: b'\xea\xb9\x80\xeb\x91\x90\xed\x99\x98\xec\x9d\x98 \xec\x8b\xb1\xea\xb8\x80\xec\x9d\x80 \xeb\xad\x90\xec\x95\xbc'
NL tokens

In [177]:
# Pretty-print example 100 of the 'dev' split of engko_data.
engko_data['dev'][100].pretty_print()

NL: b'\xeb\xa7\x88\xec\x9d\xb4\xec\x95\xa0\xeb\xaf\xb8\xea\xb0\x80 \xec\xb2\xab \xeb\xb2\x88\xec\xa7\xb8 \xec\xa4\x80\xea\xb2\xb0\xec\x8a\xb9 \xec\xa7\x84\xec\xb6\x9c \xec\x9e\x90\xeb\xa1\x9c \xec\x84\xa0\xec\xa0\x95\xeb\x90\x98\xec\x97\x88\xec\x9d\x84 \xeb\x95\x8c \xeb\xaa\xa8\xeb\x93\xa0 \xea\xb2\xbd\xea\xb8\xb0\xec\x9d\x98 \xec\xa0\x90\xec\x88\x98\xeb\xa5\xbc \xea\xb8\xb0\xec\x9e\xac\xed\x95\x98\xec\x8b\xad\xec\x8b\x9c\xec\x98\xa4.'
NL tokens: [b'\xeb\xa7\x88', b'\xec\x9d\xb4', b'\xec\x95\xa0', b'\xeb\xaf\xb8', b'\xea\xb0\x80', b'\xec\xb2\xab', b'\xeb\xb2\x88', b'\xec\xa7\xb8', b'\xec\xa4\x80', b'\xea\xb2\xb0', b'\xec\x8a\xb9', b'\xec\xa7\x84', b'\xec\xb6\x9c', b'\xec\x9e\x90', b'\xeb\xa1\x9c', b'\xec\x84\xa0', b'\xec\xa0\x95', b'\xeb\x90\x98\xec\x97\x88', b'\xec\x9d\x84', b'\xeb\x95\x8c', b'\xeb\xaa\xa8', b'\xeb\x93\xa0', b'\xea\xb2\xbd', b'\xea\xb8\xb0', b'\xec\x9d\x98', b'\xec\xa0\x90', b'\xec\x88\x98\xeb\xa5\xbc', b'\xea\xb8\xb0', b'\xec\x9e\xac', b'\xed\x95\x98', b'\xec\x8b\xad

In [178]:
# Pretty-print example 100 of the 'dev' split of ko_data.
ko_data['dev'][100].pretty_print()

NL: b'\xeb\xa7\x88\xec\x9d\xb4\xec\x95\xa0\xeb\xaf\xb8\xea\xb0\x80 \xec\xb2\xab \xeb\xb2\x88\xec\xa7\xb8 \xec\xa4\x80\xea\xb2\xb0\xec\x8a\xb9 \xec\xa7\x84\xec\xb6\x9c \xec\x9e\x90\xeb\xa1\x9c \xec\x84\xa0\xec\xa0\x95\xeb\x90\x98\xec\x97\x88\xec\x9d\x84 \xeb\x95\x8c \xeb\xaa\xa8\xeb\x93\xa0 \xea\xb2\xbd\xea\xb8\xb0\xec\x9d\x98 \xec\xa0\x90\xec\x88\x98\xeb\xa5\xbc \xea\xb8\xb0\xec\x9e\xac\xed\x95\x98\xec\x8b\xad\xec\x8b\x9c\xec\x98\xa4.'
NL tokens: [b'[UNK]', b'\xec\xb2\xab', b'\xeb\xb2\x88\xec\xa7\xb8', b'[UNK]', b'[UNK]', b'\xec\x9e\x90\xeb\xa1\x9c', b'[UNK]', b'\xeb\x95\x8c', b'[UNK]', b'[UNK]', b'[UNK]', b'[UNK]', b'.']
NL tokens (original): [b'\xeb\xa7\x88', b'\xec\x9d\xb4', b'\xec\x95\xa0\xeb\xaf\xb8', b'\xea\xb0\x80', b'\xec\xb2\xab', b'\xeb\xb2\x88\xec\xa7\xb8', b'\xec\xa4\x80', b'\xea\xb2\xb0', b'\xec\x8a\xb9', b'\xec\xa7\x84', b'\xec\xb6\x9c', b'\xec\x9e\x90', b'\xeb\xa1\x9c']
['[CLS]', '[UNK]', '첫', '번째', '[UNK]', '[UNK]', '자로', '[UNK]', '때', '[UNK]', '[UNK]', '[UNK]', '[UNK]'

In [179]:
# NOTE(review): no variable named `data` is defined in this notebook (the recorded
# run ended in NameError); presumably one of eng_data / engko_data / ko_data /
# multi_data was intended — confirm which.
data['train'][100].pretty_print()

NameError: name 'data' is not defined

In [57]:
# Pretty-print example 0 of the 'train' split of multi_data.
multi_data['train'][0].pretty_print()

NL: b'\xeb\x82\xa8\xed\x98\xb8\xec\xa3\xbc\xec\x97\x90 \xeb\x8c\x80\xed\x95\x9c \xeb\xa9\x94\xeb\xaa\xa8\xea\xb0\x80 \xeb\xac\xb4\xec\x97\x87\xec\x9d\xb8\xec\xa7\x80 \xec\x95\x8c\xeb\xa0\xa4\xec\xa3\xbc\xec\x84\xb8\xec\x9a\x94.'
NL tokens: [b'\xe1\x84\x82', b'##\xe1\x85\xa1\xe1\x86\xb7', b'##\xe1\x84\x92\xe1\x85\xa9', b'##\xe1\x84\x8c\xe1\x85\xae', b'##\xe1\x84\x8b\xe1\x85\xa6', b'\xe1\x84\x83\xe1\x85\xa2\xe1\x84\x92\xe1\x85\xa1\xe1\x86\xab', b'\xe1\x84\x86', b'##\xe1\x85\xa6', b'##\xe1\x84\x86\xe1\x85\xa9', b'##\xe1\x84\x80\xe1\x85\xa1', b'\xe1\x84\x86', b'##\xe1\x85\xae\xe1\x84\x8b\xe1\x85\xa5', b'##\xe1\x86\xba\xe1\x84\x8b\xe1\x85\xb5', b'##\xe1\x86\xab', b'##\xe1\x84\x8c\xe1\x85\xb5', b'\xe1\x84\x8b\xe1\x85\xa1\xe1\x86\xaf', b'##\xe1\x84\x85\xe1\x85\xa7', b'##\xe1\x84\x8c\xe1\x85\xae', b'##\xe1\x84\x89\xe1\x85\xa6', b'##\xe1\x84\x8b\xe1\x85\xad', b'.']
NL tokens (original): [b'\xe1\x84\x82', b'##\xe1\x85\xa1\xe1\x86\xb7', b'##\xe1\x84\x92\xe1\x85\xa9', b'##\xe1\x84\x8c\xe1\x85\xae'

In [145]:
import torch
from transformers import BertTokenizer, BertModel

# Load the tokenizer and the model from the same 'bert-base-cased' checkpoint name.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
model = BertModel.from_pretrained("bert-base-cased")

In [146]:
# Tokenize a Korean sentence with the 'bert-base-cased' tokenizer.
tokenizer.tokenize('내 몸이 너무 아파')

['[UNK]', '[UNK]', '[UNK]', '[UNK]']

In [29]:
import torch
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
model = BertModel.from_pretrained("bert-base-cased")

print(len(tokenizer))  # 28996
tokenizer.add_tokens(["NEW_TOKEN"])
print(len(tokenizer))  # 28997

# Grow the embedding matrix to match the enlarged tokenizer;
# the new vector is added at the end of the embedding matrix.
model.resize_token_embeddings(len(tokenizer))

print(model.embeddings.word_embeddings.weight[-1, :])
# Randomly initialized vector for the new token

# The embedding weight is a leaf parameter that requires grad, so an in-place
# write into it has to happen with autograd disabled.
with torch.no_grad():
    model.embeddings.word_embeddings.weight[-1, :] = torch.zeros([model.config.hidden_size])

print(model.embeddings.word_embeddings.weight[-1, :])
# outputs a vector of zeros of shape [768]

HBox(children=(IntProgress(value=0, description='Downloading', max=213450, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=433, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=435779157, style=ProgressStyle(description_…


28996
28997
tensor([ 2.0105e-02,  4.1210e-02, -8.6391e-03, -1.2895e-02, -2.6548e-02,
         1.2749e-02, -2.8585e-02,  2.9357e-02, -2.8315e-02, -1.6660e-03,
        -2.8184e-02, -2.4176e-02, -1.1917e-02, -5.0954e-02, -5.3747e-03,
        -2.3604e-02,  1.5609e-02,  8.0757e-03, -1.1517e-02, -8.6824e-03,
         6.1110e-04,  1.4534e-02, -5.8289e-03,  1.3068e-02, -2.6396e-02,
        -1.5126e-02, -4.6017e-03, -3.7856e-03, -3.0076e-02,  2.6408e-02,
         4.2605e-03,  4.5580e-03, -1.6992e-02,  8.4462e-03,  1.6670e-02,
        -7.7514e-03, -3.0544e-02,  1.1164e-02,  8.2609e-04, -9.2539e-03,
         3.8989e-02, -2.1804e-02,  9.4779e-03,  1.6304e-03, -2.1960e-02,
        -1.8996e-02,  2.2673e-02,  9.4029e-03,  5.8221e-03, -1.2463e-02,
        -5.4119e-03,  2.4861e-02, -6.2257e-04,  4.0769e-02, -1.2550e-02,
         1.2010e-02, -1.3524e-02, -1.0534e-02, -2.9601e-02,  1.2069e-02,
        -2.6971e-02,  2.8995e-02,  7.6551e-03, -2.7225e-02, -2.7898e-02,
         3.7611e-03, -1.0652e-02, -2.4

In [119]:
# Load a fresh tokenizer instance from 'monologg/kobert'.
kor_tokenizer = BertTokenizer.from_pretrained('monologg/kobert')

In [166]:
# Number of entries in the tokenizer's vocab dict.
len(tokenizer.vocab)

36615

In [94]:
# Size reported by kor_tokenizer.
len(kor_tokenizer)

8002

In [120]:
# All token strings (dict keys) of the KoBERT vocab, as a list.
kor_words = list(kor_tokenizer.vocab.keys())

In [88]:
# Id stored for the token 'air' in the tokenizer's vocab dict.
tokenizer.vocab['air']

1586

In [167]:
# Add every KoBERT vocab entry that is not already a key of tokenizer.vocab.
for kw in kor_words:
    if kw not in tokenizer.vocab:
        # Replace the '▁' character with '#' before the token is added.
        # NOTE(review): the membership test above used the original spelling, not
        # the rewritten one — confirm that is intended.
        if '▁' in kw:
            kw = kw.replace('▁', '#')
        tokenizer.add_tokens([kw])
        # NOTE(review): the token is also written directly into the vocab dict with
        # id len(vocab), in addition to add_tokens() above — confirm both
        # registrations are wanted and agree on the id.
        tokenizer.vocab[kw] = len(tokenizer.vocab)
# NOTE(review): the model's embedding matrix is not resized here; the call is left disabled:
# model.resize_token_embeddings(len(tokenizer)) 


In [168]:
# Number of entries in the tokenizer's vocab dict.
len(tokenizer.vocab)

36615

In [156]:
# Tokenize a Korean phrase with `tokenizer`.
tokenizer.tokenize('사랑할 때마다')

['사', '랑', '할', '때', '마다']

In [123]:
# Write the tokenizer's files into the current directory; the call returns the saved file paths.
tokenizer.save_pretrained('.')

('./vocab.txt', './special_tokens_map.json', './added_tokens.json')

In [125]:
# Number of entries in the tokenizer's vocab dict.
len(tokenizer.vocab)

28996

TypeError: object of type 'BertModel' has no len()