In [2]:
import os 
import sys
# NOTE: '__file__' is a string literal here, not the module variable, so
# os.path.dirname('__file__') evaluates to '' and os.path.abspath('') resolves
# to the current working directory. The line therefore appends the PARENT of
# the working directory to sys.path (presumably so that project modules one
# level above the notebook can be imported -- TODO confirm).
sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname('__file__'))))

from transformers import BertModel, BertTokenizer

import torch
from kobert_transformers import get_kobert_model
from kobert_transformers import get_tokenizer

import torch

# Check which device is currently set up; fall back to CPU when CUDA is absent.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print ('Available devices ', torch.cuda.device_count())
if device.type == 'cuda':
    # These queries are only valid with a CUDA runtime: current_device() and
    # get_device_name() raise on a CPU-only machine, so they are guarded.
    print ('Current cuda device ', torch.cuda.current_device())
    print(torch.cuda.get_device_name(device))
else:
    print('CUDA is not available; using CPU')

Available devices  2
Current cuda device  0
GeForce RTX 2080 Ti


In [7]:
# Change which GPU is used for subsequent CUDA allocations.
GPU_NUM = 1 # index of the GPU to use
device = torch.device(f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')

if device.type == 'cuda':
    # set_device()/current_device() only accept CUDA devices; calling them with
    # the CPU fallback raises, so they run only when CUDA was actually selected.
    torch.cuda.set_device(device) # change allocation of current GPU
    print ('Current cuda device ', torch.cuda.current_device()) # check

    # Additional Infos
    # memory_cached() is deprecated in favour of memory_reserved(); fall back to
    # the old name only on torch builds that do not provide the new one.
    reserved_bytes_fn = getattr(torch.cuda, 'memory_reserved', None) or torch.cuda.memory_cached
    print(torch.cuda.get_device_name(GPU_NUM))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(GPU_NUM)/1024**3,1), 'GB')
    print('Cached:   ', round(reserved_bytes_fn(GPU_NUM)/1024**3,1), 'GB')
else:
    print('CUDA is not available; using CPU')

Current cuda device  1
GeForce GTX 1080
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


# Test of Tokenizers

In [51]:
# Set tokenizers
# Builds every tokenizer that the comparison cell below exercises.

from kobert_transformers import get_tokenizer
# Tokenizer object handed back by kobert_transformers.get_tokenizer().
kobert_tokenizer = get_tokenizer()

# from_pretrained is called on the instance obtained above to load
# 'monologg/kobert' (presumably the same checkpoint get_tokenizer() already
# uses -- TODO confirm whether this second load is needed).
kor_tokenizer = kobert_tokenizer.from_pretrained('monologg/kobert')
# Same checkpoint name loaded through the generic BertTokenizer class, kept for
# comparison; the comparison cell reports that this variant does not work
# properly on Korean input.
monologg_tokenizer = BertTokenizer.from_pretrained('monologg/kobert')

# Reference tokenizers: English-only and the two multilingual BERT vocabularies.
eng_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
multi_uncased_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
multi_cased_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [59]:
# Example inputs for the tokenizer comparison. The [CLS]/[SEP] markers are
# written directly into the strings rather than added by the tokenizer.
# Sentence 1: Korean, English, and mixed Korean/English variants.
kor_example_sentence1 = "[CLS] 우리 둘의 마지막 페이지를 잘 부탁해 [SEP]"
eng_example_sentence1 = "[CLS] Please take care of the last page of both of us. [SEP]"
kor_eng_example_sentence1 = "[CLS] Our 둘의 Last Page를 잘 부탁해 [SEP]"

# Sentence 2: Korean, English, and mixed Korean/English variants.
kor_example_sentence2 = "[CLS] 처음 만난 그날처럼 예쁘다고 말해줄래 [SEP]"
eng_example_sentence2 = "[CLS] Can you tell me you're as pretty as the day we first met? [SEP]"
kor_eng_example_sentence2 = "[CLS] 처음 만난 that day처럼 pretty 말해줄래 [SEP]"

In [75]:
# Compare how each tokenizer splits the Korean, English and mixed sentences.
# The raw inputs are echoed first so the token lists can be read against them.
print(f'Korean Example Sentence1: {kor_example_sentence1}')
print(f'English Example Sentence1: {eng_example_sentence1}')
print(f'Kor & Eng Example Sentence1: {kor_eng_example_sentence1}')
print('')

print(f'Korean Example Sentence2: {kor_example_sentence2}')
print(f'English Example Sentence2: {eng_example_sentence2}')
print(f'Kor & Eng Example Sentence2: {kor_eng_example_sentence2}')
print('')

# KoBERT tokenizer on Korean and on mixed input.
print('[Kobert Tokenizer]')
print(f'Korean1 > Kobert tokenizer: \n{kor_tokenizer.tokenize(kor_example_sentence1)}')
print(f'Korean2 > Kobert tokenizer: \n{kor_tokenizer.tokenize(kor_example_sentence2)}')
print(f'Kor & Eng2 > Kobert tokenizer: \n {kor_tokenizer.tokenize(kor_eng_example_sentence2)}')
print('='*100, '\n')

# The 'monologg/kobert' checkpoint loaded through the plain BertTokenizer class.
print('[monologg Tokenizer]')
print("Monologg Tokenizer doesn't work properly, needs Kobert Tokenizer to fix it")
print(f'Korean1 > monologg tokenizer: \n{monologg_tokenizer.tokenize(kor_example_sentence1)}')
print(f'Korean2 > monologg tokenizer: \n{monologg_tokenizer.tokenize(kor_example_sentence2)}')
print('='*100, '\n')

# English-only BERT on English input.
print('[Bert-base-uncased Tokenizer]')
print(f'English1 > English tokenizer: \n{eng_tokenizer.tokenize(eng_example_sentence1)}')
print(f'English2 > English tokenizer: \n{eng_tokenizer.tokenize(eng_example_sentence2)}')
print('='*100, '\n')

# English-only BERT on the mixed Korean/English input.
print('[Bert-base-uncased Tokenizer]')

print(f'Kor & Eng1 > English Tokenizer: \n {eng_tokenizer.tokenize(kor_eng_example_sentence1)}')
print(f'Kor & Eng2 > English Tokenizer: \n {eng_tokenizer.tokenize(kor_eng_example_sentence2)}')
print('='*100, '\n')

# Multilingual (uncased) BERT. The first two labels used to read "monologg
# tokenizer" / "English tokenizer" (copy-paste leftovers) although the
# multilingual tokenizer produces these results; they now name it correctly.
print('[Bert-base-multilingual-uncased Tokenizer]')
print(f'Korean1 > Multi (Uncased) Tokenizer: \n{multi_uncased_tokenizer.tokenize(kor_example_sentence1)}')
print(f'English1 > Multi (Uncased) Tokenizer: \n{multi_uncased_tokenizer.tokenize(eng_example_sentence1)}')

print(f'Kor & Eng1 > Multi (Uncased) Tokenizer: \n {multi_uncased_tokenizer.tokenize(kor_eng_example_sentence1)}')
print(f'Kor & Eng2 > Multi (Uncased) Tokenizer: \n {multi_uncased_tokenizer.tokenize(kor_eng_example_sentence2)}')

print('='*100, '\n')

# Multilingual (cased) BERT; same label correction as the uncased section.
print('[Bert-base-multilingual-cased Tokenizer]')
print(f'Korean2 > Multi (Cased) Tokenizer: \n{multi_cased_tokenizer.tokenize(kor_example_sentence2)}')
print(f'English2 > Multi (Cased) Tokenizer: \n{multi_cased_tokenizer.tokenize(eng_example_sentence2)}')

print(f'Kor & Eng1 > Multi (Cased) Tokenizer: \n {multi_cased_tokenizer.tokenize(kor_eng_example_sentence1)}')
print(f'Kor & Eng2 > Multi (Cased) Tokenizer: \n {multi_cased_tokenizer.tokenize(kor_eng_example_sentence2)}')

Korean Example Sentence1: [CLS] 우리 둘의 마지막 페이지를 잘 부탁해 [SEP]
English Example Sentence1: [CLS] Please take care of the last page of both of us. [SEP]
Kor & Eng Example Sentence1: [CLS] Our 둘의 Last Page를 잘 부탁해 [SEP]

Korean Example Sentence2: [CLS] 처음 만난 그날처럼 예쁘다고 말해줄래 [SEP]
English Example Sentence2: [CLS] Can you tell me you're as pretty as the day we first met? [SEP]
Kor & Eng Example Sentence2: [CLS] 처음 만난 that day처럼 pretty 말해줄래 [SEP]

[Kobert Tokenizer]
Korean1 > Kobert tokenizer: 
['[CLS]', '▁우리', '▁둘', '의', '▁마지막', '▁페', '이', '지를', '▁잘', '▁부탁', '해', '[SEP]']
Korean2 > Kobert tokenizer: 
['[CLS]', '▁처음', '▁만난', '▁그', '날', '처럼', '▁예쁘', '다', '고', '▁말해', '줄', '래', '[SEP]']
Kor & Eng2 > Kobert tokenizer: 
 ['[CLS]', '▁처음', '▁만난', '▁', 'th', 'at', '▁', 'd', 'ay', '처럼', '▁', 'p', 're', 't', 't', 'y', '▁말해', '줄', '래', '[SEP]']

[monologg Tokenizer]
Monologg Tokenizer doesn't work properly, needs Kobert Tokenizer to fix it
Korean1 > monologg tokenizer: 
['[CLS]', '우리', '[UNK]', '[UNK]', '[UN

# SQLite Execution

In [5]:
import sqlite3

In [38]:
# Open the WikiSQL dev database and list the tables it contains.
# NOTE(review): the path is absolute and machine-specific; sqlite3.connect()
# creates an empty database file when the path does not exist, so a wrong path
# shows up as "no tables" rather than as an error. The connection is never
# closed in this notebook.
con = sqlite3.connect('/repo/TabularSemanticParsing/data/wikisql1.1/dev.db')

cur = con.cursor()
# sqlite_master is SQLite's schema table; this selects the name of every table.
cur.execute("SELECT name FROM sqlite_master WHERE type='table'")

In [None]:
# Fetch and print all remaining rows of the cursor's most recent query
# (the table-name listing when the cells are run in order).
print(cur.fetchall())

In [46]:
# Print every row of one hard-coded table from the opened database.
for row in cur.execute('SELECT * FROM table_2_17231267_1'):
    print(row)

('johnny miller', 'united states', '1973', 282.0, 2.0, 't4')
('hale irwin', 'united states', '1974 , 1979', 284.0, 4.0, '6')
('lee trevino', 'united states', '1968 , 1971', 286.0, 6.0, 't9')
('tom watson', 'united states', '1982', 287.0, 7.0, 't11')
('david graham', 'australia', '1981', 287.0, 7.0, 't11')
('jack nicklaus', 'united states', '1962 , 1967 , 1972 , 1980', 289.0, 9.0, 't21')
('hubert green', 'united states', '1977', 291.0, 11.0, 't30')
('gary player', 'south africa', '1965', 294.0, 14.0, 't43')


# Pickle file

In [1]:
import pickle
import sys

In [180]:
# Load the pickled ko_wikisql1.1 data preprocessed for '.bert' into engko_data
# (labelled "Eng+Kor BERT" in the comparison loop of this notebook).
# NOTE(review): pickle.load can execute arbitrary code -- only use on trusted
# files. Unpickling presumably needs the project's classes to be importable.
with open('./data/ko_wikisql1.1/wikisql.bridge.question-split.ppl-0.85.2.dn.no_from.bert.pkl', 'rb') as f:
    engko_data = pickle.load(f)

In [181]:
# Load the pickled ko_wikisql1.1 data preprocessed for '.kobert' into ko_data
# (labelled "Korean BERT" in the comparison loop of this notebook).
# NOTE(review): pickle.load can execute arbitrary code -- only use on trusted
# files.
with open('./data/ko_wikisql1.1/wikisql.bridge.question-split.ppl-0.85.2.dn.no_from.kobert.pkl', 'rb') as f:
    ko_data = pickle.load(f)

In [182]:
# Load the pickled wikisql1.1 data preprocessed for '.bert' into eng_data
# (labelled "English BERT" in the comparison loop of this notebook).
# NOTE(review): pickle.load can execute arbitrary code -- only use on trusted
# files.
with open('./data/wikisql1.1/wikisql.bridge.question-split.ppl-0.85.2.dn.no_from.bert.pkl', 'rb') as f:
    eng_data = pickle.load(f)

In [8]:
# Load the pickled multi_wikisql1.1 data ('.bert.multilingual' variant) into
# mult_data. Note the name: this is mult_data, distinct from the multi_data
# variable loaded from the '.eo.' file elsewhere in this notebook.
# NOTE(review): pickle.load can execute arbitrary code -- only use on trusted
# files.
with open('./data/multi_wikisql1.1/wikisql.bridge.question-split.ppl-0.85.2.dn.no_from.bert.multilingual.pkl', 'rb') as f:
    mult_data = pickle.load(f)

In [9]:
# Pretty-print the first entry of the 'train' split of mult_data.
mult_data['train'][0].pretty_print()

NL: b'South Australia\xec\x9d\x98 notes\xec\x9d\xb4 \xeb\xad\x94\xec\xa7\x80 \xeb\xa7\x90\xed\x95\xb4\xec\xa4\x98'
NL tokens: [b'South', b'Australia', b'##\xec\x9d\x98', b'notes', b'##\xec\x9d\xb4', b'\xeb\xad\x94', b'##\xec\xa7\x80', b'\xeb\xa7\x90', b'##\xed\x95\xb4', b'##\xec\xa4\x98']
NL tokens (original): [b'South', b'Australia', b'##\xec\x9d\x98', b'notes', b'##\xec\x9d\xb4', b'\xeb\xad\x94', b'##\xec\xa7\x80', b'\xeb\xa7\x90', b'##\xed\x95\xb4', b'##\xec\xa4\x98']
['[CLS]', 'South', 'Australia', '##의', 'notes', '##이', '뭔', '##지', '말', '##해', '##줘', '[SEP]', '[unused52]', '[unused50]', 'State', '/', 'territory', '[unused51]', 'State', '/', 'territory', '[unused49]', 'South', 'Australia', '[unused51]', 'Text', '/', 'background', 'colour', '[unused51]', 'Format', '[unused51]', 'Current', 'slogan', '[unused51]', 'Current', 'series', '[unused51]', 'Notes', '[SEP]']
Target 0: b''
b"Target form: [{'sel': 5, 'conds': [[3, 0, 'SOUTH AUSTRALIA']], 'agg': 0}]"



In [5]:
# Load the pickled multi_wikisql1.1 data ('.eo.bert.multilingual' variant) into
# multi_data (labelled "Multi-Lingual BERT" in the comparison loop of this
# notebook).
# NOTE(review): pickle.load can execute arbitrary code -- only use on trusted
# files.
with open('./data/multi_wikisql1.1/wikisql.bridge.question-split.ppl-0.85.2.dn.no_from.eo.bert.multilingual.pkl', 'rb') as f:
    multi_data = pickle.load(f)

In [7]:
# Pretty-print the first entry of the 'train' split of multi_data.
multi_data['train'][0].pretty_print()

NL: b'South Australia\xec\x9d\x98 notes\xec\x9d\xb4 \xeb\xad\x94\xec\xa7\x80 \xeb\xa7\x90\xed\x95\xb4\xec\xa4\x98'
NL tokens: [b'South', b'Australia', b'##\xec\x9d\x98', b'notes', b'##\xec\x9d\xb4', b'\xeb\xad\x94', b'##\xec\xa7\x80', b'\xeb\xa7\x90', b'##\xed\x95\xb4', b'##\xec\xa4\x98']
NL tokens (original): [b'South', b'Australia', b'##\xec\x9d\x98', b'notes', b'##\xec\x9d\xb4', b'\xeb\xad\x94', b'##\xec\xa7\x80', b'\xeb\xa7\x90', b'##\xed\x95\xb4', b'##\xec\xa4\x98']
['[CLS]', 'South', 'Australia', '##의', 'notes', '##이', '뭔', '##지', '말', '##해', '##줘', '[SEP]', '[unused52]', '[unused50]', 'State', '/', 'territory', '[unused51]', 'State', '/', 'territory', '[unused49]', 'South', 'Australia', '[unused51]', 'Text', '/', 'background', 'colour', '[unused51]', 'Format', '[unused51]', 'Current', 'slogan', '[unused51]', 'Current', 'series', '[unused51]', 'Notes', '[SEP]']
Target 0: b''
b"Target form: [{'sel': 5, 'conds': [[3, 0, 'SOUTH AUSTRALIA']], 'agg': 0}]"



In [None]:
# Show training examples 50..54 from each preprocessed dataset back to back so
# the different tokenizations of the same example can be compared by eye.
datasets_by_label = (
    ("English BERT", eng_data),
    ("Eng+Kor BERT", engko_data),
    ("Korean BERT", ko_data),
    ("Multi-Lingual BERT", multi_data),
)

for i in range(50, 55):
    # A '#' rule separates examples; a '-' rule precedes each dataset's view.
    print('#'*20)
    for label, dataset in datasets_by_label:
        print('-'*20)
        print(label)
        dataset['train'][i].pretty_print()
    

# Attempts to resize the Embedding of BERT

In [145]:
import torch
from transformers import BertTokenizer, BertModel

# Load the tokenizer and the encoder weights of the 'bert-base-cased' checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
model = BertModel.from_pretrained("bert-base-cased")

In [146]:
# Probe: tokenize a Korean sentence with the bert-base-cased vocabulary.
# The recorded output is all '[UNK]', i.e. this vocabulary does not cover it.
tokenizer.tokenize('내 몸이 너무 아파')

['[UNK]', '[UNK]', '[UNK]', '[UNK]']

In [29]:
import torch
from transformers import BertTokenizer, BertModel

# Demonstrates growing BERT's vocabulary by one token and overwriting the new
# embedding row.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
model = BertModel.from_pretrained("bert-base-cased")

print(len(tokenizer))  # 28996
tokenizer.add_tokens(["NEW_TOKEN"])
print(len(tokenizer))  # 28997

# Grow the embedding matrix to match the enlarged vocabulary.
model.resize_token_embeddings(len(tokenizer))
# The new vector is added at the end of the embedding matrix

print(model.embeddings.word_embeddings.weight[-1, :])
# Randomly initialised vector for the newly added token

# The embedding weight is a leaf parameter that requires grad, so writing into
# it in place raises a RuntimeError unless autograd tracking is disabled.
with torch.no_grad():
    model.embeddings.word_embeddings.weight[-1, :] = torch.zeros([model.config.hidden_size])

print(model.embeddings.word_embeddings.weight[-1, :])
# outputs a vector of zeros of length model.config.hidden_size
# (768 according to the original note)

HBox(children=(IntProgress(value=0, description='Downloading', max=213450, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=433, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=435779157, style=ProgressStyle(description_…


28996
28997
tensor([ 2.0105e-02,  4.1210e-02, -8.6391e-03, -1.2895e-02, -2.6548e-02,
         1.2749e-02, -2.8585e-02,  2.9357e-02, -2.8315e-02, -1.6660e-03,
        -2.8184e-02, -2.4176e-02, -1.1917e-02, -5.0954e-02, -5.3747e-03,
        -2.3604e-02,  1.5609e-02,  8.0757e-03, -1.1517e-02, -8.6824e-03,
         6.1110e-04,  1.4534e-02, -5.8289e-03,  1.3068e-02, -2.6396e-02,
        -1.5126e-02, -4.6017e-03, -3.7856e-03, -3.0076e-02,  2.6408e-02,
         4.2605e-03,  4.5580e-03, -1.6992e-02,  8.4462e-03,  1.6670e-02,
        -7.7514e-03, -3.0544e-02,  1.1164e-02,  8.2609e-04, -9.2539e-03,
         3.8989e-02, -2.1804e-02,  9.4779e-03,  1.6304e-03, -2.1960e-02,
        -1.8996e-02,  2.2673e-02,  9.4029e-03,  5.8221e-03, -1.2463e-02,
        -5.4119e-03,  2.4861e-02, -6.2257e-04,  4.0769e-02, -1.2550e-02,
         1.2010e-02, -1.3524e-02, -1.0534e-02, -2.9601e-02,  1.2069e-02,
        -2.6971e-02,  2.8995e-02,  7.6551e-03, -2.7225e-02, -2.7898e-02,
         3.7611e-03, -1.0652e-02, -2.4