In [1]:
from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
import tensorflow_hub as hub

In [2]:
# Load Tokenizer and Config
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
config = BertConfig.from_pretrained('indobenchmark/indobert-base-p1')
config.num_labels = 3

# Instantiate model
model = BertForSequenceClassification.from_pretrained('indobenchmark/indobert-base-p1', config=config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model

In [6]:
elmo = hub.load('https://tfhub.dev/google/elmo/2')

In [7]:
elmo

<tensorflow.python.training.tracking.tracking.AutoTrackable at 0x22ecc139fa0>

In [3]:
import torch
torch.manual_seed(0)
from transformers import BertTokenizer, BertModel

import logging
import matplotlib.pyplot as plt

In [4]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1', do_lower_case=True)

In [57]:
import pandas as pd
df_full = pd.read_csv('entailment_dataset_1.csv')
df = df_full.iloc[[0,1,2,18,19,20,60, 61, 62, 63,64,65],:]

In [59]:
for row in df['Kalimat Premis']:
    print(row)

Tiga kali Devano menikah, dan ketiga istrinya itu pun meninggal dengan mengenaskan
Tiga kali Devano menikah, dan ketiga istrinya itu pun meninggal dengan mengenaskan
Tiga kali Devano menikah, dan ketiga istrinya itu pun meninggal dengan mengenaskan
1942, adalah kali terakhir kami melihat orang orang itu menginjakan kaki di tanah ini. 
1942, adalah kali terakhir kami melihat orang orang itu menginjakan kaki di tanah ini. 
1942, adalah kali terakhir kami melihat orang orang itu menginjakan kaki di tanah ini. 
Pada  masa  pendudukan  Jepang,  Gua  Jepang  digunakan  sebagai  benteng perlindungan tentara Jepang dari serangan musuh
Pada  masa  pendudukan  Jepang,  Gua  Jepang  digunakan  sebagai  benteng perlindungan tentara Jepang dari serangan musuh
Pada  masa  pendudukan  Jepang,  Gua  Jepang  digunakan  sebagai  benteng perlindungan tentara Jepang dari serangan musuh
Kedatangan  Jepang  di  Indonesia  pada  awalnya  disambut  dengan  senang hati  oleh  rakyat  Indonesia. 
Kedatangan  Je

In [120]:
sentences = [row for row in df['Kalimat Premis']]
sentlabels = [row for row in df['Kalimat Hypothesis']]
sentences += sentlabels
sentences.append("Kedatangan  Jepang  di  Indonesia  pada  awalnya  disambut  dengan  senang hati  oleh  rakyat  di negara ini")
# Print the original sentence.
print(' Teks Asli: ', sentences[12][:99])
# Print the sentence splitted into tokens.
print('Tokenisasi: ', tokenizer.tokenize(sentences[0])[:15])
# Print the sentence mapped to token ids.
print('     Hasil: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0]))[:15])

 Teks Asli:  Devano adalah seorang pembunuh
Tokenisasi:  ['tiga', 'kali', 'dev', '##ano', 'menikah', ',', 'dan', 'ketiga', 'istrinya', 'itu', 'pun', 'meninggal', 'dengan', 'mengen', '##askan']
     Hasil:  [1224, 633, 5780, 13126, 3335, 30468, 41, 2243, 6341, 137, 573, 1851, 79, 753, 1656]


In [121]:
# Tokenize all of the sentences and map tokens to word IDs.
input_ids = []
attention_masks = []
tokenized_texts = []
for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
                        sent,                      
                        add_special_tokens = True,
                        truncation=True,
                        max_length = 48,          
                        pad_to_max_length = True,                        
                        return_tensors = 'pt',    
                   )
    # Save tokens from sentence as a separate array. 
    marked_text = "[CLS] " + sent + " [SEP]"
    tokenized_texts.append(tokenizer.tokenize(marked_text))
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
# Convert the list into tensor.
input_ids = torch.cat(input_ids, dim=0)



In [122]:
tokenized_texts

[['[CLS]',
  'tiga',
  'kali',
  'dev',
  '##ano',
  'menikah',
  ',',
  'dan',
  'ketiga',
  'istrinya',
  'itu',
  'pun',
  'meninggal',
  'dengan',
  'mengen',
  '##askan',
  '[SEP]'],
 ['[CLS]',
  'tiga',
  'kali',
  'dev',
  '##ano',
  'menikah',
  ',',
  'dan',
  'ketiga',
  'istrinya',
  'itu',
  'pun',
  'meninggal',
  'dengan',
  'mengen',
  '##askan',
  '[SEP]'],
 ['[CLS]',
  'tiga',
  'kali',
  'dev',
  '##ano',
  'menikah',
  ',',
  'dan',
  'ketiga',
  'istrinya',
  'itu',
  'pun',
  'meninggal',
  'dengan',
  'mengen',
  '##askan',
  '[SEP]'],
 ['[CLS]',
  '1942',
  ',',
  'adalah',
  'kali',
  'terakhir',
  'kami',
  'melihat',
  'orang',
  'orang',
  'itu',
  'menginjak',
  '##an',
  'kaki',
  'di',
  'tanah',
  'ini',
  '.',
  '[SEP]'],
 ['[CLS]',
  '1942',
  ',',
  'adalah',
  'kali',
  'terakhir',
  'kami',
  'melihat',
  'orang',
  'orang',
  'itu',
  'menginjak',
  '##an',
  'kaki',
  'di',
  'tanah',
  'ini',
  '.',
  '[SEP]'],
 ['[CLS]',
  '1942',
  ',',
  'adala

In [123]:
segments_ids = torch.ones_like(input_ids)
segments_ids.shape

torch.Size([25, 48])

In [124]:
model = BertModel.from_pretrained('indobenchmark/indobert-base-p1',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )
model.eval();

In [125]:
with torch.no_grad():

    outputs = model(input_ids, segments_ids)

    # Evaluating the model will return a different number of objects based on 
    # how it's  configured in the `from_pretrained` call earlier. In this case, 
    # becase we set `output_hidden_states = True`, the third item will be the 
    # hidden states from all layers. See the documentation for more details:
    # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
    hidden_states = outputs[2]

In [126]:
print ("Number of layers:", len(hidden_states), "  (initial embeddings + 12 BERT layers)")
print ("Number of batches:", len(hidden_states[0]))
print ("Number of tokens:", len(hidden_states[0][0]))
print ("Number of hidden units:", len(hidden_states[0][0][0]))


Number of layers: 13   (initial embeddings + 12 BERT layers)
Number of batches: 25
Number of tokens: 48
Number of hidden units: 768


In [127]:
# Concatenate the tensors for all layers. We use `stack` here to
# create a new dimension in the tensor.
token_embeddings = torch.stack(hidden_states, dim=0)

token_embeddings.size()

torch.Size([13, 25, 48, 768])

In [128]:
# Swap dimensions, so we get tensors in format: [sentence, tokens, hidden layes, features]
token_embeddings = token_embeddings.permute(1,2,0,3)

token_embeddings.size()

torch.Size([25, 48, 13, 768])

In [129]:
# we will use last four hidden layers to create each word embedding

processed_embeddings = token_embeddings[:, :, 9:, :]
processed_embeddings.shape

torch.Size([25, 48, 4, 768])

In [130]:
# Concatenate four layers for each token to create embeddings

embeddings = torch.reshape(processed_embeddings, (4, 48, -1))
embeddings.shape

torch.Size([4, 48, 19200])

In [131]:
embeddings_cpy = processed_embeddings.clone().detach()

In [132]:
embeddings_cpy = embeddings_cpy.numpy()

In [133]:
embeddings_cpy

array([[[[-4.42338586e-01,  2.27511024e+00, -7.52650388e-03, ...,
          -2.82732517e-01,  1.43674001e-01, -6.40653431e-01],
         [-3.40156525e-01,  2.73039508e+00,  4.70301539e-01, ...,
          -1.63167521e-01,  1.93777934e-01, -6.59845889e-01],
         [-5.74045777e-01,  3.08830452e+00,  8.98605466e-01, ...,
           1.63299337e-01,  5.62301576e-01, -6.82644129e-01],
         [-4.94618833e-01,  3.40718126e+00,  1.25532389e+00, ...,
           7.20247179e-02,  1.03767514e+00, -7.66070426e-01]],

        [[-1.02166152e+00,  2.43172073e+00,  1.80502164e+00, ...,
          -1.07839978e+00,  3.92421782e-01, -2.00660634e+00],
         [-6.65422022e-01,  1.87048113e+00,  2.03179717e+00, ...,
          -1.08048499e+00,  4.05625403e-02, -1.77932096e+00],
         [-6.31544292e-01,  1.85545111e+00,  1.91295850e+00, ...,
          -7.45165765e-01,  3.57089877e-01, -1.52623343e+00],
         [-3.37856174e-01,  1.77555394e+00,  2.16683531e+00, ...,
          -7.10245311e-01,  5.566063

In [134]:
df_embed_full = {'teks':[sent for sent in sentences], 'embedding_result':[embed for embed in embeddings_cpy]}

In [135]:
sentences

['Tiga kali Devano menikah, dan ketiga istrinya itu pun meninggal dengan mengenaskan',
 'Tiga kali Devano menikah, dan ketiga istrinya itu pun meninggal dengan mengenaskan',
 'Tiga kali Devano menikah, dan ketiga istrinya itu pun meninggal dengan mengenaskan',
 '1942, adalah kali terakhir kami melihat orang orang itu menginjakan kaki di tanah ini. ',
 '1942, adalah kali terakhir kami melihat orang orang itu menginjakan kaki di tanah ini. ',
 '1942, adalah kali terakhir kami melihat orang orang itu menginjakan kaki di tanah ini. ',
 'Pada  masa  pendudukan  Jepang,  Gua  Jepang  digunakan  sebagai  benteng perlindungan tentara Jepang dari serangan musuh',
 'Pada  masa  pendudukan  Jepang,  Gua  Jepang  digunakan  sebagai  benteng perlindungan tentara Jepang dari serangan musuh',
 'Pada  masa  pendudukan  Jepang,  Gua  Jepang  digunakan  sebagai  benteng perlindungan tentara Jepang dari serangan musuh',
 'Kedatangan  Jepang  di  Indonesia  pada  awalnya  disambut  dengan  senang hati  ol

In [136]:
print(len(sentences))
print(len(tokenized_texts))
print(len(embeddings_cpy))
print(len([tok for tok in [tokenizer.tokenize(sent)[:15] for sent in sentences]]))
print(len([emb for emb in [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent))[:15] for sent in sentences]]))

25
25
25
25
25


In [137]:
df_embed = {'teks':[sent for sent in sentences], 'tokenization_result':[tok for tok in [tokenizer.tokenize(sent)[:15] for sent in sentences]], 'vector_repr':[emb for emb in [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent))[:15] for sent in sentences]],'embedding_result':[embed for embed in embeddings_cpy]}

In [138]:
df_embed = pd.DataFrame(df_embed)

In [139]:
df_embed

Unnamed: 0,teks,tokenization_result,vector_repr,embedding_result
0,"Tiga kali Devano menikah, dan ketiga istrinya ...","[tiga, kali, dev, ##ano, menikah, ,, dan, keti...","[1224, 633, 5780, 13126, 3335, 30468, 41, 2243...","[[[-0.4423386, 2.2751102, -0.007526504, 0.5282..."
1,"Tiga kali Devano menikah, dan ketiga istrinya ...","[tiga, kali, dev, ##ano, menikah, ,, dan, keti...","[1224, 633, 5780, 13126, 3335, 30468, 41, 2243...","[[[-0.4423386, 2.2751102, -0.007526504, 0.5282..."
2,"Tiga kali Devano menikah, dan ketiga istrinya ...","[tiga, kali, dev, ##ano, menikah, ,, dan, keti...","[1224, 633, 5780, 13126, 3335, 30468, 41, 2243...","[[[-0.4423386, 2.2751102, -0.007526504, 0.5282..."
3,"1942, adalah kali terakhir kami melihat orang ...","[1942, ,, adalah, kali, terakhir, kami, meliha...","[19298, 30468, 154, 633, 1668, 321, 722, 232, ...","[[[-0.58352745, 2.300963, 0.013439208, 0.52970..."
4,"1942, adalah kali terakhir kami melihat orang ...","[1942, ,, adalah, kali, terakhir, kami, meliha...","[19298, 30468, 154, 633, 1668, 321, 722, 232, ...","[[[-0.58352745, 2.300963, 0.013439208, 0.52970..."
5,"1942, adalah kali terakhir kami melihat orang ...","[1942, ,, adalah, kali, terakhir, kami, meliha...","[19298, 30468, 154, 633, 1668, 321, 722, 232, ...","[[[-0.58352745, 2.300963, 0.013439208, 0.52970..."
6,"Pada masa pendudukan Jepang, Gua Jepang ...","[pada, masa, pendudukan, jepang, ,, gua, jepan...","[126, 890, 18540, 1794, 30468, 4599, 1794, 781...","[[[-0.8451569, 2.219568, 0.20599273, 0.7148295..."
7,"Pada masa pendudukan Jepang, Gua Jepang ...","[pada, masa, pendudukan, jepang, ,, gua, jepan...","[126, 890, 18540, 1794, 30468, 4599, 1794, 781...","[[[-0.8451569, 2.219568, 0.20599273, 0.7148295..."
8,"Pada masa pendudukan Jepang, Gua Jepang ...","[pada, masa, pendudukan, jepang, ,, gua, jepan...","[126, 890, 18540, 1794, 30468, 4599, 1794, 781...","[[[-0.8451569, 2.219568, 0.20599273, 0.7148295..."
9,Kedatangan Jepang di Indonesia pada awaln...,"[kedatangan, jepang, di, indonesia, pada, awal...","[6570, 1794, 26, 300, 126, 3517, 11562, 79, 30...","[[[-0.6347034, 2.2512827, 0.24412957, 0.653939..."


In [140]:
df_embed.to_csv('io_steps.csv')
df_embed.to_excel('io_steps.xlsx')

In [118]:
from tensorflow.keras.layers import Bidirectional, GRU, GlobalAveragePooling1D, GlobalMaxPooling1D
from tensorflow.keras.layers import LSTM, GRU, Add, Reshape

In [None]:
x = Bidirectional(LSTM(128, return_sequences=True,recurrent_dropout=0.2))(x)

In [43]:
for i, token_str in enumerate(tokenized_texts[0]):
  print (i, token_str)

0 [CLS]
1 kedatangan
2 jepang
3 di
4 indonesia
5 pada
6 awalnya
7 disambut
8 dengan
9 senang
10 hati
11 oleh
12 rakyat
13 di
14 negara
15 ini
16 [SEP]


In [47]:
embeddings[0]

tensor([[-0.6382,  2.3011,  0.3072,  ..., -0.1035,  0.2020, -0.4965],
        [-0.3931,  2.2726,  1.0264,  ...,  0.1639,  0.3706, -0.3244],
        [-0.6397,  2.6094,  1.4012,  ...,  0.5967,  0.7286, -0.4166],
        ...,
        [ 0.6836,  1.1404, -0.3435,  ...,  1.4095, -0.3585,  0.7584],
        [ 0.7307,  1.6367, -0.5175,  ...,  1.6511,  0.1349,  1.0184],
        [ 0.7536,  1.6028, -0.1418,  ...,  1.5575,  0.5187,  1.1343]])

In [55]:
from scipy.spatial.distance import cosine

kings = cosine(embeddings[0][2], embeddings[0][4])
king_table = cosine(embeddings[0][4], embeddings[0][14])
king_archtur = cosine(embeddings[0][12], embeddings[0][4])

print('Jarak kata jepang dan indonesia:  %.2f' % kings)
print('Jarak kata indonesia dan negara:  %.2f' % king_table)
print('Jarak kata rakyat dan indonesia:  %.2f' % king_archtur)

Jarak kata jepang dan indonesia:  0.71
Jarak kata indonesia dan negara:  0.68
Jarak kata rakyat dan indonesia:  0.65
