In [1]:
!pip install transformers
import transformers
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case = False)

import tensorflow as tf
import numpy as np
import pandas as pd

Collecting transformers
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m19.5 MB/s[0m eta [36m0:00:0

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [2]:
# Load the cleaned, labelled titles from Google Drive.
# NOTE(review): this path only resolves when Drive is mounted at /content/drive;
# no mount call is visible in this notebook — confirm it is mounted beforehand.
# index_col=False stops pandas from using the first CSV column as the index, so
# the index column stored in the file is kept as a regular "Unnamed: 0" column.
dataset = pd.read_csv("/content/drive/MyDrive/Youtube_Scraped/cleaned_data.csv", index_col=False)
dataset.head()

Unnamed: 0,Titles,Target
0,김뿡,0
1,꼽줘버린 김뿡,0
2,분명 지누형 생일기념 탈출맵이라고 했는데 김뿡,0
3,2023 08 25 아웃라스트 트라이얼 w 지누 코렛트 탬탬버린,0
4,같은 파티원 유니온 상태 크아아아악 김뿡,1


In [3]:
# Shuffle every row (frac=1) before splitting. A fixed random_state makes the
# shuffle — and therefore the train/validation split below — reproducible
# across kernel restarts.
# Note: the name `dataset` is rebound, so re-running this cell alone shuffles
# the already-shuffled frame again; re-run from the load cell for a clean state.
dataset = dataset.sample(frac = 1, random_state = 42)
dataset.head()

Unnamed: 0,Titles,Target
264,탬탬버린 메이플 조별과제 절망편,0
460,북한에 비둘기가 없는 이유ㄷㄷ 비둘기의 소름 돋는 비밀 TOP5 빠퀴2tv,1
193,글로벌K 밤마다 육교로 모이는 중국 라이브 스트리머들 경쟁 고조 KBS 2023 0...,0
80,픽셀모임에서 패드립 켠왕하는 사람 김뿡,1
517,이세계아이돌 ISEGYE IDOL LOCKDOWN 락다운 메르헨 Marchen,1


In [4]:
# Hold out the last 10% of the (already shuffled) rows for validation.
ten_perc = int(len(dataset) * 0.1)

# Slice with an explicit positional split index instead of a negative offset:
# with fewer than 10 rows ten_perc is 0, and `[:-0]` would yield an EMPTY
# training set while `[-0:]` would put every row into validation.
split_at = len(dataset) - ten_perc

train_titles = np.array(dataset["Titles"].iloc[:split_at], dtype = str)
train_labels = np.array(dataset["Target"].iloc[:split_at])

valid_titles = np.array(dataset["Titles"].iloc[split_at:], dtype = str)
valid_labels = np.array(dataset["Target"].iloc[split_at:])

# Peek at the first few titles of each split.
train_titles[:5], valid_titles[:5]

(array(['탬탬버린 메이플 조별과제 절망편', '북한에 비둘기가 없는 이유ㄷㄷ 비둘기의 소름 돋는 비밀 TOP5 빠퀴2tv',
        '글로벌K 밤마다 육교로 모이는 중국 라이브 스트리머들 경쟁 고조 KBS 2023 03 01',
        '픽셀모임에서 패드립 켠왕하는 사람 김뿡',
        '이세계아이돌 ISEGYE IDOL LOCKDOWN 락다운 메르헨 Marchen'], dtype='<U97'),
 array(['상황극', '유리온실 다시 돌아온 유리의 게임 방송', '해커들 진짜 미친거 아님',
        '2023 06 22 저스트채팅 Just Chatting',
        '20230818 KIDDING 키딩 음원 발매 담력시험 왁굳님 시점으로 다시보기 아이네 다시보기'],
       dtype='<U92'))

In [5]:
# Sanity check: tokenize one training title to inspect the encoder output.
i = 35
tokenized_data = tokenizer.encode_plus(train_titles[i],
                                       add_special_tokens = True,
                                       max_length = 30,
                                       # `pad_to_max_length=True` is deprecated;
                                       # padding='max_length' is its replacement.
                                       padding = 'max_length',
                                       truncation = True)

# Convert the tokenizer's return object into a plain dict for easier inspection.
tokenized_data = dict(tokenized_data)
labels = np.array(train_labels[i])



In [6]:
def bert_encode(titles, maximum_length):
  """Tokenize each title into fixed-length BERT inputs.

  Uses the module-level `tokenizer`. Every title is truncated and padded to
  exactly `maximum_length` tokens (special tokens included).

  Args:
    titles: iterable of title strings.
    maximum_length: number of token positions per encoded title.

  Returns:
    A tuple `(input_ids, attention_masks)` of NumPy arrays, one row per title.
  """
  input_ids = []
  attention_masks = []

  for items in titles:
    encoded = tokenizer.encode_plus(items,
                                    add_special_tokens = True,
                                    max_length = maximum_length,
                                    # Replaces the deprecated
                                    # `pad_to_max_length=True`.
                                    padding = 'max_length',
                                    truncation = True)
    input_ids.append(encoded['input_ids'])
    attention_masks.append(encoded['attention_mask'])
  return np.array(input_ids), np.array(attention_masks)

In [7]:
# Encode both splits to 30-token id / attention-mask arrays.
train_input_ids, train_attention_masks = bert_encode(titles = train_titles,
                                                     maximum_length = 30)
valid_input_ids, valid_attention_masks = bert_encode(titles = valid_titles,
                                                     maximum_length = 30)

In [8]:
from tensorflow.keras.optimizers import Adam

def create_model(bert_model, max_length = 30, learning_rate = 0.00001):
  """Build and compile a binary classifier on top of a BERT encoder.

  Architecture: BERT -> Dense(32, relu) -> Dropout(0.2) -> Dense(1, sigmoid).

  Args:
    bert_model: encoder called with `[input_ids, attention_masks]`; element 1
      of its output is fed to the classification head.
    max_length: token positions per example; must match the length the inputs
      were padded/truncated to. Defaults to 30, the previous hard-coded value.
    learning_rate: Adam learning rate. Defaults to 1e-5, the previous
      hard-coded value.

  Returns:
    A compiled `tf.keras` model taking `[input_ids, attention_masks]` and
    returning one sigmoid probability per example.
  """
  input_ids = tf.keras.Input(shape=(max_length,), dtype='int32')
  attention_masks = tf.keras.Input(shape=(max_length,), dtype='int32')

  output = bert_model([input_ids, attention_masks])
  # Index 1 of the encoder output; for TFBertModel this is the pooled
  # sequence representation (index 0 is the per-token hidden states).
  output = output[1]
  output = tf.keras.layers.Dense(32, activation = 'relu')(output)
  output = tf.keras.layers.Dropout(0.2)(output)
  output = tf.keras.layers.Dense(1, activation = 'sigmoid')(output)

  model = tf.keras.models.Model(inputs = [input_ids, attention_masks], outputs = output)
  model.compile(Adam(learning_rate = learning_rate), loss = 'binary_crossentropy', metrics = ['accuracy'])

  return model

In [9]:
from transformers import TFBertModel
# Load the pretrained encoder from the same checkpoint name the tokenizer uses,
# so the token ids produced above line up with the model's vocabulary.
bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [10]:
# Wrap the pretrained encoder with the classification head and compile it.
model = create_model(bert_model)
# Print the layer-by-layer overview of the assembled model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 30)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 30)]         0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  177853440   ['input_1[0][0]',                
                                thPoolingAndCrossAt               'input_2[0][0]']                
                                tentions(last_hidde                                               
                                n_state=(None, 30,                                            

In [11]:
# Train for 3 epochs; 20% of the training arrays are set aside by Keras for
# per-epoch validation (separate from the held-out valid_* arrays).
history = model.fit(
    x = [train_input_ids, train_attention_masks],
    y = train_labels,
    batch_size = 32,
    epochs = 3,
    validation_split = 0.2,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [13]:
# Score the trained model on the held-out split; displays [loss, accuracy].
model.evaluate(x = [valid_input_ids, valid_attention_masks],
               y = valid_labels)



[0.6856147646903992, 0.4615384638309479]

In [14]:
# Predicted probabilities for the held-out titles.
a = model.predict([valid_input_ids, valid_attention_masks])
# Drop only the trailing unit axis produced by the final Dense(1) layer.
# A bare tf.squeeze would also remove the batch axis when there is a single
# row, turning the result into a scalar instead of a length-1 vector.
a = tf.squeeze(a, axis = -1)



In [15]:
# Persist the trained model to a file in the current working directory.
# NOTE(review): the model wraps a TFBertModel layer; reloading this file
# presumably needs that class supplied as a custom object — verify before
# relying on the saved artifact.
model.save("bert_model_v1.h5")

In [16]:
# Colab-only helper: `google.colab` is not importable outside a Colab runtime.
from google.colab import files
# Trigger a browser download of the model file saved by the previous cell.
files.download("bert_model_v1.h5")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>