# BERT: One of the Autoencoding Language Models

In [None]:
# Run this part only in Colab; skip it when running locally.

In [None]:
# Colab only: attach Google Drive at /content/drive.
import os
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
!pip install transformers

In [None]:
!pip install tokenizers

In [None]:
# Switch to the chapter folder on the mounted Drive. The path is absolute
# (Drive is mounted at /content/drive) so the cell behaves the same no matter
# what the current directory is, e.g. when it is run a second time.
os.chdir("/content/drive/MyDrive/akademi/Packt NLP with Transformers/CH03")

In [None]:
# List the entries of the current working directory.
os.listdir()

In [1]:
# Continue from here when running locally.

In [None]:
# Build a plain-text corpus with one review per line; later cells train the
# tokenizer and the masked language model from corpus.txt.
import pandas as pd

imdb_df = pd.read_csv("IMDB Dataset.csv")
# Series.to_string() is a display formatter (it lays values out for printing),
# so the raw review text is written instead. Runs of whitespace, including any
# embedded line breaks, are collapsed so each review occupies exactly one line.
review_lines = imdb_df.review.astype(str).str.split().str.join(" ")
reviews = "\n".join(review_lines)
# Explicit UTF-8 so the file content does not depend on the platform default.
with open("corpus.txt", "w", encoding="utf-8") as f:
    f.write(reviews)

In [None]:
# Train a BERT WordPiece tokenizer from scratch on the review corpus.
from tokenizers import BertWordPieceTokenizer

CORPUS_FILE = "corpus.txt"
bert_wordpiece_tokenizer = BertWordPieceTokenizer()
bert_wordpiece_tokenizer.train(CORPUS_FILE)

In [None]:
# Display the vocabulary of the tokenizer trained above.
bert_wordpiece_tokenizer.get_vocab()

In [None]:
# Persist the trained tokenizer model into the "tokenizer" directory.
import os

# Unlike `!mkdir`, this does not depend on a shell and does not fail when the
# cell is re-run and the directory already exists.
os.makedirs("tokenizer", exist_ok=True)
bert_wordpiece_tokenizer.save_model("tokenizer")

In [None]:
# Reload the tokenizer from the vocabulary file saved in the "tokenizer" directory.
tokenizer = BertWordPieceTokenizer.from_file("tokenizer/vocab.txt")

In [None]:
# Encode a sample sentence with the reloaded tokenizer.
tokenized_sentence = tokenizer.encode("Oh it works just fine")

In [None]:
# Display the tokens of the encoded sentence.
tokenized_sentence.tokens

In [None]:
# Encode a sentence containing misspelled words and display its tokens
# (the typos in the text are deliberate).
tokenized_sentence = tokenizer.encode("ohoh i thougt it might be workingg well")
# Without this trailing expression the cell produces no output at all.
tokenized_sentence.tokens

In [None]:
# Load the files saved in the "tokenizer" directory as a transformers fast tokenizer.
# Note: this rebinds `tokenizer`, replacing the BertWordPieceTokenizer instance.
from transformers import BertTokenizerFast

TOKENIZER_DIR = "tokenizer"
tokenizer = BertTokenizerFast.from_pretrained(TOKENIZER_DIR)

In [None]:
# Build the training dataset from corpus.txt with a block size of 128.
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="corpus.txt",
    block_size=128,
)

In [None]:
# Collator for masked language modeling with a masking probability of 0.15.
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [None]:
# Training configuration: one epoch, batches of 128 per device, output written
# to the "BERT" directory (existing contents may be overwritten).
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="BERT",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=128,
)

In [None]:
# Instantiate a masked-LM BERT from the default configuration (not from a checkpoint).
from transformers import BertConfig, BertForMaskedLM

default_config = BertConfig()
bert = BertForMaskedLM(default_config)

In [None]:
# Bundle the model, training arguments, collator and dataset into a Trainer.
from transformers import Trainer

trainer = Trainer(
    model=bert,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [None]:
# Run training with the Trainer configured above.
trainer.train()

In [None]:
# Save the trained model under "MyBERT".
trainer.save_model("MyBERT")

In [None]:
# Display the default BERT configuration for reference.
from transformers import BertConfig

default_bert_config = BertConfig()
default_bert_config

In [None]:
# A much smaller BERT configuration: 2 layers, 2 attention heads, hidden size 128.
tiny_bert_config = BertConfig(
    num_hidden_layers=2,
    num_attention_heads=2,
    hidden_size=128,
    intermediate_size=512,
    max_position_embeddings=512,
)
tiny_bert_config

In [None]:
# Train the tiny model with the same arguments, collator and dataset as before.
# Note: `trainer` is rebound here, and `training_args` still points at the
# "BERT" output directory with overwrite_output_dir=True.
tiny_bert = BertForMaskedLM(tiny_bert_config)
trainer = Trainer(
    model=tiny_bert,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
trainer.train()

In [None]:
# Load the pretrained "bert-base-uncased" TensorFlow model and its tokenizer.
# Note: this rebinds `bert` and `tokenizer`, which earlier cells used for the
# from-scratch model and the custom-trained tokenizer.
from transformers import TFBertModel, BertTokenizerFast

PRETRAINED_NAME = "bert-base-uncased"
bert = TFBertModel.from_pretrained(PRETRAINED_NAME)
tokenizer = BertTokenizerFast.from_pretrained(PRETRAINED_NAME)
bert.layers

In [None]:
# Tokenize two sentences, padding/truncating each to 256 tokens, and pass the
# batch through BERT to inspect its output.
# `padding="max_length"` replaces the deprecated `pad_to_max_length=True`, and
# calling the tokenizer directly replaces the deprecated `batch_encode_plus`.
tokenized_text = tokenizer(
    ["hello how is it going with you", "lets test it"],
    return_tensors="tf",
    max_length=256,
    truncation=True,
    padding="max_length",
)
bert(tokenized_text)

In [None]:
# Build a Keras model: two int32 inputs of length `max_length` (token ids and
# attention masks) -> first layer of the pretrained BERT -> 2-unit softmax head.
import tensorflow as tf
from tensorflow import keras

max_length = 256
tokens = keras.layers.Input(shape=(max_length,), dtype=tf.dtypes.int32)
masks = keras.layers.Input(shape=(max_length,), dtype=tf.dtypes.int32)
bert_outputs = bert.layers[0]([tokens, masks])
# Take the first output and keep index 0 along axis 1
# (presumably the [CLS] position of the last hidden state — confirm).
embedding_layer = bert_outputs[0][:, 0, :]
classifier_head = tf.keras.layers.Dense(units=2, activation="softmax")
dense = classifier_head(embedding_layer)
model = keras.Model([tokens, masks], dense)

In [None]:
# Tokenize two identical sentences, padded/truncated to `max_length`, as a
# small input for trying out the Keras model.
# `padding="max_length"` replaces the deprecated `pad_to_max_length=True`, and
# calling the tokenizer directly replaces the deprecated `batch_encode_plus`.
tokenized = tokenizer(
    ["hello how is it going with you", "hello how is it going with you"],
    return_tensors="tf",
    max_length=max_length,
    truncation=True,
    padding="max_length",
)

In [None]:
# Run the (still untrained) classifier on the sample batch and display its output.
sample_ids = tokenized["input_ids"]
sample_mask = tokenized["attention_mask"]
model([sample_ids, sample_mask])

In [None]:
# Configure training (Adam, categorical cross-entropy, accuracy metric) and
# print the layer/parameter overview.
model.compile(
    optimizer="Adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

In [None]:
# Freeze layer 2 so its weights are not updated during training
# (presumably the BERT encoder, the first layer after the two inputs —
# confirm against model.summary()).
model.layers[2].trainable = False
# Keras captures each layer's `trainable` state when compile() is called, so
# the model is compiled again (same settings) for the freeze to apply in fit().
model.compile(optimizer="Adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [None]:
# Tokenize every IMDB review and build an 80/20 train/test split (in file
# order, no shuffling) of token ids, attention masks and one-hot labels.
import numpy as np
import pandas as pd

imdb_df = pd.read_csv("IMDB Dataset.csv")
reviews = list(imdb_df.review)
# `padding="max_length"` replaces the deprecated `pad_to_max_length=True`, and
# calling the tokenizer directly replaces the deprecated `batch_encode_plus`.
tokenized_reviews = tokenizer(
    reviews,
    return_tensors="tf",
    max_length=max_length,
    truncation=True,
    padding="max_length",
)

# Index of the first test row: the first 80% of the rows are used for training.
train_split = int(0.8 * len(tokenized_reviews["attention_mask"]))
train_tokens = tokenized_reviews["input_ids"][:train_split]
test_tokens = tokenized_reviews["input_ids"][train_split:]
train_masks = tokenized_reviews["attention_mask"][:train_split]
test_masks = tokenized_reviews["attention_mask"][train_split:]

sentiments = list(imdb_df.sentiment)
# One-hot labels: "positive" -> [0, 1], any other value -> [1, 0].
labels = np.array([[0, 1] if sentiment == "positive" else [1, 0] for sentiment in sentiments])
train_labels = labels[:train_split]
test_labels = labels[train_split:]

In [None]:
# Train the classifier on the training split for 5 epochs.
model.fit(
    x=[train_tokens, train_masks],
    y=train_labels,
    epochs=5,
)