# Imports

In [None]:
! pip install transformers

In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizerFast, TFAutoModel
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Constants

In [3]:
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'

# Accessing Data

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
root_path = "drive/MyDrive/NLP/Language Detection/"

In [None]:
df = pd.read_csv(f"{root_path}data.csv")
df.head()

# Data Analysis

In [None]:
min = df["text"].apply(lambda x: len(x.split())).min()
max = df["text"].apply(lambda x: len(x.split())).max()
mean = df["text"].apply(lambda x: len(x.split())).mean()

print("min :", min)
print("max :", max)
print("mean :", mean)

# Pre Processing

## Removing Unnecessary Column

In [None]:
df = df.drop("Unnamed: 0", axis=1)
df.head(10)

## Remove Non Alphabetics Character from Data

In [None]:
df["text"] = df["text"].map(lambda sent: "".join(char.lower() for char in sent if (char.isalpha() or char == " ")))
df.head(10)

## Feature and Label Split

In [9]:
X = df.loc[:, "text"]
y = df.loc[:, "lang"]

## Map Label Values

In [None]:
maps = {"en": 0, "id": 1}
y = y.replace(maps)
y.head()

## Train, Valid, and Test Split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13519094)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=13519094)

# Tokenization

In [12]:
def tokenize(texts, max_length):
  tokenizer = BertTokenizerFast.from_pretrained(
    PRE_TRAINED_MODEL_NAME,
    do_lower_case = True
  )

  result = tokenizer(
      text = texts,
      add_special_tokens = True,
      max_length = max_length,
      padding = 'max_length',
      truncation = True,
      return_tensors = 'tf'
  )

  return {
    'input_ids': result['input_ids'],
    'attention_mask': result['attention_mask'],
    'token_type_ids': result['token_type_ids']
  }

# Model

In [13]:
def FineTunedBERT(length, learning_rate):
  layer_bert = TFAutoModel.from_pretrained(PRE_TRAINED_MODEL_NAME).bert
  input_bert = {
      "input_ids": Input(shape=(length,), name="input_ids", dtype="int32"),
      "token_type_ids": Input(shape=(length,), name="token_type_ids", dtype="int32"),
      "attention_mask": Input(shape=(length,), name="attention_mask", dtype="int32"),
  }

  x = layer_bert(input_bert)[0]
  x = LSTM(128)(x)
  x = Dense(64, activation='relu')(x)
  x = Dense(1, activation="sigmoid")(x)

  model = Model(inputs=input_bert, outputs=x)

  loss = "binary_crossentropy"
  optimizer = Adam(learning_rate=learning_rate)
  metrics = ["accuracy"]
  model.compile(
      loss = loss,
      optimizer = optimizer,
      metrics = metrics
  )

  return model

# Train

In [30]:
def train(max_length, learning_rate, batch_size, epochs):
  x_train = tokenize(list(X_train), max_length)
  x_val = tokenize(list(X_val), max_length)

  model = FineTunedBERT(
      length=len(x_train["input_ids"][0]),
      learning_rate = learning_rate
  )

  model.fit(
      x = x_train,
      y = y_train,
      batch_size = batch_size,
      epochs = epochs,
      validation_data = (x_val, y_val)
  )

  return model

# Experiments

In [18]:
physical_devices = tf.config.list_physical_devices("GPU")
try:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
except:
    pass

In [None]:
model_a = train(512, 5e-5, 3, 1)

In [None]:
model_b = train(256, 5e-5, 3, 1)

In [None]:
model_c = train(512, 2e-5, 3, 1)

In [None]:
model_d = train(256, 2e-5, 3, 1)

# Evaluation

In [57]:
x_test = tokenize(list(X_test), 512)

y_pred_a = np.round(model_a.predict(x_test))
score_a = accuracy_score(y_test, y_pred_a)
print("Accuracy of model with max_length of 512 and learning rate of 5e-5 :", score_a)

x_test = tokenize(list(X_test), 256)
y_pred_b = np.round(model_b.predict(x_test))
score_b = accuracy_score(y_test, y_pred_b)
print("Accuracy of model with max_length of 256 and learning rate of 5e-5 :", score_a)

x_test = tokenize(list(X_test), 512)
y_pred_c = np.round(model_c.predict(x_test))
score_c = accuracy_score(y_test, y_pred_c)
print("Accuracy of model with max_length of 512 and learning rate of 2e-5 :", score_a)

x_test = tokenize(list(X_test), 256)
y_pred_d = np.round(model_d.predict(x_test))
score_d = accuracy_score(y_test, y_pred_d)
print("Accuracy of model with max_length of 256 and learning rate of 2e-5 :", score_a)

Accuracy of model with max_length of 512 and learning rate of 5e-5 : 1.0
Accuracy of model with max_length of 256 and learning rate of 5e-5 : 1.0
Accuracy of model with max_length of 512 and learning rate of 2e-5 : 1.0
Accuracy of model with max_length of 256 and learning rate of 2e-5 : 1.0


# Export

In [58]:
# Save Model C
model_c.save(f"{root_path}model.h5")

In [59]:
model_load = tf.keras.models.load_model(f"{root_path}model.h5")

test = ["Today I woke up and drank coffee", "Hari ini saya jogging ke taman bersama ayah"]
test = tokenize(list(test), 512)

pred = np.round(model_load.predict(test, batch_size = 4))
pred





array([[0.],
       [1.]], dtype=float32)