# Imports

In [1]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 18.7 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 63.6 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 6.7 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.24.0


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertTokenizerFast, TFAutoModel, BertForSequenceClassification
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Constants

In [3]:
BERT_BASE_UNCASED = 'bert-base-uncased'
INDOBERT = 'indobenchmark/indobert-base-p1'

# Data

## Accessing Data

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
root_path = "drive/MyDrive/NLP/Language Detection/"

In [6]:
df = pd.read_csv(f"{root_path}data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,text,lang
0,0,My recent family vacation to Jamaica is what c...,en
1,1,This year I traveled to Colorado in my camperv...,en
2,2,The most memorable event that happened to me w...,en
3,3,"Hari itu dimulai dengan sempurna, dengan perja...",id
4,4,"Growing up as a child, I remember going to a f...",en


## Data Analysis

In [7]:
min = df["text"].apply(lambda x: len(x.split())).min()
max = df["text"].apply(lambda x: len(x.split())).max()
mean = df["text"].apply(lambda x: len(x.split())).mean()

print("min :", min)
print("max :", max)
print("mean :", mean)

min : 4
max : 606
mean : 240.7246550689862


## Pre Processing

## Removing Unnecessary Column

In [8]:
df = df.drop("Unnamed: 0", axis=1)
df.head(10)

Unnamed: 0,text,lang
0,My recent family vacation to Jamaica is what c...,en
1,This year I traveled to Colorado in my camperv...,en
2,The most memorable event that happened to me w...,en
3,"Hari itu dimulai dengan sempurna, dengan perja...",id
4,"Growing up as a child, I remember going to a f...",en
5,"Saya dari Tampa, Florida, tetapi musim panas y...",id
6,Saya baru-baru ini mengetahui bahwa salah satu...,id
7,Terkadang saya merasa tidak akan pernah mengua...,id
8,I left my family and moved to another city on ...,en
9,UGH!!!! I'm so frustrated. I can't believe the...,en


## Remove Non Alphabetics Character from Data

In [9]:
df["text"] = df["text"].map(lambda sent: "".join(char.lower() for char in sent if (char.isalpha() or char == " ")))
df.head(10)

Unnamed: 0,text,lang
0,my recent family vacation to jamaica is what c...,en
1,this year i traveled to colorado in my camperv...,en
2,the most memorable event that happened to me w...,en
3,hari itu dimulai dengan sempurna dengan perjal...,id
4,growing up as a child i remember going to a fa...,en
5,saya dari tampa florida tetapi musim panas yan...,id
6,saya barubaru ini mengetahui bahwa salah satu ...,id
7,terkadang saya merasa tidak akan pernah mengua...,id
8,i left my family and moved to another city on ...,en
9,ugh im so frustrated i cant believe the crap t...,en


## Feature and Label Split

In [10]:
X = df.loc[:, "text"]
y = df.loc[:, "lang"]

## Map Label Values

In [11]:
maps = {"en": 0, "id": 1}
y = y.replace(maps)
y.head()

0    0
1    0
2    0
3    1
4    0
Name: lang, dtype: int64

## Train, Valid, and Test Split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13519094)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=13519094)

# BERT

## Tokenization

In [13]:
def tokenize(texts, max_length):
  tokenizer = BertTokenizerFast.from_pretrained(
    BERT_BASE_UNCASED,
    do_lower_case = True
  )

  result = tokenizer(
      text = texts,
      add_special_tokens = True,
      max_length = max_length,
      padding = 'max_length',
      truncation = True,
      return_tensors = 'tf'
  )

  return {
    'input_ids': result['input_ids'],
    'attention_mask': result['attention_mask'],
    'token_type_ids': result['token_type_ids']
  }

## Model

In [14]:
def FineTunedBERT(length, learning_rate):
  layer_bert = TFAutoModel.from_pretrained(BERT_BASE_UNCASED).bert
  input_bert = {
      "input_ids": Input(shape=(length,), name="input_ids", dtype="int32"),
      "token_type_ids": Input(shape=(length,), name="token_type_ids", dtype="int32"),
      "attention_mask": Input(shape=(length,), name="attention_mask", dtype="int32"),
  }

  x = layer_bert(input_bert)[0]
  x = LSTM(128)(x)
  x = Dense(64, activation='relu')(x)
  x = Dense(1, activation="sigmoid")(x)

  model = Model(inputs=input_bert, outputs=x)

  loss = "binary_crossentropy"
  optimizer = Adam(learning_rate=learning_rate)
  metrics = ["accuracy"]
  model.compile(
      loss = loss,
      optimizer = optimizer,
      metrics = metrics
  )

  return model

## Train

In [15]:
def train(max_length, learning_rate, batch_size, epochs):
  x_train = tokenize(list(X_train), max_length)
  x_val = tokenize(list(X_val), max_length)

  model = FineTunedBERT(
      length=len(x_train["input_ids"][0]),
      learning_rate = learning_rate
  )

  model.fit(
      x = x_train,
      y = y_train,
      batch_size = batch_size,
      epochs = epochs,
      validation_data = (x_val, y_val)
  )

  return model

## Experiments

In [16]:
physical_devices = tf.config.list_physical_devices("GPU")
try:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
except:
    pass

In [None]:
model_a = train(512, 5e-5, 3, 1)

Downloading:   0%|          | 0.00/536M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.




In [None]:
model_b = train(256, 5e-5, 3, 1)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.




In [17]:
model_c = train(512, 2e-5, 3, 1)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/536M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.




In [None]:
model_d = train(256, 2e-5, 3, 1)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.




## Evaluation

In [None]:
x_test = tokenize(list(X_test), 512)

y_pred_a = np.round(model_a.predict(x_test))
score_a = accuracy_score(y_test, y_pred_a)
print("Accuracy of model with max_length of 512 and learning rate of 5e-5 :", score_a)

x_test = tokenize(list(X_test), 256)
y_pred_b = np.round(model_b.predict(x_test))
score_b = accuracy_score(y_test, y_pred_b)
print("Accuracy of model with max_length of 256 and learning rate of 5e-5 :", score_a)

x_test = tokenize(list(X_test), 512)
y_pred_c = np.round(model_c.predict(x_test))
score_c = accuracy_score(y_test, y_pred_c)
print("Accuracy of model with max_length of 512 and learning rate of 2e-5 :", score_a)

x_test = tokenize(list(X_test), 256)
y_pred_d = np.round(model_d.predict(x_test))
score_d = accuracy_score(y_test, y_pred_d)
print("Accuracy of model with max_length of 256 and learning rate of 2e-5 :", score_a)

Accuracy of model with max_length of 512 and learning rate of 5e-5 : 1.0
Accuracy of model with max_length of 256 and learning rate of 5e-5 : 1.0
Accuracy of model with max_length of 512 and learning rate of 2e-5 : 1.0
Accuracy of model with max_length of 256 and learning rate of 2e-5 : 1.0


In [None]:
# Save Model C
model_c.save(f"{root_path}model.h5")

In [None]:
model_load = tf.keras.models.load_model(f"{root_path}model.h5")



In [None]:
test = ["Today I woke up and drank coffee"]
test = tokenize(test, 512)

pred = np.round(model_load.predict(test))
result = pred[0][0]
if (result == 0): print("en")
else: print("id")

en


# IndoBERT

## Tokenization

In [None]:
def indoBERT_tokenize(texts):
  tokenizer = BertTokenizer.from_pretrained(
    INDOBERT,
    do_lower_case = True
  )

  result = tokenizer(
      text = texts,
      add_special_tokens = True,
      max_length = max_length,
      padding = 'max_length',
      truncation = True,
      return_tensors = 'tf'
  )

  return {
    'input_ids': result['input_ids'],
    'attention_mask': result['attention_mask'],
    'token_type_ids': result['token_type_ids']
  }

## Model

In [None]:
def IndoBERT(length, learning_rate):
  layer_bert = TFAutoModel.from_pretrained(INDOBERT).bert
  input_bert = {
      "input_ids": Input(shape=(length,), name="input_ids", dtype="int32"),
      "token_type_ids": Input(shape=(length,), name="token_type_ids", dtype="int32"),
      "attention_mask": Input(shape=(length,), name="attention_mask", dtype="int32"),
  }

  x = layer_bert(input_bert)[0]
  x = LSTM(128)(x)
  x = Dense(64, activation='relu')(x)
  x = Dense(1, activation="sigmoid")(x)

  model = Model(inputs=input_bert, outputs=x)

  loss = "binary_crossentropy"
  optimizer = Adam(learning_rate=learning_rate)
  metrics = ["accuracy"]
  model.compile(
      loss = loss,
      optimizer = optimizer,
      metrics = metrics
  )

  return model

## Train

In [None]:
def train_indobert(max_length, learning_rate, batch_size, epochs):
  x_train = tokenize(list(X_train), max_length)
  x_val = tokenize(list(X_val), max_length)

  model = IndoBERT(
      length=len(x_train["input_ids"][0]),
      learning_rate = learning_rate
  )

  model.fit(
      x = x_train,
      y = y_train,
      batch_size = batch_size,
      epochs = epochs,
      validation_data = (x_val, y_val)
  )

  return model

## Experiments

In [None]:
physical_devices = tf.config.list_physical_devices("GPU")
try:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
except:
    pass

In [None]:
model_a_indobert = train_indobert(512, 5e-5, 3, 1)

Some layers from the model checkpoint at indobenchmark/indobert-base-p1 were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at indobenchmark/indobert-base-p1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.




In [None]:
model_b_indobert = train_indobert(256, 5e-5, 3, 1)

Some layers from the model checkpoint at indobenchmark/indobert-base-p1 were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at indobenchmark/indobert-base-p1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.




In [None]:
model_c_indobert = train_indobert(512, 2e-5, 3, 1)

Some layers from the model checkpoint at indobenchmark/indobert-base-p1 were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at indobenchmark/indobert-base-p1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.




In [None]:
model_d_indobert = train_indobert(256, 2e-5, 3, 1)

Some layers from the model checkpoint at indobenchmark/indobert-base-p1 were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at indobenchmark/indobert-base-p1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.




## Evaluation

In [None]:
x_test = tokenize(list(X_test), 512)

y_pred_a = np.round(model_a.predict(x_test))
score_a = accuracy_score(y_test, y_pred_a)
print("Accuracy of model with max_length of 512 and learning rate of 5e-5 :", score_a)

x_test = tokenize(list(X_test), 256)
y_pred_b = np.round(model_b.predict(x_test))
score_b = accuracy_score(y_test, y_pred_b)
print("Accuracy of model with max_length of 256 and learning rate of 5e-5 :", score_a)

x_test = tokenize(list(X_test), 512)
y_pred_c = np.round(model_c.predict(x_test))
score_c = accuracy_score(y_test, y_pred_c)
print("Accuracy of model with max_length of 512 and learning rate of 2e-5 :", score_a)

x_test = tokenize(list(X_test), 256)
y_pred_d = np.round(model_d.predict(x_test))
score_d = accuracy_score(y_test, y_pred_d)
print("Accuracy of model with max_length of 256 and learning rate of 2e-5 :", score_a)

Accuracy of model with max_length of 512 and learning rate of 5e-5 : 1.0
Accuracy of model with max_length of 256 and learning rate of 5e-5 : 1.0
Accuracy of model with max_length of 512 and learning rate of 2e-5 : 1.0
Accuracy of model with max_length of 256 and learning rate of 2e-5 : 1.0


# Export

In [19]:
# Save Model C
model_c.save(f"{root_path}model.h5")

In [20]:
model_load = tf.keras.models.load_model(f"{root_path}model.h5")



# Testing and Error Analysis

## Testing

In [59]:
tokenized_test = tokenize(X_test.to_list(), 512)
pred = np.round(model_load.predict(tokenized_test))

score = accuracy_score(pred, y_test)
score



1.0

In [67]:
mapped_pred = ["en" if x==0 else "id" for x in pred]
mapped_pred[:10]

['en', 'id', 'id', 'id', 'en', 'en', 'id', 'en', 'en', 'en']

In [76]:
result = pd.DataFrame({
    'text': X_test,
    'detected_lang':mapped_pred
})
result

Unnamed: 0,text,detected_lang
9030,a couple monthss ago my brother in law and sis...,en
9076,saya ingat hari saya mengetahui tentang ginny ...,id
3993,pada tanggal juni adalah hari yang sangat be...,id
622,ku rindu bisik mu di telingaku seraya kau berk...,id
1576,i thought that leaving the army would be a goo...,en
...,...,...
1745,on may th of my grandmother unexpectedly pass...,en
1185,two weeks ago my six year old son fell down th...,en
2910,los angeles is the best and worst experience i...,en
8004,pernikahan tidak direncanakan saudariku bagai...,id


## Error Analysis

From the testing result above, we can conclude that the model performs perfectly with accuracy score of 100%. Although this remarkably well performance, the author had noted that the model was able to achieve this result as the test data is a sentence, not a word. When tested on a single word, the model may give a wrong prediction. This may be caused by a mixed use of language in dataset when the data contains the indonesian and english language altogether.

In [82]:
test = ["go", "pergi", "cry", "nangis", "watch", "nonton", "dry", "kering", "did", "melakukan"]
tokenized_test = tokenize(test, 512)

predx = np.round(model_load.predict(tokenized_test))
mapped_predx = ["en" if x==0 else "id" for x in predx]
resultx = pd.DataFrame({
    'text': test,
    'detected_lang':mapped_predx
})
resultx



Unnamed: 0,text,detected_lang
0,go,id
1,pergi,id
2,cry,id
3,nangis,id
4,watch,en
5,nonton,id
6,dry,id
7,kering,id
8,did,en
9,melakukan,id


Here we can see that the model peforms with an accuracy score of 70% when faced with word dataset. Nevertheless, the author decided not to add word dataset to the dataset as the usage of the application would be a paragraphed-story input not a word. In addition, adding word dataset would mislead sentences involving two different languages.