# CHECK TOKENIZER

In [1]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
import pandas as pd

In [2]:
def load_csv_dataset(file_path, sep=';'):
    """Read a delimited text file into a pandas DataFrame.

    Args:
        file_path: Path or file-like object accepted by ``pd.read_csv``.
        sep: Field delimiter. Defaults to ``';'``, the delimiter this
            notebook has always read its dataset with, so existing
            one-argument calls behave exactly as before.

    Returns:
        pandas.DataFrame: The parsed table.
    """
    return pd.read_csv(file_path, sep=sep)

def load_tokenizer(tokenizer_path):
    """Load the tokenizer for a specific language from ``tokenizer_path``.

    Returns the object produced by ``Tokenizer.from_file`` for that file.
    """
    loaded_tokenizer = Tokenizer.from_file(tokenizer_path)
    return loaded_tokenizer

def view_text_and_tokenized_text(dataset, tokenizer_src, tokenizer_tgt, index):
    """Print one sentence pair from ``dataset`` alongside its token IDs.

    The row at position ``index`` is looked up with ``iloc``. Its 'Maanyan'
    cell is encoded with ``tokenizer_src`` and its 'Indonesia' cell with
    ``tokenizer_tgt``. Four sections (original text and token IDs for each
    language) are written to standard output; nothing is returned.
    """
    def _show(heading, payload):
        # Every section is a heading line followed by one content line.
        print(heading)
        print(payload)

    row = dataset.iloc[index]
    maanyan_sentence = row['Maanyan']
    indonesia_sentence = row['Indonesia']

    # Encode both sides before printing anything, so a failing encode
    # leaves no partial output behind.
    maanyan_encoding = tokenizer_src.encode(maanyan_sentence)
    indonesia_encoding = tokenizer_tgt.encode(indonesia_sentence)

    _show("Original Maanyan Text:", maanyan_sentence)
    _show("\nTokenized Maanyan Text:", maanyan_encoding.ids)
    _show("\nOriginal Indonesia Text:", indonesia_sentence)
    _show("\nTokenized Indonesia Text:", indonesia_encoding.ids)

def tokenize_and_view(text, tokenizer):
    """Encode ``text`` with ``tokenizer`` and print the text and its token IDs.

    Output is an "Input Text:" heading, the text, a blank line, a
    "Token IDs:" heading and the ``ids`` of the encoding. Returns nothing.
    """
    encoding = tokenizer.encode(text)
    # sep="\n" puts each piece on its own line; the leading "\n" in the
    # second heading produces the blank line between the two sections.
    print("Input Text:", text, "\nToken IDs:", encoding.ids, sep="\n")

In [3]:
# Load the CSV dataset.
# Raw string: the Windows path is full of backslashes (\O, \F, \A, \D, \p, \d)
# that are invalid escape sequences in a normal string literal. The raw
# literal has the same value without the SyntaxWarning on newer Pythons.
# NOTE(review): this is a machine-specific absolute path; consider a
# configurable data directory instead.
csv_file_path = r'D:\Oscar Main Base\File Online\Aplikasi_Penerjemah\Develop\pytorch-transformer-main\datasets\datasetma4.csv'
dataset = load_csv_dataset(csv_file_path)

# Load the tokenizers (source = Maanyan, target = Indonesia).
tokenizer_src_path = 'tokenizer_Maanyan.json'
tokenizer_tgt_path = 'tokenizer_Indonesia.json'
tokenizer_src = load_tokenizer(tokenizer_src_path)
tokenizer_tgt = load_tokenizer(tokenizer_tgt_path)

# View original and tokenized text for a specific index
index_to_view = 20  # Change this to the index you want to view
view_text_and_tokenized_text(dataset, tokenizer_src, tokenizer_tgt, index_to_view)

# Tokenize a few hand-picked inputs with the target (Indonesia) tokenizer,
# printing a separator line before each one. Pass tokenizer_src instead to
# check the Maanyan vocabulary.
# NOTE(review): an ID of 0 in the output presumably marks an
# out-of-vocabulary word — confirm against the tokenizer's special tokens.
separator = "========================================================================="
for input_text in ("tolong", "mulek", "aku terkejut"):
    print(separator)
    tokenize_and_view(input_text, tokenizer_tgt)

Original Maanyan Text:
amun naun kaiuh iwara upi yiru anri aratini

Tokenized Maanyan Text:
[17, 75, 122, 42, 55, 6, 19, 66]

Original Indonesia Text:
jika kamu dapat memberitahukan mimpi itu dan maknanya

Tokenized Indonesia Text:
[57, 10, 39, 45, 46, 6, 4, 77]
Input Text:
tolong

Token IDs:
[231]
Input Text:
mulek

Token IDs:
[0]
Input Text:
aku terkejut

Token IDs:
[7, 0]


In [59]:
# Scratch check: 10 raised to the power -4 evaluates to the float 0.0001.
# NOTE(review): nothing else in the notebook uses this value — confirm
# whether the cell is still needed.
10**-4

0.0001

# BLEU SCORE

In [36]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from train import translate_input_string_bleu, get_model, get_ds_csv, translate_input_string
from pathlib import Path
import torch
import torch.nn as nn
from config import get_config, get_weights_file_path


## Setup Model

In [37]:

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

config = get_config()
# config['model_basename'] = "tmodel_eight_four_id_"
config['model_basename'] = "tmodel_eight_four_"
# Train with Maanyan as the target language instead:
# config['lang_src'] = "Indonesia"
# config['lang_tgt'] = "Maanyan"
train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds_csv(config)
model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)

# Load the pretrained weights saved under the tag "60".
model_filename = get_weights_file_path(config, "60")
# map_location remaps the stored tensors onto the device chosen above, so a
# checkpoint saved on a GPU still loads when this notebook runs on the CPU.
# NOTE(review): torch.load unpickles the file — only load checkpoints from
# a trusted source.
state = torch.load(model_filename, map_location=device)
model.load_state_dict(state['model_state_dict'])

Using device: cuda
Max length of source sentence: 22
Max length of target sentence: 24


<All keys matched successfully>

In [52]:
# Translate a single source-language string with the loaded model and
# print the input next to the model's output.
input_string = "hanyu"
translation = translate_input_string(
    model,
    input_string,
    tokenizer_src,
    tokenizer_tgt,
    config['seq_len'],
    device,
)
print(f"Input String: {input_string}", f"Translation: {translation}", sep="\n")

Input String: hanyu
Translation: kamu kamu kamu dan kamu kamu kamu kamu kamu kamu


In [53]:
def evaluate_translation(reference, translation):
    """Return the percentage of distinct reference words found in the translation.

    Both strings are lower-cased and split on whitespace. The score is the
    number of distinct reference words that also occur in the translation,
    divided by the number of distinct reference words, times 100. Extra or
    repeated words in the translation do not lower the score.

    Args:
        reference: The expected translation text.
        translation: The text produced by the model.

    Returns:
        float: A value between 0.0 and 100.0. An empty (or whitespace-only)
        reference yields 0.0 instead of raising ``ZeroDivisionError``.
    """
    # Lower-case and split into tokens; sets make the comparison ignore
    # word order and repetition on both sides.
    reference_vocab = set(reference.lower().split())
    translation_vocab = set(translation.lower().split())

    # Nothing to match against: avoid dividing by zero.
    if not reference_vocab:
        return 0.0

    # Count the distinct reference tokens that also occur in the translation.
    matching_tokens = reference_vocab & translation_vocab

    # Divide by the number of *distinct* reference tokens so numerator and
    # denominator are measured the same way; a reference with repeated
    # words can then still reach 100%.
    matching_percentage = (len(matching_tokens) / len(reference_vocab)) * 100.0

    return matching_percentage

# Example usage: score the `translation` produced by the translate cell
# above against a one-word reference.
reference = "kamu"
percentage = evaluate_translation(reference=reference, translation=translation)
print(f"Persentase token cocok: {percentage}%")

Persentase token cocok: 100.0%
