In [None]:
!pip install transformers==4.37.2

Collecting transformers==4.37.2
  Downloading transformers-4.37.2-py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers==4.37.2)
  Downloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m77.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.19.1
    Uninstalling tokenizers-0.19.1:
      Successfully uninstalled tokenizers-0.19.1
  Attempting uninstall: transformers
    Found existing installation: transformers 4.41.2
    Uninstalling transformers-4.41.2:
      Successfully uninstalled transformers-4.41.2
Successfully installed tokenizers-0.15.2 transformers-4.37.2


In [None]:
import os
import re
import json
import string
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, TFBertForSequenceClassification, BertConfig
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.layers import Dropout
from tensorflow.keras.layers import Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from keras.layers import Activation, Dense
import transformers

# Fixed sequence length (in tokens) of every model input. TurkishSquadDataset.preprocess
# pads shorter examples up to this length and skips longer ones; the Keras Input layers
# of the models are declared with shape (max_len,).
max_len = 512

In [None]:
# Mount Google Drive at /content/drive; the dataset files and the Zemberek JAR
# referenced by this notebook are read from /content/drive/MyDrive/...
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
!pip install jpype1

Collecting jpype1
  Downloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (488 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m488.6/488.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jpype1
Successfully installed jpype1-1.5.0


In [None]:
from jpype import startJVM, shutdownJVM, JClass, JString, isJVMStarted

# Path to the Zemberek jar file
zemberek_jar_path = "/content/drive/MyDrive/datasets/zemberek-full.jar"

# Check if the JAR file exists
if not os.path.exists(zemberek_jar_path):
    raise FileNotFoundError(f"The specified JAR file path does not exist: {zemberek_jar_path}")

# Start the JVM with Zemberek on the class path.
# startJVM raises if a JVM is already running in this process, so guard the call
# to keep the cell safe to re-run inside the same kernel session.
if not isJVMStarted():
    startJVM("-ea", f"-Djava.class.path={zemberek_jar_path}")

TurkishMorphology = JClass('zemberek.morphology.TurkishMorphology')
#WordAnalysis = JClass('zemberek.morphology.analysis.WordAnalysis')

# Initialize the morphology object
morphology = TurkishMorphology.createWithDefaults()


In [None]:
# Set up the BERT tokenizer

In [None]:
# Project folder on Google Drive. Drive is mounted at /content/drive (see the
# drive.mount call above), so the path must start with that mount point; a
# "/content/gdrive/..." path is not on Drive and anything saved there would only
# land on the runtime's temporary disk.
path = "/content/drive/MyDrive/TurkishQA/"
# Directory under which tokenizer/model artefacts are stored.
models_path = path + "models/"
# Hugging Face Hub identifier of the pretrained Turkish BERT checkpoint.
MODEL_NAME = "dbmdz/bert-base-turkish-cased"

In [None]:
# Fetch the transformers tokenizer for MODEL_NAME; it is only used to write its
# vocabulary files to disk so the `tokenizers` WordPiece implementation can load them.
slow_tokenizer = BertTokenizer.from_pretrained(MODEL_NAME, do_lower_case=False)
splitted_model =  MODEL_NAME.split("/")
save_path = models_path + splitted_model[0] + "-" + splitted_model[1] + "/"

# exist_ok=True makes the call a no-op when the directory is already there,
# without a separate (racy) existence check.
os.makedirs(save_path, exist_ok=True)

slow_tokenizer.save_pretrained(save_path)
# This tokenizer's encodings expose per-token character offsets, which
# TurkishSquadDataset.preprocess relies on to locate the answer tokens.
tokenizer = BertWordPieceTokenizer(save_path + "vocab.txt", lowercase=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/251k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

In [None]:
# Preprocess The Data

In [None]:
class TurkishSquadDataset:
    """One SQuAD-style question/answer example and its fixed-length encoding.

    The constructor only stores the raw fields. ``preprocess()`` tokenizes the
    example with the module-level ``tokenizer`` and pads it to the module-level
    ``max_len``. On success it sets ``input_ids``, ``token_type_ids``,
    ``attention_mask``, ``start_token_idx``, ``end_token_idx`` and
    ``context_token_to_char``. When the example cannot be encoded it sets
    ``skip = True`` and leaves those attributes unset.
    """

    def __init__(self, question, context, start_char_idx, answer_text, all_answers):
        """Store the raw example.

        Args:
            question: Question text.
            context: Paragraph the answer is taken from.
            start_char_idx: Character offset of the answer inside ``context``
                (converted with ``int``).
            answer_text: Text of the reference answer.
            all_answers: Texts of every reference answer for the question.
        """
        self.question = question
        self.context = context
        self.start_char_idx = int(start_char_idx)
        self.answer_text = answer_text
        self.all_answers = all_answers
        self.skip = False

    def normalize_and_lemmatize(self, text):
        """Return ``text`` with each analysed word replaced by its first lemma.

        Words for which the analysis yields no lemmas are kept via ``getWord()``.
        The pieces are joined with single spaces.

        NOTE(review): this helper is not called by ``preprocess`` or by any other
        visible cell, and the Zemberek method names used here have not been
        checked against the library -- confirm before relying on it.
        """
        analysis = morphology.analyzeSentence(JString(text))
        result = []
        for word_analysis in analysis:
            lemmas = word_analysis.getLemmas()
            if lemmas:
                result.append(lemmas[0])
            else:
                result.append(word_analysis.getWord())
        return ' '.join(result)

    def _locate_answer_start(self, raw_context, context, answer):
        """Return the answer's start offset inside the whitespace-collapsed context.

        ``self.start_char_idx`` is kept whenever it already points at ``answer``
        in ``context``. Otherwise the offset is re-mapped by collapsing the
        whitespace of the raw prefix that precedes the answer; the re-mapped
        value is used only if the answer text is really found there. In every
        other case the stored offset is returned unchanged.
        """
        start = self.start_char_idx
        if start < 0 or context[start:start + len(answer)] == answer:
            return start

        prefix = raw_context[:start]
        collapsed_prefix = " ".join(prefix.split())
        remapped = len(collapsed_prefix)
        # A whitespace run between the prefix and the answer survives as exactly
        # one space in the collapsed context.
        if collapsed_prefix and prefix[-1:].isspace():
            remapped += 1

        if context[remapped:remapped + len(answer)] == answer:
            return remapped
        return start

    def preprocess(self):
        """Tokenize the example and compute the answer's token span.

        Sets ``self.skip = True`` and returns early when the answer offset is
        negative or runs past the context, when no context token overlaps the
        answer, or when context plus question is longer than ``max_len`` tokens.
        """
        # Clean context, answer and question (collapse every whitespace run).
        raw_context = str(self.context)
        context = " ".join(raw_context.split())
        question = " ".join(str(self.question).split())
        answer = " ".join(str(self.answer_text).split())

        # Collapsing whitespace can shift character positions, so align the
        # stored offset with the cleaned context before using it.
        start_char_idx = self._locate_answer_start(raw_context, context, answer)
        end_char_idx = start_char_idx + len(answer)  # exclusive end

        # A negative offset would silently index from the end of the context.
        # The end offset is exclusive, so it may equal len(context) when the
        # answer is the final span of the paragraph.
        if start_char_idx < 0 or end_char_idx > len(context):
            self.skip = True
            return

        # Tokenize context
        tokenized_context = tokenizer.encode(context)

        # Keep the tokens whose character span overlaps [start_char_idx, end_char_idx).
        # Zero-width spans never overlap anything.
        ans_token_idx = [
            idx
            for idx, (tok_start, tok_end) in enumerate(tokenized_context.offsets)
            if max(tok_start, start_char_idx) < min(tok_end, end_char_idx)
        ]
        if not ans_token_idx:
            self.skip = True
            return

        # Tokenize question
        tokenized_question = tokenizer.encode(question)
        # Drop the question's first id (presumably its leading special token --
        # TODO confirm) because the context encoding already starts the sequence.
        question_ids = tokenized_question.ids[1:]

        # Create inputs: context segment (type 0) followed by question segment (type 1).
        input_ids = tokenized_context.ids + question_ids
        token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(question_ids)
        attention_mask = [1] * len(input_ids)

        # Pad with zeros up to max_len; skip the example if truncation would be needed.
        padding_length = max_len - len(input_ids)
        if padding_length < 0:
            self.skip = True
            return
        if padding_length > 0:
            input_ids = input_ids + ([0] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length)
            token_type_ids = token_type_ids + ([0] * padding_length)

        self.input_ids = input_ids
        self.token_type_ids = token_type_ids
        self.attention_mask = attention_mask
        self.start_token_idx = ans_token_idx[0]
        self.end_token_idx = ans_token_idx[-1]
        self.context_token_to_char = tokenized_context.offsets

In [None]:
def read_json(dataset_path):
    """Parse the UTF-8 JSON file at ``dataset_path`` and return the decoded object."""
    with open(dataset_path, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

def json_to_list(json_dataset):
    """Flatten a SQuAD-style JSON document into ``TurkishSquadDataset`` examples.

    Args:
        json_dataset: Dict with a ``"data"`` list. Each entry has ``"paragraphs"``,
            each paragraph has a ``"context"`` and a ``"qas"`` list, and each qa
            has a ``"question"`` and an ``"answers"`` list of
            ``{"text", "answer_start"}`` dicts.

    Returns:
        A list with one ``TurkishSquadDataset`` per question that has at least one
        answer. The first answer supplies the training target; the texts of all
        answers are passed along as ``all_answers``.
    """
    dataset = []
    for item in json_dataset["data"]:
        for paragraph in item["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                answers = qa.get("answers") or []
                # Questions without any answer have no span to train on; indexing
                # answers[0] on them would raise IndexError, so skip them.
                if not answers:
                    continue
                first_answer = answers[0]
                all_answers = [answer["text"] for answer in answers]
                dataset.append(
                    TurkishSquadDataset(
                        qa["question"],
                        context,
                        first_answer["answer_start"],
                        first_answer["text"],
                        all_answers,
                    )
                )
    print("Number of questions: ", len(dataset))
    return dataset

def create_input_targets(dataset):
    """Stack the encoded fields of all non-skipped examples into NumPy arrays.

    Args:
        dataset: Iterable of preprocessed examples exposing ``skip`` and, when
            not skipped, ``input_ids``, ``token_type_ids``, ``attention_mask``,
            ``start_token_idx`` and ``end_token_idx``.

    Returns:
        ``(x, y)`` where ``x`` is ``[input_ids, token_type_ids, attention_mask]``
        and ``y`` is ``[start_token_idx, end_token_idx]``, each entry an array
        with one row/value per kept example.
    """
    feature_names = ("input_ids", "token_type_ids", "attention_mask")
    target_names = ("start_token_idx", "end_token_idx")

    kept = [example for example in dataset if not example.skip]
    columns = {
        field: np.array([getattr(example, field) for example in kept])
        for field in feature_names + target_names
    }

    x = [columns[field] for field in feature_names]
    y = [columns[field] for field in target_names]
    return x, y

def find_max_length(dataset):
    """Report the longest combined context + question encoding in ``dataset``.

    For each element the question and the context are encoded with the
    module-level ``tokenizer``; the length counted is all context ids plus the
    question ids without their first entry. Prints the maximum and the index of
    the first element that reaches it, and returns the maximum (0 for an empty
    dataset).
    """
    longest, longest_at = 0, 0
    for position, example in enumerate(dataset):
        question_ids = tokenizer.encode(example.question).ids
        context_ids = tokenizer.encode(example.context).ids
        combined = context_ids + question_ids[1:]
        if len(combined) > longest:
            longest, longest_at = len(combined), position

    print("Max length: {}, Index: {}".format(longest, longest_at))
    return longest

def train_test_split(dataset, test_fraction=0.1, seed=None):
    """Shuffle ``dataset`` in place and split it into train and test lists.

    Args:
        dataset: List of examples. It is shuffled in place.
        test_fraction: Share of the examples placed in the test split
            (default 0.1).
        seed: Optional seed. When given, a dedicated ``random.Random(seed)``
            does the shuffle so the split is reproducible; when ``None`` the
            global ``random`` state is used.

    Returns:
        ``(train, test)``. ``test`` holds the last
        ``int(len(dataset) * test_fraction)`` shuffled items and ``train`` the
        rest. For inputs too small to yield a test item, ``test`` is empty and
        ``train`` keeps every example.
    """
    if seed is None:
        random.shuffle(dataset)
    else:
        random.Random(seed).shuffle(dataset)

    cut = int(len(dataset) * test_fraction)
    # Slice with an explicit positive index: when cut == 0, dataset[:-0] is an
    # empty list and dataset[-0:] is the whole list, which would swap the splits.
    split_at = len(dataset) - cut
    train, test = dataset[:split_at], dataset[split_at:]

    return train, test


In [None]:
def create_model():
    """Build and compile the span-extraction model on top of pretrained BERT.

    The model takes three int32 inputs of shape ``(max_len,)`` named
    ``input_ids``, ``token_type_ids`` and ``attention_mask``. The first output
    of the ``MODEL_NAME`` encoder is fed to two bias-free ``Dense(1)`` heads
    whose flattened scores are softmax-normalised over the sequence positions.

    Returns:
        A compiled ``tf.keras.Model`` with outputs ``[start_probs, end_probs]``,
        trained with sparse categorical cross-entropy on both outputs and Adam
        at a learning rate of 3e-5.
    """
    kl = tf.keras.layers
    bert = TFBertModel.from_pretrained(MODEL_NAME, output_hidden_states=True, output_attentions=True)

    # Inputs are created in the same order the model later lists them.
    ids_in = kl.Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    types_in = kl.Input(shape=(max_len,), dtype=tf.int32, name="token_type_ids")
    mask_in = kl.Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

    encoder_outputs = bert(
        input_ids=ids_in, token_type_ids=types_in, attention_mask=mask_in
    )
    token_states = encoder_outputs[0]

    # One scalar score per token for the answer start, flattened to (batch, max_len).
    start_scores = kl.Dense(1, name="start_logit", use_bias=False)(token_states)
    start_scores = kl.Flatten()(start_scores)

    # Same for the answer end.
    end_scores = kl.Dense(1, name="end_logit", use_bias=False)(token_states)
    end_scores = kl.Flatten()(end_scores)

    softmax = tf.keras.activations.softmax
    start_probs = kl.Activation(softmax, name="start_probs")(start_scores)
    end_probs = kl.Activation(softmax, name="end_probs")(end_scores)

    qa_model = tf.keras.Model(
        inputs=[ids_in, types_in, mask_in],
        outputs=[start_probs, end_probs],
    )

    # The heads already output probabilities, hence from_logits=False.
    span_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    qa_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
        loss=[span_loss, span_loss],
    )

    return qa_model

In [None]:
def create_model_complex():
    """Build and compile a span-extraction model with a deeper head on BERT.

    The first output of the ``MODEL_NAME`` encoder goes through
    Dropout(0.1) -> Dense(256, relu) -> Dropout(0.1) -> Dense(128, relu) ->
    Dropout(0.1). Two bias-free ``Dense(1)`` heads then score every token as
    answer start / answer end, and the flattened scores are softmax-normalised
    over the sequence positions.

    Returns:
        A compiled ``tf.keras.Model`` taking ``[input_ids, token_type_ids,
        attention_mask]`` (int32, shape ``(max_len,)``) and returning
        ``[start_probs, end_probs]``, trained with sparse categorical
        cross-entropy on both outputs and Adam at a learning rate of 3e-5.
    """
    encoder = TFBertModel.from_pretrained(MODEL_NAME, output_hidden_states=True, output_attentions=True)

    input_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32)
    token_type_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32)
    attention_mask = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32)

    embedding = encoder(
        input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
    )[0]

    dropout = Dropout(0.1)(embedding)
    dense1 = Dense(256, activation='relu')(dropout)
    dropout1 = Dropout(0.1)(dense1)
    dense2 = Dense(128, activation='relu')(dropout1)
    dropout2 = Dropout(0.1)(dense2)

    # Both heads must read the output of the Dense stack (dropout2). Wiring them
    # to the first dropout would leave dense1/dense2 disconnected from the model
    # outputs, reducing this function to the plain create_model architecture.
    start_logits = tf.keras.layers.Dense(1, name="start_logit", use_bias=False)(dropout2)
    start_logits = tf.keras.layers.Flatten()(start_logits)

    end_logits = tf.keras.layers.Dense(1, name="end_logit", use_bias=False)(dropout2)
    end_logits = tf.keras.layers.Flatten()(end_logits)

    start_probs = tf.keras.layers.Activation(tf.keras.activations.softmax)(start_logits)
    end_probs = tf.keras.layers.Activation(tf.keras.activations.softmax)(end_logits)

    model = tf.keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[start_probs, end_probs],
    )
    # The heads already output probabilities, hence from_logits=False.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    model.compile(optimizer=optimizer, loss=[loss, loss])
    return model

In [None]:
# Load the dataset

In [None]:
#train_path = "/content/drive/MyDrive/datasets/train-v0.1.json"
#eval_path = "/content/drive/MyDrive/datasets/dev-v0.1.json"

In [None]:
# SQuAD-style JSON files on the mounted Drive. The three train_path* files are
# merged into a single training corpus in the loading cell below.
train_path = "/content/drive/MyDrive/datasets/QA/final_train_data_v2.json"
train_path_2 = "/content/drive/MyDrive/datasets/QA/xquad_tr.json"
train_path_3 = "/content/drive/MyDrive/datasets/QA/corrected_train-v0.1.json"
#train_path_4 = "/content/drive/MyDrive/datasets/QA/squad-tr-train-v1.0.0-cleaned.json"
# NOTE(review): eval_path is defined here but not read by any visible cell; the
# held-out split is carved out of the merged training data instead.
eval_path = "/content/drive/MyDrive/datasets/QA/final_dev_data_v2.json"

In [None]:
# Read the three source files and merge their "data" lists into one corpus.
json_dataset = read_json(train_path)
json_dataset_2 = read_json(train_path_2)
json_dataset_3 = read_json(train_path_3)
#json_dataset_4 = read_json(train_path_4)


merged_dataset = {
    "data": json_dataset["data"] + json_dataset_2["data"] + json_dataset_3["data"] #+ json_dataset_4["data"]
}


dataset = json_to_list(merged_dataset)
# Shallow copy: `data` is a new list holding the same example objects as `dataset`.
data = dataset.copy()

# Tokenize every example in place; examples that cannot be encoded get skip=True.
for item in data:
    item.preprocess()

x, y = create_input_targets(data)

# train_test_split shuffles with Python's `random` module. Seed it first so the
# split -- and therefore the reported metrics -- can be reproduced on a re-run.
random.seed(42)
train_dataset, test_dataset = train_test_split(data)
x_train, y_train = create_input_targets(train_dataset)
x_test, y_test = create_input_targets(test_dataset)

Number of questions:  23719


In [None]:
# BertConfig built with its default arguments.
# NOTE(review): `configuration` is not referenced by any other visible cell; the
# models load their configuration through TFBertModel.from_pretrained(MODEL_NAME).
configuration = BertConfig()

In [None]:
#pip install --upgrade tensorflow transformers

In [None]:
# Show the logical TPU devices TensorFlow can see (an empty list means none).
print("All TPU devices: ", tf.config.list_logical_devices('TPU'))

All TPU devices:  [LogicalDevice(name='/device:TPU:0', device_type='TPU'), LogicalDevice(name='/device:TPU:1', device_type='TPU'), LogicalDevice(name='/device:TPU:2', device_type='TPU'), LogicalDevice(name='/device:TPU:3', device_type='TPU'), LogicalDevice(name='/device:TPU:4', device_type='TPU'), LogicalDevice(name='/device:TPU:5', device_type='TPU'), LogicalDevice(name='/device:TPU:6', device_type='TPU'), LogicalDevice(name='/device:TPU:7', device_type='TPU')]


In [None]:
# Try to attach to a TPU; on failure fall back to the default device placement.
use_tpu = True
if use_tpu:
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        strategy = tf.distribute.TPUStrategy(tpu)
    except ValueError as e:
        print("TPU'ya bağlanılamadı:", e)
        print("Alternatif kaynağa geçiliyor...")
        use_tpu = False

# Build the model exactly once. `use_tpu` is still True only if the TPU strategy
# was created, in which case the model must be built inside its scope so the
# variables are placed on the TPU replicas. Otherwise build it normally.
if use_tpu:
    with strategy.scope():
        model = create_model()
else:
    model = create_model()

model.summary()

Running on TPU  


tf_model.h5:   0%|          | 0.00/545M [00:00<?, ?B/s]

Some layers from the model checkpoint at dbmdz/bert-base-turkish-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at dbmdz/bert-base-turkish-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 512)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 512)]                0         []                            
 )                                                                                                
                                                                                                  
 token_type_ids (InputLayer  [(None, 512)]                0         []                            
 )                                                                                                
                                                                                              

In [None]:
from sklearn.metrics import f1_score, accuracy_score
import numpy as np

def compute_exact(a_gold, a_pred):
    """Return 1 when the two answers compare equal, otherwise 0."""
    is_match = a_gold == a_pred
    return int(is_match)

def evaluate(model, x_test, y_test):
    """Score the model's predicted answer spans against the reference spans.

    Args:
        model: Object whose ``predict(x_test)`` returns the start and end
            position scores as its first two outputs.
        x_test: Model inputs, passed to ``model.predict`` unchanged.
        y_test: ``[start_true, end_true]`` reference token indices.

    Returns:
        ``(avg_f1_score, accuracy, exact_match_rate)``. F1 is computed per
        example on the sets of token positions covered by the predicted and
        reference spans. Accuracy is the mean of start-index accuracy and
        end-index accuracy. Exact match requires both indices to be right. The
        three numbers are also printed.
    """
    predictions = model.predict(x_test)
    start_preds = np.argmax(predictions[0], axis=1)
    end_preds = np.argmax(predictions[1], axis=1)
    start_true, end_true = y_test[0], y_test[1]

    exact_flags = []
    span_f1s = []

    for row in range(len(start_true)):
        predicted = (start_preds[row], end_preds[row])
        gold = (start_true[row], end_true[row])

        exact_flags.append(compute_exact(predicted, gold))

        # Token positions covered by each span (both ends inclusive). A predicted
        # end before the predicted start yields an empty set and therefore F1 = 0.
        predicted_span = set(range(predicted[0], predicted[1] + 1))
        gold_span = set(range(gold[0], gold[1] + 1))
        shared = predicted_span & gold_span

        if shared:
            precision = len(shared) / len(predicted_span)
            recall = len(shared) / len(gold_span)
            span_f1 = 2 * (precision * recall) / (precision + recall)
        else:
            span_f1 = 0
        span_f1s.append(span_f1)

    exact_match_rate = np.mean(exact_flags)
    avg_f1_score = np.mean(span_f1s)

    start_accuracy = accuracy_score(start_true, start_preds)
    end_accuracy = accuracy_score(end_true, end_preds)
    accuracy = start_accuracy * 0.5 + end_accuracy * 0.5

    print("Exact Match: {:.4f}".format(exact_match_rate))
    print("F1 Score: {:.4f}".format(avg_f1_score))
    print("Accuracy: {:.4f}".format(accuracy))

    return avg_f1_score, accuracy, exact_match_rate

In [None]:
# All three callbacks watch the validation loss.
# Scale the learning rate by 0.2 (never below 1e-7) after 2 epochs without improvement.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-7)
# Stop after 3 epochs without improvement and roll back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# Write the model to best_model.h5 only when the validation loss reaches a new minimum.
checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, mode='min')

In [None]:
# Fine-tune for at most 5 epochs with the callbacks defined above.
# NOTE(review): validation_data is the same (x_test, y_test) split that the
# evaluate(...) cell scores afterwards. Since the callbacks select weights on
# val_loss, the reported metrics are measured on the data used for model
# selection -- confirm whether a separate held-out set is intended.
model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=5,
    verbose=2,
    batch_size=32,
    callbacks=[reduce_lr, early_stopping, checkpoint],
)

Epoch 1/5


  saving_api.save_model(


588/588 - 184s - loss: 0.3928 - start_probs_loss: 0.1485 - end_probs_loss: 0.2443 - val_loss: 1.0743 - val_start_probs_loss: 0.4605 - val_end_probs_loss: 0.6137 - lr: 3.0000e-05 - 184s/epoch - 313ms/step
Epoch 2/5
588/588 - 113s - loss: 0.2472 - start_probs_loss: 0.0907 - end_probs_loss: 0.1566 - val_loss: 1.1292 - val_start_probs_loss: 0.4527 - val_end_probs_loss: 0.6764 - lr: 3.0000e-05 - 113s/epoch - 191ms/step
Epoch 3/5
588/588 - 111s - loss: 0.1999 - start_probs_loss: 0.0773 - end_probs_loss: 0.1226 - val_loss: 1.1154 - val_start_probs_loss: 0.4599 - val_end_probs_loss: 0.6555 - lr: 3.0000e-05 - 111s/epoch - 188ms/step
Epoch 4/5
588/588 - 117s - loss: 0.0990 - start_probs_loss: 0.0373 - end_probs_loss: 0.0618 - val_loss: 1.1775 - val_start_probs_loss: 0.5003 - val_end_probs_loss: 0.6772 - lr: 6.0000e-06 - 117s/epoch - 198ms/step


<keras.src.callbacks.History at 0x7a2784e0ebf0>

In [None]:
# Print exact match, F1 and accuracy on the held-out split; the cell's value is
# the (avg_f1_score, accuracy, exact_match_rate) tuple returned by evaluate.
evaluate(model, x_test, y_test)

Exact Match: 0.8509
F1 Score: 0.9233
Accuracy: 0.8978


(0.9232514293660464, 0.8978084802286803, 0.8508813720819438)