In [None]:
import torch
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel

In [None]:
# Run once if the package is not installed: %pip install transformers

In [None]:
import spacy
from spacy.tokens import Doc , Span
import re

In [None]:
# Resolve the TPU cluster, connect to it, initialise the TPU system, and build
# a distribution strategy from it. `strategy` is used further down to create
# and compile the model inside `strategy.scope()`.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
strategy = tf.distribute.TPUStrategy(tpu)

In [None]:
def read_dataset(file_path):
    """Read a dash-delimited text file into a list of example strings.

    A line that starts with ``-`` (after stripping surrounding whitespace)
    opens a new example. Every following non-blank line is treated as a
    continuation and joined to the current example with a single space.
    Blank lines are skipped so they cannot inject trailing/double spaces or
    produce an empty example. Lines that appear before the first ``-`` line
    are still collected into an example of their own.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        A list of example strings in file order.
    """
    examples = []
    example = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank separator line: carries no content, so ignore it.
                continue
            if line.startswith('-'):
                # A new example begins; flush the one collected so far.
                if example:
                    examples.append(' '.join(example))
                example = [line]
            else:
                example.append(line)
    # Flush the final example, which has no following '-' line to trigger it.
    if example:
        examples.append(' '.join(example))
    return examples

In [None]:
def extract_entities(text):
    """Return every ``[entity](tag)`` annotation found in ``text``.

    Each annotation is reported as an ``(entity, tag)`` tuple, in the order
    the annotations appear in the string. An empty list is returned when the
    text contains no annotation.
    """
    annotation_pattern = re.compile(r'\[(.*?)\]\((.*?)\)')
    return [hit.groups() for hit in annotation_pattern.finditer(text)]

In [None]:
def preprocess_data(data):
    """Convert annotated lines into ``(text, {"entities": [...]})`` pairs.

    Each line may contain ``[entity](tag)`` annotations. For every annotation
    a ``(start, end, tag)`` triple is recorded, where ``start``/``end`` are
    character offsets into the returned text such that
    ``text[start:end] == entity``. The returned text is the line exactly as
    given (annotation markup included), so offsets index into that raw line.

    Offsets are taken from the regex match itself rather than from a text
    search, so the span no longer starts on the opening bracket and repeated
    entity strings each get their own position.

    Args:
        data: Iterable of annotated strings.

    Returns:
        A list with one ``(text, {"entities": [(start, end, tag), ...]})``
        tuple per input line, in input order.
    """
    # Same annotation syntax that extract_entities() recognises.
    annotation_pattern = re.compile(r'\[(.*?)\]\((.*?)\)')
    training_data = []
    for line in data:
        ent_list = [
            # Group 1 is the entity text, group 2 is its tag.
            (match.start(1), match.end(1), match.group(2))
            for match in annotation_pattern.finditer(line)
        ]
        training_data.append((line, {"entities": ent_list}))
    return training_data

In [None]:
# Read the annotated examples from disk, then attach the entity spans parsed
# from each example's [entity](tag) markup.
file_path = 'labeled.txt'
examples = read_dataset(file_path)
training_data = preprocess_data(examples)

In [None]:
# Pretrained GPT-2 tokenizer; used below for both the input and the target texts.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [None]:
# Model inputs: the example text of every training pair.
input_texts = [text for text, _ in training_data]

# Targets: the example text followed by one "[<ENT_START>] tag [<ENT_END>]"
# segment per annotated entity. Exactly one target is built per example so
# input_texts and output_texts stay aligned index-for-index.
# The second element of each pair is a dict ({"entities": [...]}); iterating it
# directly yields only its key, so the tags are read from the "entities" list.
output_texts = [
    text + "".join(
        f" [<ENT_START>] {tag} [<ENT_END>]"
        for _start, _end, tag in annotations["entities"]
    )
    for text, annotations in training_data
]

In [None]:
# Reuse the end-of-sequence token as the padding token; the tokenizer calls
# below request padded batches.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# First tokenisation pass with padding=True. The padded widths of these two
# results are read in the next cell to choose a common max_length.
input_encodings = tokenizer(input_texts, return_tensors="tf", padding=True, truncation=True)
output_encodings = tokenizer(output_texts, return_tensors="tf", padding=True, truncation=True)

In [None]:
# Take the larger of the two padded widths, then tokenise both text lists
# again with padding="max_length" and that shared max_length.
max_sequence_length = max(input_encodings["input_ids"].shape[1], output_encodings["input_ids"].shape[1])
input_encodings = tokenizer(input_texts, return_tensors="tf", padding="max_length", truncation=True, max_length=max_sequence_length)
output_encodings = tokenizer(output_texts, return_tensors="tf", padding="max_length", truncation=True, max_length=max_sequence_length)

In [None]:
# Token ids of the target texts; the split cell slices these into the
# train / validation / test labels.
output_ids = output_encodings["input_ids"]
# NumPy copy of the same ids.
# NOTE(review): `labels` is not referenced by any later cell visible here (the
# splits slice `output_ids`) — confirm whether it is still needed.
labels = output_ids.numpy().copy()

In [None]:
#output_texts

In [None]:
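# Disabled: would replace every pad-token id in `labels` with -100
# (commonly used as an ignore index — TODO confirm the loss in use honours it).
#labels[labels == tokenizer.pad_token_id] = -100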

In [None]:
# 80 / 10 / 10 split sizes. int() truncates, so the three sizes can add up to
# fewer than the total number of examples.
train_size = int(0.8 * len(input_encodings["input_ids"]))
val_size = int(0.1 * len(input_encodings["input_ids"]))
test_size = int(0.1 * len(input_encodings["input_ids"]))

In [None]:
# Split the encoded inputs and the target ids into contiguous
# train / validation / test slices: train first, validation next, test last.
num_examples = len(input_encodings["input_ids"])
val_end = train_size + val_size
# Slice the test split from an explicit start index. `v[-test_size:]` returns
# the *whole* array when test_size == 0 (because -0 == 0), which would silently
# evaluate on the training data for very small datasets.
test_start = num_examples - test_size

train_inputs = {k: v[:train_size] for k, v in input_encodings.items()}
train_labels = output_ids[:train_size]

val_inputs = {k: v[train_size:val_end] for k, v in input_encodings.items()}
val_labels = output_ids[train_size:val_end]

test_inputs = {k: v[test_start:] for k, v in input_encodings.items()}
test_labels = output_ids[test_start:]

In [None]:
# Adam optimizer with a learning rate of 3e-5; passed to model.compile() below.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)

In [None]:
# Load the pretrained GPT-2 LM-head model and compile it inside the TPU
# strategy scope created earlier.
with strategy.scope():
  model = TFGPT2LMHeadModel.from_pretrained("gpt2")
  # from_logits=True: the loss receives unnormalised scores, not probabilities.
  model.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [None]:
# Training hyperparameters.
num_epochs = 20
batch_size = 64
# Floor division: a final partial batch is not counted, and the result is 0
# when there are fewer training examples than batch_size.
steps_per_epoch = len(train_inputs["input_ids"]) // batch_size



In [None]:
# Train on the training split, evaluating on the validation split as well.
model.fit(
    train_inputs, train_labels,
    batch_size=batch_size,
    epochs=num_epochs,
    steps_per_epoch=steps_per_epoch,
    validation_data=(val_inputs, val_labels)
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x780616dff2e0>

In [None]:
# Evaluate on the held-out test split; the two returned values are unpacked as
# the loss and the 'accuracy' metric requested in model.compile().
test_loss, test_accuracy = model.evaluate(
    test_inputs,
    test_labels,
    batch_size=batch_size
)

print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

Test Loss: 0.16901470720767975
Test Accuracy: 0.9317948222160339


In [None]:
# Save into a directory relative to the current working directory. The previous
# hard-coded "C:\\Users\\admin\\Desktop" target is an absolute path on one
# specific Windows account and is not portable to other machines or runtimes.
MODEL_SAVE_DIR = "gpt2_entity_model"
model.save(MODEL_SAVE_DIR)

In [None]:
#input_text = " - 1000 manatlıq Albalı plüs kart almaq istəyirəm."

In [None]:
#input_encoding = tokenizer(input_text, return_tensors="tf", padding=True, truncation=True)

In [None]:
#predictions = model.predict(input_encoding)
