In [9]:
!wandb off

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


W&B offline. Running your script from this directory will only write metadata locally. Use wandb disabled to completely turn off W&B.


In [10]:
import transformers
from transformers import AutoTokenizer

model_name = "hatmimoha/arabic-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [11]:
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = file.read().split('\n\n')  # Split sentences
    sentences = []
    labels = []
    for item in data:
        words = []
        tags = []
        lines = item.splitlines()
        for line in lines:
            if line:
                word, tag = line.split()
                words.append(word)
                tags.append(tag)
        sentences.append(words)
        labels.append(tags)
    return sentences, labels

In [12]:
def tokenize_and_align_labels(sentences, labels):
    tokenized_inputs = []
    tokenized_labels = []

    for sentence, label in zip(sentences, labels):
        # Tokenize the input sentence with word-level tokenization
        tokenized_input = tokenizer(sentence, 
                                    is_split_into_words=True, 
                                    padding='max_length', 
                                    max_length=64, 
                                    truncation=True,
                                    return_tensors='pt')

        word_ids = tokenized_input.word_ids()  # Map tokens back to their word index
        label_ids = []
        
        previous_word_idx = None

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Special tokens or padding tokens
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])  # Take the original label
            else:
                label_ids.append(label[word_idx].replace('B-', 'I-'))  # Make sure to align subwords with 'I-'
            
            previous_word_idx = word_idx

        tokenized_inputs.append(tokenized_input)
        tokenized_labels.append(label_ids)

    return tokenized_inputs, tokenized_labels

In [13]:
def create_label_mappings(labels):
    unique_labels = set(label for sublist in labels for label in sublist)
    label_to_id = {label: i for i, label in enumerate(unique_labels)}
    id_to_label = {i: label for label, i in label_to_id.items()}
    return label_to_id, id_to_label

def convert_labels_to_ids(labels, label_to_id):
    return [[label_to_id.get(label, -100) for label in sublist] for sublist in labels]

# def format_data_for_transformers(tokenized_inputs, tokenized_labels):
#     input_ids = [input["input_ids"] for input in tokenized_inputs]
#     attention_masks = [input["attention_mask"] for input in tokenized_inputs]
#     token_type_ids = [input["token_type_ids"] for input in tokenized_inputs]

#     formatted_data = []
#     for i in range(len(input_ids)):
#         formatted_data.append({
#             "input_ids": input_ids[i],
#             "attention_mask": attention_masks[i],
#             "token_type_ids": token_type_ids[i],
#             "labels": torch.tensor(tokenized_labels[i])
#         })
    
#     return formatted_data

In [14]:
from sklearn.utils import shuffle

X, y = load_data("/kaggle/input/ner-tuning-keemet/processed_final_c.txt")
X, y = shuffle(X, y, random_state=42)

label_to_id, id_to_label = create_label_mappings(y)

X_train, y_train = X[:128], y[:128]
X_test, y_test = X[128:], y[128:]

X_train_tokenized, y_train_tokenized = tokenize_and_align_labels(X_train, y_train)
X_test_tokenized, y_test_tokenized = tokenize_and_align_labels(X_test, y_test)

y_train_ids = convert_labels_to_ids(y_train_tokenized, label_to_id)
y_test_ids = convert_labels_to_ids(y_test_tokenized, label_to_id)

In [16]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import Trainer, TrainingArguments
import torch

# Load the model and tokenizer
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Freeze the first 6 layers
for param in model.bert.encoder.layer[:6].parameters():
    param.requires_grad = False

2024-08-13 00:52:41.353420: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-13 00:52:41.353533: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-13 00:52:41.487490: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [17]:
from torch.utils.data import Dataset

class TokenizedDataset(Dataset):
    def __init__(self, X_tokenized, y_labels):
        self.input_ids = [item['input_ids'].squeeze(0) for item in X_tokenized]
        self.token_type_ids = [item['token_type_ids'].squeeze(0) for item in X_tokenized]
        self.attention_mask = [item['attention_mask'].squeeze(0) for item in X_tokenized]
        self.labels = [torch.tensor(label) for label in y_labels]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'token_type_ids': self.token_type_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }


train_dataset = TokenizedDataset(X_train_tokenized, y_train_ids)
test_dataset = TokenizedDataset(X_test_tokenized, y_test_ids)

In [18]:
from torch.utils.data import DataLoader

loader = DataLoader(train_dataset, batch_size=2, shuffle=False)

In [19]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    num_train_epochs=50,
    weight_decay=0.01,
    logging_steps=10,
    eval_steps=10,
    logging_dir='./logs',
#     no_cuda=True
    load_best_model_at_end = True,
    metric_for_best_model = 'eval_f1',
    save_strategy = "epoch",
    save_total_limit=1,
)




In [20]:
from transformers import EvalPrediction
from sklearn.metrics import classification_report

def compute_metrics(p: EvalPrediction):
    predictions, labels = p.predictions, p.label_ids
    predictions = predictions.argmax(axis=-1)
    
    # Flatten the sequences for classification_report
    true_labels = [label for doc in labels for label in doc if label != -100]
    pred_labels = [pred for doc, true_doc in zip(predictions, labels) for pred, true in zip(doc, true_doc) if true != -100]
    
    # Generate the classification report
    report = classification_report(true_labels, pred_labels, labels=list(label_to_id.values()), target_names=list(label_to_id.keys()), output_dict=True, zero_division=0)
    
    # Return the metrics of interest
    return {
        'precision': report['macro avg']['precision'],
        'recall': report['macro avg']['recall'],
        'f1': report['macro avg']['f1-score']
    }

def compute_metrics_2(p: EvalPrediction):
    predictions, labels = p.predictions, p.label_ids
    predictions = predictions.argmax(axis=-1)
    
    # Flatten the sequences for classification_report
    true_labels = [label for doc in labels for label in doc if label != -100]
    pred_labels = [pred for doc, true_doc in zip(predictions, labels) for pred, true in zip(doc, true_doc) if true != -100]
    
    # Generate the classification report
    report = classification_report(true_labels, pred_labels, labels=list(label_to_id.values()), target_names=list(label_to_id.keys()), output_dict=False, zero_division=0)
    
    print(report)
    # Return the metrics of interest
    return {"done": 1}

In [21]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)


In [22]:
trainer.train()





Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,6.508733,0.022727,0.000518,0.001012
2,7.527600,3.179518,0.035839,0.031832,0.033717
3,2.863700,1.900326,0.04181,0.056474,0.045018
4,1.738000,1.492466,0.156338,0.138372,0.110409
5,1.257500,1.229505,0.252598,0.176951,0.153734
6,1.257500,1.031446,0.328426,0.32785,0.29013
7,0.967700,0.875782,0.448541,0.386006,0.354163
8,0.782400,0.767917,0.455407,0.449579,0.419344
9,0.592300,0.700209,0.572263,0.548955,0.526949
10,0.478000,0.611884,0.679147,0.66215,0.654868


TrainOutput(global_step=400, training_loss=0.4620898695103824, metrics={'train_runtime': 185.0299, 'train_samples_per_second': 34.589, 'train_steps_per_second': 2.162, 'total_flos': 209069533593600.0, 'train_loss': 0.4620898695103824, 'epoch': 50.0})

In [36]:
trainer.evaluate()



              precision    recall  f1-score   support

     B-Units       0.82      0.74      0.78        38
       I-Age       0.88      1.00      0.93        21
    B-Prices       0.84      0.91      0.87        23
  I-Currency       0.82      0.54      0.65        26
  I-Quantity       0.50      0.67      0.57         3
     I-Units       0.88      0.80      0.84        65
     I-Dates       0.94      0.89      0.91        53
     I-Times       0.58      0.88      0.70        43
     B-Dates       0.86      0.71      0.77        17
     B-Times       0.56      0.77      0.65        13
       B-Age       1.00      1.00      1.00         6
  B-Currency       0.64      0.56      0.60        16
    B-Colors       1.00      0.38      0.55         8
  B-Quantity       0.81      0.74      0.77        34
    I-Prices       0.44      0.57      0.50        21
           O       0.97      0.96      0.96       483

    accuracy                           0.88       870
   macro avg       0.78   

{'eval_loss': 0.5492228269577026,
 'eval_done': 1,
 'eval_runtime': 0.1924,
 'eval_samples_per_second': 306.675,
 'eval_steps_per_second': 41.583}