In [1]:
import os

os.chdir('..')

In [2]:
import json
import os

# File paths
labelbox_file = 'data/FineTune.ndjson'
sentences_dir = 'data/sentences/'
output_file = 'processed_final_c.txt'


# Initialize output list
output_lines = []

# Process each line in the .ndjson file
with open(labelbox_file, 'r') as f:
    for line in f:
        data = json.loads(line)
        
        # Extract relevant information
        external_id = data['data_row']['external_id']
        sentence_file_path = os.path.join(sentences_dir, external_id)
        
        # Read the sentence from the text file
        with open(sentence_file_path, 'r') as f_sentence:
            sentence = f_sentence.read().strip()
        
        # Initialize entity annotations
        annotations = data['projects']['clzoe997o0p9q071hdqdm5f51']['labels'][0]['annotations']['objects']
        
        # Tokenize the sentence into words
        words = sentence.split()
        
        # Initialize word-level annotations
        word_annotations = ['O'] * len(words)  # Default label is 'O' (Outside)
        
        # Map character-level annotations to word-level annotations
        current_char_idx = 0
        for i, word in enumerate(words):
            word_start_idx = current_char_idx
            word_end_idx = current_char_idx + len(word) - 1
            
            # Check if the word is part of an annotation
            for annotation in annotations:
                start_idx = annotation['location']['start']
                end_idx = annotation['location']['end']
                entity = annotation['name']
                
                # Correctly label words within the annotation range
                if word_end_idx >= start_idx and word_start_idx <= end_idx:
                    if word_start_idx == start_idx:
                        word_annotations[i] = f'B-{entity}'
                    else:
                        word_annotations[i] = f'I-{entity}'
            
            # Move to the next word
            current_char_idx += len(word) + 1  # +1 for the space
        
        # Create the final output for this sentence
        for word, annotation in zip(words, word_annotations):
            output_lines.append(f"{word}\t{annotation}")
        
        # Add a blank line to separate examples in the output file
        output_lines.append("")

# Write all processed data to the output file
with open(output_file, 'w') as f:
    f.write('\n'.join(output_lines))

print(f"Processed data saved to {output_file}")


Processed data saved to processed_final_c.txt


In [3]:
import transformers
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv02")




In [4]:
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = file.read().split('\n\n')  # Split sentences
    sentences = []
    labels = []
    for item in data:
        words = []
        tags = []
        lines = item.splitlines()
        for line in lines:
            if line:
                word, tag = line.split()
                words.append(word)
                tags.append(tag)
        sentences.append(words)
        labels.append(tags)
    return sentences, labels

In [39]:
def tokenize_and_align_labels(sentences, labels):
    tokenized_inputs = []
    tokenized_labels = []

    for sentence, label in zip(sentences, labels):
        # Tokenize the input sentence with word-level tokenization
        tokenized_input = tokenizer(sentence, 
                                    is_split_into_words=True, 
                                    padding='max_length', 
                                    max_length=64, 
                                    truncation=True,
                                    return_tensors='pt')

        word_ids = tokenized_input.word_ids()  # Map tokens back to their word index
        label_ids = []
        
        previous_word_idx = None

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Special tokens or padding tokens
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])  # Take the original label
            else:
                label_ids.append(label[word_idx].replace('B-', 'I-'))  # Make sure to align subwords with 'I-'
            
            previous_word_idx = word_idx

        tokenized_inputs.append(tokenized_input)
        tokenized_labels.append(label_ids)

    return tokenized_inputs, tokenized_labels


In [40]:
sentences, labels = load_data("processed_final_c.txt")

tokenized_inputs, tokenized_labels = tokenize_and_align_labels(sentences, labels)

In [41]:
tokenized_labels

[[-100,
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-Times',
  'I-Times',
  'I-Times',
  'I-Times',
  'B-Dates',
  'I-Dates',
  'I-Dates',
  'I-Dates',
  'I-Dates',
  'I-Dates',
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100],
 [-100,
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-Colors',
  'I-Colors',
  'O',
  'O',
  'O',
  'B-Prices',
  'I-Prices',
  'I-Prices',
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -10

In [28]:
def decode_and_align_labels(tokenized_inputs, tokenized_labels, tokenizer):
    sentences = []
    labels = []

    for tokenized_input, label_ids in zip(tokenized_inputs, tokenized_labels):
        tokens = tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'][0])
        word_ids = tokenized_input.word_ids()

        sentence = []
        sentence_labels = []

        previous_word_idx = None
        current_word = ''
        current_labels = []

        for token, word_idx, label_id in zip(tokens, word_ids, label_ids):
            if word_idx is None:
                continue
            elif word_idx != previous_word_idx:
                if current_word:
                    sentence.append(current_word)
                    # Choose the most common label for the word
                    most_common_label = max(set(current_labels), key=current_labels.count)
                    sentence_labels.append(most_common_label)
                current_word = token if not token.startswith('##') else token[2:]
                current_labels = [label_id]
            else:
                current_word += token[2:] if token.startswith('##') else token
                current_labels.append(label_id)

            previous_word_idx = word_idx

        if current_word:
            sentence.append(current_word)
            most_common_label = max(set(current_labels), key=current_labels.count)
            sentence_labels.append(most_common_label)

        sentences.append(sentence)
        labels.append(sentence_labels)

    return sentences, labels

In [29]:
def create_label_mappings(labels):
    unique_labels = set(label for sublist in labels for label in sublist)
    label_to_id = {label: i for i, label in enumerate(unique_labels)}
    id_to_label = {i: label for label, i in label_to_id.items()}
    return label_to_id, id_to_label

def convert_labels_to_ids(labels, label_to_id):
    return [[label_to_id.get(label, -100) for label in sublist] for sublist in labels]

def format_data_for_transformers(tokenized_inputs, tokenized_labels):
    input_ids = [input["input_ids"] for input in tokenized_inputs]
    attention_masks = [input["attention_mask"] for input in tokenized_inputs]
    token_type_ids = [input["token_type_ids"] for input in tokenized_inputs]

    formatted_data = []
    for i in range(len(input_ids)):
        formatted_data.append({
            "input_ids": input_ids[i],
            "attention_mask": attention_masks[i],
            "token_type_ids": token_type_ids[i],
            "labels": tokenized_labels[i]
        })
    
    return formatted_data


In [44]:
X, y = load_data("processed_final_c.txt")

label_to_id, id_to_label = create_label_mappings(y)

X_train, y_train = X[:150], y[:150]
X_test, y_test = X[150:], y[150:]

X_train_tokenized, y_train_tokenized = tokenize_and_align_labels(X_train, y_train)
X_test_tokenized, y_test_tokenized = tokenize_and_align_labels(X_test, y_test)

y_train_ids = convert_labels_to_ids(y_train_tokenized, label_to_id)
y_test_ids = convert_labels_to_ids(y_test_tokenized, label_to_id)

formatted_train = format_data_for_transformers(X_train_tokenized, y_train_ids)
formatted_test = format_data_for_transformers(X_test_tokenized, y_test_ids)

In [45]:
formatted_train

[{'input_ids': tensor([[    2, 16478,    82, 58860,   221, 11388, 44764, 50649, 48980, 20311,
           11258, 16299,  1179,    31,  1623, 56120, 24832,   250,  5732,  9504,
             264,    20,     3,     0,     0,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
               0,     0,     0,     0]]),
  'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
  'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
           0, 0, 0, 0, 0, 0, 

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import Trainer, TrainingArguments
import torch

# Load the model and tokenizer
model_name = "hatmimoha/arabic-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

In [None]:
from datasets import Dataset

def convert_to_dataset(formatted_data):
    # Convert to a Hugging Face Dataset
    dataset = Dataset.from_dict({
        'input_ids': [data['input_ids'] for data in formatted_data],
        'attention_mask': [data['attention_mask'] for data in formatted_data],
        'labels': [data['labels'] for data in formatted_data]
    })
    return dataset

train_dataset = convert_to_dataset(formatted_train)
test_dataset = convert_to_dataset(formatted_test)


In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=10,
    logging_dir='./logs',
    no_cuda=True
)


In [None]:
from transformers import EvalPrediction
from sklearn.metrics import classification_report

def compute_metrics(p: EvalPrediction):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)
    true_labels = [[label for label in doc if label != -100] for doc in labels]
    pred_labels = [[pred for pred, true in zip(doc, true_doc) if true != -100] for doc, true_doc in zip(predictions, true_labels)]
    report = classification_report(true_labels, pred_labels, labels=list(label_to_id.keys()), target_names=list(label_to_id.keys()), output_dict=True)
    return {
        'precision': report['macro avg']['precision'],
        'recall': report['macro avg']['recall'],
        'f1': report['macro avg']['f1-score']
    }


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # compute_metrics=compute_metrics
)


In [None]:
model = model.to('cpu')

In [None]:
trainer.train()


In [None]:
from torch.utils.data import DataLoader

In [None]:



train_loader = DataLoader(train_dataset, batch_size=2, shuffle=False)

In [None]:
for i, batch in enumerate(train_loader):
    bools = len(batch['labels']) == len(batch['input_ids'])
    if not bools:
        print(i)
    if i == 74:
        print(len(batch))
        print(len(batch['input_ids']))
        print(len(batch['labels']))
        print(len(batch['attention_mask']))