In [None]:
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained("TurkuNLP/bert-base-finnish-cased-v1")

text="Tämä on yksi virke, tässä on yksi pitkäsana."
tokens = tokenizer.tokenize(text)
print(tokens)
labels = ["o", "o", "o", "B-Hob", "o"]

for i in range(0, len(tokens)):
    for j in range(i+1, len(tokens)):
        print(tokens[i:j])

In [None]:
from transformers import AutoTokenizer
from fuzzywuzzy import fuzz
import json

tokenizer = AutoTokenizer.from_pretrained("TurkuNLP/bert-base-finnish-cased-v1")

def load_json_file(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

def save_to_json_file(data, file_path):
    with open(file_path, 'w') as file:
        json.dump(data, file, indent=4, ensure_ascii=False)

def find_best_match(phrase, target_text, threshold):
    best_match = None
    highest_score = threshold
    for n in range(1, len(target_text.split()) + 1):
        ngrams = [' '.join(target_text.split()[i:i + n]) for i in range(len(target_text.split()) - n + 1)]
        for ngram in ngrams:
            score = fuzz.ratio(phrase.lower(), ngram.lower())
            if score > highest_score:
                best_match = ngram
                highest_score = score
    return best_match

def tokenize_and_label(source_text, entities, source_index):
    tokenized_text = tokenizer.tokenize(source_text)
    labels = ['o'] * len(tokenized_text)
    match_results = []

    for entity, label_prefix in entities:
        if entity:
            match = None
            found_threshold = None
            for threshold in range(90, 49, -10):
                match = find_best_match(entity, source_text, threshold)
                if match:
                    found_threshold = threshold
                    match_results.append({"index": source_index, "entity": entity, "match": match, "threshold": threshold})
                    print(f"Matching entity '{entity}' with '{match}' at threshold {threshold}")
                    break

            if not match:
                match_results.append({"index": source_index, "entity": entity, "match": None, "threshold": None})

            if match:
                match_tokens = tokenizer.tokenize(match)
                match_start_idx = None

                for i in range(len(tokenized_text) - len(match_tokens) + 1):
                    if tokenized_text[i:i+len(match_tokens)] == match_tokens:
                        match_start_idx = i
                        break

                if match_start_idx is not None:
                    for idx in range(match_start_idx, match_start_idx + len(match_tokens)):
                        if tokenized_text[idx] not in [".", ",", ";", ":"]:
                            if idx == match_start_idx:
                                labels[idx] = f'B-{label_prefix}'
                            else:
                                labels[idx] = f'I-{label_prefix}'

    return tokenized_text, labels, match_results

def process_data(sourceData, apiResults, data_limit):
    processed_data = []
    all_match_results = []

    for source_record in sourceData[:data_limit]:
        source_text = source_record["source_text"]
        source_index = source_record["index"]
        api_record = apiResults[source_index]

        entities = [
            (person_hobby, "P-HOB") for person_hobby in api_record["api_response"].split("\nPersonHobbies: ")[1].split("\n")[0].split(", ")
        ] + [
            (spouse_hobby, "S-HOB") for spouse_hobby in api_record["api_response"].split("\nSpouseHobbies: ")[1].split("\n")[0].split(", ")
        ] + [
            (person_org, "P-ORG") for person_org in api_record["api_response"].split("\nPersonSocialOrgs: ")[1].split("\n")[0].split(", ")
        ] + [
            (spouse_org, "S-ORG") for spouse_org in api_record["api_response"].split("\nSpouseSocialOrgs: ")[1].split("\n")[0].split(", ")
        ]

        tokenized_text, labels, match_results = tokenize_and_label(source_text, entities, source_index)
        processed_data.append({"index": source_index, "tokenized_text": tokenized_text, "labels": labels})
        all_match_results.extend(match_results)

    # Sorting the match results so unmatched entities appear first
    all_match_results.sort(key=lambda x: (x["threshold"] is not None, x["threshold"], x["index"]))

    return processed_data, all_match_results

# Load data from JSON files
sourceData = load_json_file('data/parsed_siirtokarjalaiset.json')
apiResults = load_json_file('apiResponse/all_responses_5976.json')

processed_data, match_results = process_data(sourceData, apiResults, data_limit=200)

# Save the processed data
save_to_json_file(processed_data, 'nerTaggerTrainData123.json')

# Save the reordered match results
save_to_json_file(match_results, 'matchResults.json')



In [1]:
import json

def load_and_print_data(file_path, indexes=None):
    with open(file_path, 'r') as file:
        data = json.load(file)

    for idx, record in enumerate(data):
        if indexes is None or idx in indexes:
            print(f"Index: {idx}")
            print("Tokens and Labels:")
            for token, label in zip(record["tokenized_text"], record["labels"]):
                if label != 'o':  # Filter out tokens labeled as 'o'
                    print(f"Token: {token}, Label: {label}")

            print("\nAll Tokens:")
            print(record["tokenized_text"])
            print(record["labels"])
            print("\n-----------------\n")

# Example usage

file_path = 'nerTaggerTrainData123.json'
load_and_print_data(file_path, indexes=[0,1,2,3,4,5,6,7,8,9])  # Replace with the indexes you want to inspect
# To print all elements, call without the indexes parameter: load_and_print_data(file_path)


Index: 0
Tokens and Labels:
Token: kalastus, Label: B-P-HOB
Token: vesillä, Label: B-P-HOB
Token: liikkumista, Label: I-P-HOB
Token: käsitöitä, Label: B-S-HOB

All Tokens:
['maanviljelijä', ',', 'synt', '.', '28', '.', '11', '.', '-', '09', 'Hi', '##ito', '##lassa', '.', 'Puol', '.', 'Syl', '##vi', 'Maria', 'o', '.', 's', '.', 'Sch', '##v', '##var', '##z', ',', 'emäntä', ',', 'synt', '.', '30', '.', '6', '.', '-', '19', 'Hi', '##ito', '##lassa', '.', 'Avio', '##it', '.', '-', '44', '.', 'Poika', ':', 'Raimo', 'Ensi', '##o', '-', '46', 'Mun', '##sala', '.', 'Asuin', '##p', '.', 'Karjalassa', ':', 'Hi', '##ito', '##la', ',', 'Kilp', '##ola', '-', '40', ',', '42', '—', '44', '.', 'Muut', 'asuin', '##p', '.', ':', 'Park', '##an', '##o', '40', '—', '42', ',', 'Mun', '##sala', '44', '—', '47', ',', 'Ahl', '##aine', '##n', '47', '—', 'Aho', '##kkaat', 'asuvat', '42', 'ha', ':', 'n', 'suuru', '##isella', 'tilalla', ',', 'josta', 'on', 'viljel', '##tyä', '13', ',', '5', 'ha', ',', 'Tila', '##ll

In [None]:
def label_matching_sequence(tokenized_text, untokenized_string):
    labels = ['o'] * len(tokenized_text)

    # Reconstruct the text from tokens for comparison
    reconstructed_text = ''
    token_positions = []  # To store the start and end positions of each token in the reconstructed text

    for token in tokenized_text:
        start = len(reconstructed_text)
        if token.startswith('##'):
            reconstructed_text += token[2:]
        else:
            if reconstructed_text:  # Add a space if not the first token
                reconstructed_text += ' '
            reconstructed_text += token
        end = len(reconstructed_text)
        token_positions.append((start, end))

    # Find the start index of the untokenized string in the reconstructed text
    start_idx = reconstructed_text.find(untokenized_string)

    if start_idx != -1:
        end_idx = start_idx + len(untokenized_string)

        # Iterate over the tokens and label the ones that overlap with the untokenized string
        for i, (token_start, token_end) in enumerate(token_positions):
            if token_start < end_idx and token_end > start_idx:
                labels[i] = 'B' if token_start <= start_idx else 'I'

    return labels

# Testing the function
tokenized_text = ["""

'Rouva', 'Hoh', '##ent', '##hal', 'on', 'eläinsuojel', '##u', '##yhdistyksen', 'perustaja', '-', 'ja', 'kunnia', '##jäsen', 'sekä', 'Pietarsaaren', 'Eläinten', '##ys', '-', 'tä', '##vät', '-', 'nimisen', 'lehden', 'perustaja'

"""]

untokenized_string = "Pietarsaaren Eläintenystävä-lehden perustaja"

labels = label_matching_sequence(tokenized_text, untokenized_string)

# Print tokens and their corresponding labels
tokens_and_labels = [(token, label) for token, label in zip(tokenized_text, labels)]
tokens_and_labels


In [2]:
from transformers import AutoTokenizer
import Levenshtein
import json
import re

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("TurkuNLP/bert-base-finnish-cased-v1")

# Function to load JSON data
def load_json_file(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

# Function to save data to JSON file
def save_to_json_file(data, file_path):
    with open(file_path, 'w') as file:
        json.dump(data, file, indent=4, ensure_ascii=False)

# Function to reconstruct text from tokenized form
def reconstruct_text(tokenized_text):
    reconstructed = ""
    for token in tokenized_text:
        if token.startswith("##"):
            reconstructed += token[2:]
        else:
            reconstructed += ' ' + token
    return reconstructed.strip()

# Function to find the best matches for a phrase within a target text
def find_best_matches_char_basis(phrase, target_text, min_similarity_ratio):
    matches = []
    for start_idx in range(len(target_text)):
        for end_idx in range(start_idx + 1, len(target_text) + 1):
            substring = target_text[start_idx:end_idx]
            distance = Levenshtein.distance(phrase, substring)
            max_len = max(len(phrase), len(substring))
            ratio = 1 - (distance / max_len) if max_len > 0 else 0
            if ratio >= min_similarity_ratio:
                matches.append((substring, ratio, start_idx, end_idx))

    matches.sort(key=lambda x: x[1], reverse=True)  # Sort by ratio descending

    return matches


# Function to label matching sequences
def label_matching_sequence(tokenized_text, untokenized_string, labels, label_prefix, reconstructed_text):
    pattern = re.compile(r'\s*'.join(re.escape(word) for word in untokenized_string.split()), re.IGNORECASE)
    matches = pattern.finditer(reconstructed_text)
    for match in matches:
        start_idx, end_idx = match.span()
        print(f"Entity '{untokenized_string}' found at positions {start_idx} to {end_idx} in reconstructed text.")

        # Check if all tokens in the match are currently unlabeled ('o')
        all_unlabeled = True
        for i, token in enumerate(tokenized_text):
            token_start = len(reconstruct_text(tokenized_text[:i]))
            token_end = token_start + len(token.replace("##", ""))
            if token_start < end_idx and token_end > start_idx and labels[i] != 'o':
                all_unlabeled = False
                break

        # Apply labels if all tokens are unlabeled
        if all_unlabeled:
            found_start = False
            for i, token in enumerate(tokenized_text):
                token_start = len(reconstruct_text(tokenized_text[:i]))
                token_end = token_start + len(token.replace("##", ""))
                if token_start < end_idx and token_end > start_idx:
                    label = 'B-' + label_prefix if not found_start else 'I-' + label_prefix
                    found_start = True
                    labels[i] = label
                    print(f"   Token: '{token}' labeled as '{label}'")
            return labels  # Return after successful labeling

    print(f"No suitable match found for '{untokenized_string}' within confidence threshold.")
    return labels



# Main function to process data
def process_data(sourceData, apiResults, start_index, end_index, max_distance_threshold):
    processed_data = []
      
    for source_record in sourceData:
        source_index = source_record["index"]
        # Process records within the specified index range
        if start_index <= source_index <= end_index:
            primary_person_name = source_record["primary_person_name"]
            source_text = primary_person_name + ", " + source_record["source_text"]
            api_record = apiResults[source_index]
            entities = [
                (person_hobby, "P-HOB") for person_hobby in api_record["api_response"].split("\nPersonHobbies: ")[1].split("\n")[0].split(", ")
            ] + [
                (spouse_hobby, "S-HOB") for spouse_hobby in api_record["api_response"].split("\nSpouseHobbies: ")[1].split("\n")[0].split(", ")
            ] + [
                (person_org, "P-ORG") for person_org in api_record["api_response"].split("\nPersonSocialOrgs: ")[1].split("\n")[0].split(", ")
            ] + [
                (spouse_org, "S-ORG") for spouse_org in api_record["api_response"].split("\nSpouseSocialOrgs: ")[1].split("\n")[0].split(", ")
            ]
            
            entities.sort(key=lambda x: len(x[0]), reverse=True)
            # Tokenize and reconstruct text
            tokenized_text = tokenizer.tokenize(source_text)
            reconstructed_text = reconstruct_text(tokenized_text)
            print(f"Reconstructed text: {reconstructed_text}")
            # Initialize labels and processed tokens
            labels = ['o'] * len(tokenized_text)
            processed_tokens = [False] * len(tokenized_text)
            print(f"Entities found: {entities}")
            # Find matches and update labels
            for entity, label_prefix in entities:
                matches = find_best_matches_char_basis(entity, reconstructed_text, max_distance_threshold)
                for match, confidence, _, _ in matches:
                    print(f"Trying match '{match}' for '{entity}' with confidence: {confidence}")
                    new_labels = label_matching_sequence(tokenized_text, match, labels.copy(), label_prefix, reconstructed_text)
                    if new_labels != labels:  # Check if new labels were assigned
                        labels = new_labels
                        print(f"Matched '{match}' for '{entity}'")
                        break  # Stop after finding the first match that leads to labeling


            # Append processed data for the current record
            processed_data.append({
                "index": source_index, 
                "tokenized_text": tokenized_text, 
                "labels": labels
            })
            
    return processed_data

# Example usage of the script
max_distance_threshold = 0.6
sourceData = load_json_file('data/parsed_siirtokarjalaiset.json')
apiResults = load_json_file('apiResponse/all_responses_5976.json')

processed_data = process_data(sourceData, apiResults, 0, 5, max_distance_threshold)

save_to_json_file(processed_data, 'nerTaggerTrainData1234.json')



Reconstructed text: LAURI AHOKAS , maanviljelijä , synt . 28 . 11 . - 09 Hiitolassa . Puol . Sylvi Maria o . s . Schvvarz , emäntä , synt . 30 . 6 . - 19 Hiitolassa . Avioit . - 44 . Poika : Raimo Ensio - 46 Munsala . Asuinp . Karjalassa : Hiitola , Kilpola - 40 , 42 — 44 . Muut asuinp . : Parkano 40 — 42 , Munsala 44 — 47 , Ahlainen 47 — Ahokkaat asuvat 42 ha : n suuruisella tilalla , josta on viljeltyä 13 , 5 ha , Tilalla pidetään karjaa . Lauri Ahokas on sotilasarvoltaan korpraali . Hän palveli pst - miehe - nä ja merivartioasemalla vartiomiehenä talvisodan aikana . Hän harrastaa kalastusta ja vesillä liikkumista . Sylvi Ahokas harrastaa käsitöitä . Lauri Ahokkaan äiti , Hilda o . s . Viinanen , synt . v . - 89 Hiitolassa , asuu poikansa luona
Entities found: [('vesillä liikkuminen', 'P-HOB'), ('kalastus', 'P-HOB'), ('käsityöt', 'S-HOB'), ('', 'P-ORG'), ('', 'S-ORG')]
Trying match 'vesillä liikkumi' for 'vesillä liikkuminen' with confidence: 0.8421052631578947
Entity 'vesillä liikku

In [2]:
import json

def process_labels_and_remove_ner_tags(data):
    last_modification = None

    for item in data:
        # Remove the 'ner_tags' element if it exists
        if 'ner_tags' in item:
            del item['ner_tags']

        tokenized_text = item['tokenized_text']
        labels = item['labels']

        # Iterate through each token, starting from the second one
        for i in range(1, len(tokenized_text)):
            if tokenized_text[i].startswith("##"):
                prev_label = labels[i - 1]

                # If the previous token is a 'B-' label, change the current label to corresponding 'I-' label
                if prev_label.startswith("B-"):
                    old_label = labels[i]
                    labels[i] = "I-" + prev_label[2:]
                    last_modification = f"Last label update: Token '{tokenized_text[i]}', from '{old_label}' to '{labels[i]}'"

    return data, last_modification

def save_data_to_json(file_path, data):
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

def load_data_from_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Load your data
data = load_data_from_json('nerTaggerTrainData10000withNames.json')

# Process the data and get the last modification
processed_data, last_modification = process_labels_and_remove_ner_tags(data)

# Print the last modification
print(last_modification)

# Save the processed data to a JSON file
save_data_to_json('nerTaggerTrainData10000withNamesProcessed.json', processed_data)



Last label update: Token '##seuran', from 'I-S-ORG' to 'I-S-ORG'


In [8]:
# Define the label mapping
label_map = {'o': 0, 'B-P-ORG': 1, 'I-P-ORG': 2, 'B-P-HOB': 3, 'I-P-HOB': 4, 'B-S-ORG': 5, 'I-S-ORG': 6, 'B-S-HOB': 7, 'I-S-HOB': 8}


import json

def process_labels_and_remove_ner_tags(data, label_map):
    count = 0
    for item in data:
        # Print and remove the 'ner_tags' element if it exists
        if 'ner_tags' in item:
            #print("Before removing 'ner_tags':", item['ner_tags'])
            del item['ner_tags']
            #print("After removing 'ner_tags':", item)

        tokenized_text = item['tokenized_text']
        labels = item['labels']

        # Iterate through each token, starting from the second one
        for i in range(1, len(tokenized_text)):
            if tokenized_text[i].startswith("##"):
                prev_label = labels[i - 1]

                # If the previous token is a 'B-' label, change the current label to corresponding 'I-' label
                if prev_label.startswith("B-"):
                    old_label = labels[i]
                    labels[i] = "I-" + prev_label[2:]
                    count = count +1
                  #  print(f"Label updated for token '{tokenized_text[i]}': '{old_label}' to '{labels[i]}'")
    print(count)
    return data

def save_data_to_json(file_path, data):
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

def load_data_from_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Load your data
data = load_data_from_json('nerTaggerTrainData5900withNames.json')

# Process the data
processed_data = process_labels_and_remove_ner_tags(data, label_map)

# Save the processed data to a JSON file
save_data_to_json('nerTaggerTrainData5900withNamesProcessed.json', processed_data)




13713


In [2]:
import random

def print_random_stories(data, num_stories=3):
    # Select random stories
    random_stories = random.sample(data, num_stories)

    for story in random_stories:
        tokenized_text = story['tokenized_text']
        labels = story['labels']

        # Check if lengths match
        if len(tokenized_text) != len(labels):
            print("Warning: Mismatched lengths of tokens and labels.")
            continue

        print("Story:")
        for token, label in zip(tokenized_text, labels):
            print(f"{token} ({label})", end=' ')
        print("\n" + "-"*50)

# Load your data
data = load_data_from_json('nerTaggerTrainData5900withNamesProcessed.json')

# Print random stories
print_random_stories(data, num_stories=5)


Story:
Anni (o) Luu (o) ##kkonen (o) , (o) talo (o) ##llisen (o) tytär (o) , (o) synt (o) . (o) 7 (o) . (o) 8 (o) . (o) - (o) 97 (o) Pyhä (o) ##järvellä (o) . (o) Asuin (o) ##p (o) . (o) Karjalassa (o) : (o) Pyhä (o) ##järvi (o) V (o) ##pl (o) . (o) Kon (o) ##nit (o) ##sa (o) - (o) 39 (o) , (o) 42 (o) — (o) 44 (o) . (o) Muut (o) asuin (o) ##p (o) . (o) : (o) Ala (o) ##vus (o) . (o) Vil (o) ##ppula (o) - (o) 44 (o) , (o) Hämeen (o) ##kyr (o) ##ö (o) 46 (o) — (o) . (o) Anni (o) Luu (o) ##kkonen (o) asuu (o) veljensä (o) . (o) Heikin (o) luona (o) Hän (o) on (o) harras (o) uskovainen (o) , (o) kuulu (o) ##en (o) la (B-P-ORG) ##estadio (I-P-ORG) ##laisiin (I-P-ORG) . (o) Hänen (o) harrastuksen (o) ##aan (o) ovat (o) pääasiassa (o) käsityö (B-P-HOB) ##t (I-P-HOB) , (o) Anni (o) Luu (o) ##kkosen (o) vanhemmat (o) Heikki (o) ja (o) Var (o) ##pu (o) Luu (o) ##kkonen (o) ovat (o) kuolleet (o) , (o) isä (o) Hämeen (o) ##kyr (o) ##össä (o) ja (o) äiti (o) Savonlinnassa (o) eva (o) ##kko (o) ##mat

In [2]:
import json

def check_labels(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    inconsistent_entries = []

    for entry in data:
        tokens = entry['tokenized_text']
        labels = entry['labels']
        
        if len(tokens) != len(labels):
            inconsistent_entries.append((tokens, labels, 'Mismatched lengths'))
            continue

        current_label = None
        for i, (token, label) in enumerate(zip(tokens, labels)):
            if label.startswith('B-') or label == 'o':
                current_label = label
            elif label.startswith('I-') and current_label != label.replace('I-', 'B-'):
                inconsistent_entries.append((tokens, labels, f'Inconsistent labeling at token {i}: {token}'))
                break

    return inconsistent_entries

# Use the function like this
inconsistencies = check_labels('nerTaggerTrainData5900withNames.json')
print(inconsistencies)


[]
