# Imports

This cell contains all the required **libraries and dependencies**
needed for preprocessing, tokenization, and further analysis.

In [1]:
import os
import re
from concurrent.futures import ThreadPoolExecutor
from difflib import SequenceMatcher
from pathlib import Path

import pandas as pd
import regex
from conllu import parse


# General Functions

This cell defines the **helper functions** used throughout the notebook.
They provide reusable functionality to support preprocessing, tokenization,
and other tasks required in later steps.

In [2]:
def normalize_quotes(text):
    """Replace typographic quote characters in ``text`` with ASCII quotes.

    Two-character sequences that stand for a double quote (``‘’``, ``’‘`` and
    two backticks) are rewritten first, so they become a single ``"`` instead
    of being consumed one character at a time and turned into ``''``.

    Args:
        text: String to normalize.

    Returns:
        ``text`` with the handled characters replaced by ASCII ``"`` or ``'``.
    """
    # Order matters: the rules are applied one after another with str.replace,
    # so every multi-character sequence must come before its single characters.
    replacements = (
        ("‘’", '"'),
        ("’‘", '"'),
        ("``", '"'),
        ('“', '"'),
        ('”', '"'),
        ('‘', "'"),
        ('’', "'"),
        ("`", "'"),
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text


def is_punctuation(text):
    """Return True when ``text`` contains no alphanumeric character.

    An empty string also yields True, because it has no alphanumeric
    character either.
    """
    for char in text:
        if char.isalnum():
            return False
    return True


def is_number(word: str) -> bool:
    """Tell whether ``word`` consists of one or more digits and nothing else."""
    digits_only = re.fullmatch(r'\d+', word)
    return digits_only is not None


def clean_word(word: str) -> str:
    """Drop every character of ``word`` that is not a Unicode letter."""
    non_letter = regex.compile(r'[^\p{L}]')
    return non_letter.sub('', word)


def clean_number(word: str) -> str:
    """Keep only the ASCII digits ``0``-``9`` of ``word``, in their original order."""
    non_digit = re.compile(r'[^0-9]')
    return non_digit.sub('', word)

In [None]:
def count_sentences_pos(conllu_dir, num_parts=10):
    """Count the sentences in every ``.conllu`` file below ``conllu_dir``.

    Files are looked up in the sub-directories ``1Part`` ... ``<num_parts>Part``;
    sub-directories that do not exist are skipped.

    Args:
        conllu_dir: ``pathlib.Path`` of the directory holding the ``<i>Part``
            folders.
        num_parts: Highest part number to visit. Defaults to 10, the same
            range the other per-part loops in this notebook iterate over.

    Returns:
        Total number of sentences returned by ``parse`` over all files. A file
        that cannot be read or parsed is reported on stdout and counted as 0.
    """
    def count_sentences_in_file(file_path):
        try:
            data = Path(file_path).read_text(encoding="utf-8")
            return len(parse(data))
        except Exception as e:
            # Best effort: one broken file must not abort the whole count.
            print(f"Error processing {file_path}: {e}")
            return 0

    total_sentences = 0
    for i in range(1, num_parts + 1):
        subdir = conllu_dir / f"{i}Part"
        # is_dir() is already False for a path that does not exist.
        if subdir.is_dir():
            total_sentences += sum(
                count_sentences_in_file(conllu_file)
                for conllu_file in subdir.glob("*.conllu")
            )

    return total_sentences


def count_sentences_ner(file_path):
    """Count the sentences of a blank-line separated NER file.

    A sentence is a run of consecutive non-blank lines containing at least one
    line with two or more whitespace-separated fields. This is the same rule
    ``read_ner_data_and_assign_ids`` applies, so repeated blank lines are not
    counted as extra sentences and a final sentence without a trailing blank
    line is still counted.

    Args:
        file_path: Path (or path string) of the NER file, read as UTF-8.

    Returns:
        Number of sentences, or 0 if the file cannot be processed (the error is
        printed).
    """
    try:
        sentence_count = 0
        in_sentence = False  # True once the current run has produced a token
        with Path(file_path).open(encoding="utf-8") as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Blank line: the current run (if any) ends here.
                    in_sentence = False
                elif len(fields) >= 2 and not in_sentence:
                    sentence_count += 1
                    in_sentence = True
        return sentence_count
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return 0

In [None]:
def read_ner_data_and_assign_ids(file_path):
    """Load a blank-line separated NER file as numbered sentences.

    Each non-blank line with at least two whitespace-separated fields gives
    one ``(token, label)`` pair, taken from its first and last field. Blank
    lines close the current sentence; sentences are numbered from 1 in file
    order.

    Args:
        file_path: Path (or path string) of the NER file, read as UTF-8.

    Returns:
        List of ``(sentence_id, [(token, label), ...])`` tuples, or an empty
        list if the file cannot be processed (the error is printed).
    """
    numbered = []
    pending = []
    try:
        with Path(file_path).open(encoding="utf-8") as handle:
            for raw_line in handle.readlines():
                fields = raw_line.split()
                if not fields:
                    # Blank line: flush the sentence collected so far, if any.
                    if pending:
                        numbered.append((len(numbered) + 1, pending))
                        pending = []
                    continue
                if len(fields) > 1:
                    pending.append((fields[0], fields[-1]))

        # The last sentence may not be followed by a blank line.
        if pending:
            numbered.append((len(numbered) + 1, pending))

        return numbered
    except Exception as err:
        print(f"Error processing {file_path}: {err}")
        return []


def read_pos_data_and_assign_ids(conllu_dir, num_parts=10):
    """Load every ``.conllu`` file below ``conllu_dir`` as numbered sentences.

    Files are read from the sub-directories ``1Part`` ... ``<num_parts>Part``
    (missing ones are skipped). Within a part, files are visited in natural
    filename order (``2.conllu`` before ``10.conllu``), so the sentence ids do
    not depend on the order in which the filesystem lists them.

    Args:
        conllu_dir: ``pathlib.Path`` of the directory holding the ``<i>Part``
            folders.
        num_parts: Highest part number to visit. Defaults to 10, the same
            range the other per-part loops in this notebook iterate over.

    Returns:
        List of ``(sentence_id, tokens)`` tuples with ids starting at 1. Each
        token is the tuple ``(form, lemma, upostag, feats, head, deprel, deps,
        misc)``, with ``form`` and ``lemma`` passed through
        ``normalize_quotes``. A file that fails to load is reported on stdout
        and skipped.
    """
    def natural_order(path):
        # Compare digit runs as integers and everything else case-insensitively.
        return [int(piece) if piece.isdigit() else piece.lower()
                for piece in re.split(r'(\d+)', path.name)]

    sentences = []
    sentence_id = 1

    for i in range(1, num_parts + 1):
        subdir = conllu_dir / f"{i}Part"
        if not subdir.is_dir():
            continue
        for conllu_file in sorted(subdir.glob("*.conllu"), key=natural_order):
            try:
                data = conllu_file.read_text(encoding="utf-8")
                for sent in parse(data):
                    tokens = [
                        (normalize_quotes(token['form']),
                         normalize_quotes(token['lemma']),
                         token['upostag'],
                         token['feats'],
                         token['head'],
                         token['deprel'],
                         token['deps'],
                         token['misc'])
                        for token in sent
                    ]
                    sentences.append((sentence_id, tokens))
                    sentence_id += 1
            except Exception as e:
                # Best effort: report the file and continue with the next one.
                print(f"Error processing {conllu_file}: {e}")

    return sentences

In [None]:
# Input locations, given relative to the notebook's working directory.
# ner_path is the single NER corpus file passed to the NER helpers below.
ner_path = Path("../../Corpus/korpusi.txt")
# pos_path is the directory passed to the POS helpers, which look for
# "<i>Part" sub-directories containing *.conllu files inside it.
pos_path = Path("../../Conllu Files in Corpus/")

In [None]:
# Count the sentences of the NER file and of the CoNLL-U directory.
# Both helpers print an error and count 0 for a file they cannot process.
ner_sentences_count = count_sentences_ner(ner_path)
pos_sentences_count = count_sentences_pos(pos_path)

In [None]:
# Show both totals, one per line.
print(f"NER sentences: {ner_sentences_count}")
print(f"POS sentences: {pos_sentences_count}")

In [None]:
# Load the NER corpus as (sentence_id, [(token, label), ...]) tuples and show the first one.
# read_ner_data_and_assign_ids returns [] when the file cannot be processed,
# in which case the indexing below raises IndexError.
ner_sentences = read_ner_data_and_assign_ids(ner_path)
print(ner_sentences[0])

In [None]:
# Load the CoNLL-U files as (sentence_id, [token tuple, ...]) tuples and show the first one.
# The list is empty when no *.conllu file could be loaded, in which case the
# indexing below raises IndexError.
pos_sentences = read_pos_data_and_assign_ids(pos_path)
print(pos_sentences[0])

# Initial Tokenization

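This section performs a first **tokenization** pass over the raw text files of a single corpus part (the code below reads the `7Part` directory).
Every file in that directory is processed, even if its tokens do not align perfectly with the corpus; tokens that do not align are tagged `O`.
These results will later be leveraged for an additional round of tokenization.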

In [None]:
def read_ner_corpus(path):
    """Read a tab-separated NER corpus file into ``[word, tag]`` pairs.

    Blank lines and lines with fewer than two non-empty tab-separated fields
    are skipped. Spaces inside the first field are removed, and the second
    field is used as the tag.

    Args:
        path: Path (or path string) of the corpus file, read as UTF-8.

    Returns:
        List of two-element lists ``[word, tag]`` in file order.
    """
    pairs = []

    with Path(path).open(encoding="utf-8") as handle:
        for raw_line in handle:
            fields = [piece.strip() for piece in raw_line.strip().split("\t")]
            fields = [piece for piece in fields if piece]
            if len(fields) < 2:
                continue

            word = fields[0].replace(" ", "")
            if word:  # only keep non-empty words
                pairs.append([word, fields[1]])

    return pairs


def _split_on_punctuation(token):
    """Split one whitespace-free token into word pieces and single punctuation characters.

    Alphanumeric characters and ``-`` are accumulated into word pieces; every
    other character becomes a piece of its own.
    """
    pieces = []
    current = ""
    for char in token:
        if char.isalnum() or char == '-':
            current += char
        else:
            if current:
                pieces.append(current)
                current = ""
            pieces.append(char)
    if current:
        pieces.append(current)
    return pieces


def read_text_files_and_match(text_dir: Path, ner_words, output_dir: Path,
                              part: str = "7Part", out_part: str = "7Part2"):
    """Tokenize the text files of one corpus part and tag them from ``ner_words``.

    The ``*.txt`` files in ``text_dir / part`` are processed in ascending order
    of the integer before the first ``_`` of their file name. Every non-blank
    line is tokenized, and each token is compared case-insensitively with the
    corpus entry at a single running position. On a match the token receives
    that entry's tag and the position advances; otherwise the token is tagged
    ``O``, a mismatch message is printed and the position stays where it is.
    One ``token<TAB>tag`` line is written per token, with an empty line after
    each input line, to ``output_dir / out_part / <file name>``.

    Args:
        text_dir: Directory containing the part folders.
        ner_words: List of ``[word, tag]`` pairs; it is not modified.
        output_dir: Directory that receives the output part folder (created if
            missing).
        part: Name of the input sub-directory. Defaults to ``"7Part"``.
        out_part: Name of the output sub-directory. Defaults to ``"7Part2"``.

    Returns:
        The corpus entries from the final running position onwards, i.e. the
        entries that were not consumed by a match.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    pos_index = 0  # running position in ner_words, shared across all files
    subdir = text_dir / part
    if subdir.is_dir():
        files = sorted(subdir.glob("*.txt"), key=lambda f: int(f.stem.split("_")[0]))
        out_subdir = output_dir / out_part

        for txt_file in files:
            print(f"\nProcessing {txt_file} ...")

            try:
                with txt_file.open(encoding="utf-8") as f:
                    raw_sentences = [line.strip() for line in f if line.strip()]

                matched_lines = []

                for raw in raw_sentences:
                    all_tokens = [
                        piece
                        for token in raw.split()
                        for piece in _split_on_punctuation(token)
                    ]

                    for token in all_tokens:
                        if pos_index < len(ner_words):
                            corpus_word, tag = ner_words[pos_index]

                            if token.lower() == corpus_word.lower():
                                matched_lines.append(f"{token}\t{tag}")
                                pos_index += 1
                            else:
                                matched_lines.append(f"{token}\tO")
                                print(f"Mismatch: token '{token}' vs corpus '{corpus_word}' at position {pos_index}")
                        else:
                            # Corpus exhausted: remaining tokens are untagged.
                            matched_lines.append(f"{token}\tO")

                    # Empty line marks the end of the input line.
                    matched_lines.append("")

                out_subdir.mkdir(parents=True, exist_ok=True)
                out_file = out_subdir / txt_file.name
                with out_file.open("w", encoding="utf-8") as f:
                    f.write("\n".join(matched_lines))

                print(f" -> Wrote {len(matched_lines)} lines to {out_file}")

            except Exception as e:
                # Best effort: report the file and continue with the next one.
                print(f"Error processing {txt_file}: {e}")

    return ner_words[pos_index:]

In [None]:
# Locations used by the initial tokenization, relative to the working directory.
ner_path = Path("../../Corpus/korpusi.txt")
text_dir = Path("../../Text Files in Corpus/")
output_dir = Path("../../Matched Text Files/")

# Load the corpus as [word, tag] pairs, then tag the text files against it.
# `remaining` holds the corpus entries that were not consumed by a match.
ner_words = read_ner_corpus(ner_path)
remaining = read_text_files_and_match(text_dir, ner_words, output_dir)

print(f"\nDone. Remaining corpus tokens: {len(remaining)}")

# Final Tokenization

This cell performs the **final round of tokenization**.
Each token extracted from the corpus is written to its respective file.
This ensures a complete and organized representation of the data.

In [None]:
def get_corpus_data(path):
    """Read a tab-separated token file into ``[word, tag]`` pairs.

    In every line the sequence tab, tab, apostrophe is first replaced by a
    single tab. The line is then split on tabs, empty fields are dropped and
    the remaining fields are stripped and passed through ``normalize_quotes``.
    Lines with fewer than two fields are skipped.

    Args:
        path: Path (or path string) of the file, read as UTF-8.

    Returns:
        List of two-element lists holding the first two fields of each
        accepted line.
    """
    rows = []
    with Path(path).open(encoding="utf-8") as handle:
        for raw_line in handle:
            patched = raw_line.replace("\t\t'", "\t")
            columns = []
            for cell in patched.split("\t"):
                cell = cell.strip()
                if cell:
                    columns.append(normalize_quotes(cell))
            if len(columns) > 1:
                rows.append(columns[:2])
    return rows


def fix_the_ner_data_per_file(text_dir: Path, corpus_tokens, output_dir: Path):
    """Rewrite the per-file token lists so that they carry the corpus tokens and tags.

    The ``*.txt`` files of ``text_dir / "<i>Part"`` (i = 1..10) are visited in
    ascending order of the integer before the first ``_`` of their file name.
    A single running position in ``corpus_tokens`` is shared by all files. A
    file token that equals the current corpus word (case-insensitively) emits
    ``corpus_word<TAB><TAB>corpus_tag`` and advances the position. Otherwise
    the file token is appended to a reconstruction buffer, which emits the same
    line once it equals the corpus word. The buffer is cleared as soon as it
    is no longer a case-insensitive prefix of the corpus word.

    Processing stops at the first file that yields no line at all (a
    ``STOPPED`` message is printed). Otherwise the lines of each file are
    written to ``output_dir / "<i>Part" / <file name>``.

    Args:
        text_dir: Directory containing the ``<i>Part`` input folders.
        corpus_tokens: List of ``[word, tag]`` pairs; it is not modified.
        output_dir: Directory that receives the output folders (created if
            missing).

    Returns:
        None. Progress and the number of unconsumed corpus tokens are printed.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    corpus_index = 0  # track position in corpus

    for i in range(1, 11):
        subdir = text_dir / f"{i}Part"
        if not subdir.is_dir():
            continue

        files = sorted(subdir.glob("*.txt"), key=lambda f: int(f.stem.split("_")[0]))
        for file in files:
            fixed_lines = []
            recon_word = ""

            file_ner_words = get_corpus_data(file)

            for ner_word_info in file_ner_words:
                if corpus_index >= len(corpus_tokens):
                    break

                ner_word = normalize_quotes(ner_word_info[0])
                corpus_word, corpus_tag = corpus_tokens[corpus_index]
                corpus_word = normalize_quotes(corpus_word)
                target = corpus_word.lower()

                if ner_word.lower() == target:
                    fixed_lines.append(f"{corpus_word}\t\t{corpus_tag}")
                    corpus_index += 1
                    recon_word = ""
                    continue

                recon_word += ner_word
                if recon_word.lower() == target:
                    fixed_lines.append(f"{corpus_word}\t\t{corpus_tag}")
                    corpus_index += 1
                    recon_word = ""
                elif not target.startswith(recon_word.lower()):
                    # Compare case-insensitively, like the equality checks
                    # above, so fragments that differ only in case are kept.
                    recon_word = ""

            if len(fixed_lines) == 0:
                if corpus_index < len(corpus_tokens):
                    corpus_word, corpus_tag = corpus_tokens[corpus_index]
                    print(f"STOPPED: No matches found in file {file.name}. Next corpus word: {corpus_word}")
                else:
                    print(f"STOPPED: No matches found in file {file.name}. Corpus exhausted.")
                return

            out_subdir = output_dir / f"{i}Part"
            out_subdir.mkdir(parents=True, exist_ok=True)
            out_file = out_subdir / file.name
            with out_file.open("w", encoding="utf-8") as f:
                f.write("\n".join(fixed_lines))

            print(f"Wrote {len(fixed_lines)} lines to {out_file}")

    print(f"Done. Remaining corpus tokens: {len(corpus_tokens) - corpus_index}")


In [None]:
# Locations used by the final tokenization, relative to the working directory.
ner_path = Path("../../Corpus/korpusi.txt")
# NOTE(review): the initial tokenization above writes to "../../Matched Text Files/";
# confirm that this differently named directory is the intended input.
text_dir = Path("../../Matched Text Files2/")
output_dir = Path("../../Replaced Text Files/")

In [None]:
# Reload the corpus with get_corpus_data (quotes normalized) and rewrite the
# per-file token lists from it. Note that this rebinds `ner_words`.
ner_words = get_corpus_data(ner_path)
fix_the_ner_data_per_file(text_dir, ner_words, output_dir)

# Dataset Matching

This cell handles the **alignment and matching** of datasets.
It ensures that the data sources are properly synchronized
before moving on to tokenization and further processing.

In [3]:
def fuzzy_ratio(a, b):
    """Compute the ``difflib.SequenceMatcher`` similarity ratio of ``a`` and ``b``."""
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()


def strip_punct(text):
    """Remove leading and trailing runs of non-word characters from ``text``."""
    edge_non_word = re.compile(r'^[^\w]+|[^\w]+$')
    return edge_non_word.sub('', text)


def strip_punct_alternative(text):
    """Strip punctuation, bracket and quotation characters from both ends of ``text``.

    The stripped set is ``string.punctuation`` plus the extra characters
    listed in the literal below; characters inside the text are left alone.
    """
    import string
    strippable = string.punctuation + '()[]{}""''«»‹›„"‚'
    return text.strip(strippable)


def split_punct_tokens(tokens):
    """Split tokens whose ``WORD`` mixes word characters and punctuation.

    ``WORD`` is cut into runs of word characters and single punctuation
    characters. A token that yields exactly one piece is passed through
    unchanged (same object). A token that yields several pieces is replaced by
    one copy per piece:

    * punctuation piece: ``LEMMA`` is the piece, ``POS_TAG`` is ``"PUNCT"``,
      ``FEATS`` is the string ``"None"`` and ``DEPREL`` is ``"punct"``;
    * all-digit piece: ``LEMMA`` is the piece, ``POS_TAG`` is ``"NUM"``,
      ``FEATS`` is the string ``"{'NumType': 'Card'}"`` and ``DEPREL`` is the
      original value (``"nummod"`` if the token has none);
    * any other piece: ``LEMMA`` is the original lemma with edge punctuation
      stripped; the other fields are inherited.

    A token whose ``WORD`` yields no piece at all (for example only
    whitespace) produces no output token.

    Args:
        tokens: Iterable of dict-like tokens with a ``WORD`` entry and an
            optional ``LEMMA`` entry (``WORD`` is used when it is missing).

    Returns:
        New list of tokens; the input tokens themselves are not modified.
    """
    result = []
    for token in tokens:
        surface = str(token['WORD'])
        lemma = str(token.get('LEMMA', surface))

        pieces = re.findall(r'\w+|[^\w\s]', surface)

        if len(pieces) == 1:
            result.append(token)
            continue

        for piece in pieces:
            piece_token = token.copy()
            piece_token['WORD'] = piece

            if re.match(r'[^\w\s]', piece):
                piece_token['LEMMA'] = piece
                piece_token['POS_TAG'] = "PUNCT"
                piece_token['FEATS'] = "None"
                piece_token['DEPREL'] = "punct"
            elif piece.isdigit():
                piece_token['LEMMA'] = piece
                piece_token['POS_TAG'] = "NUM"
                piece_token['FEATS'] = "{'NumType': 'Card'}"
                piece_token['DEPREL'] = token.get('DEPREL', "nummod")
            else:
                piece_token['LEMMA'] = strip_punct(lemma)

            result.append(piece_token)

    return result


def normalize_for_matching(tokens):
    """Join the ``WORD`` values of ``tokens`` into one whitespace-normalized string.

    Each word is converted to ``str`` and stripped, the words are joined with
    single spaces, every remaining whitespace run is collapsed to one space
    and the result is stripped.
    """
    words = (str(token['WORD']).strip() for token in tokens)
    joined = ' '.join(words)
    return re.sub(r'\s+', ' ', joined).strip()


def fuzzy_ratio(a, b):
    """Return the ``SequenceMatcher`` similarity ratio of ``a`` and ``b``."""
    # NOTE(review): this repeats the `fuzzy_ratio` defined at the top of this
    # cell with the same behavior; the later definition is the one in effect.
    comparison = SequenceMatcher(None, a, b)
    similarity = comparison.ratio()
    return similarity


In [4]:
def get_ner_data(path):
    """Read a tab-separated NER file into ``{"WORD", "NER_TAG"}`` dicts.

    In every line the sequence tab, tab, apostrophe is first replaced by a
    single tab. The line is then split on tabs, empty fields are dropped and
    the remaining fields are stripped and passed through ``normalize_quotes``.
    Lines with fewer than two fields are skipped.

    Args:
        path: Path (or path string) of the file, read as UTF-8.

    Returns:
        List of dicts with the first field as ``"WORD"`` and the second as
        ``"NER_TAG"``, in file order.
    """
    records = []
    with Path(path).open(encoding="utf-8") as handle:
        for raw_line in handle:
            fields = raw_line.replace("\t\t'", "\t").split("\t")
            cleaned = [normalize_quotes(field.strip()) for field in fields if field.strip()]
            if len(cleaned) < 2:
                continue
            word, tag = cleaned[0], cleaned[1]
            records.append({"WORD": word, "NER_TAG": tag})
    return records


def get_pos_data(path):
    """Read one CoNLL-U file into a flat list of token dicts.

    The file is parsed with ``parse`` and the tokens of all sentences are
    returned in order, without sentence boundaries.

    Args:
        path: Path (or path string) of the ``.conllu`` file, read as UTF-8.

    Returns:
        List of dicts with the keys ``WORD``, ``LEMMA``, ``POS_TAG``,
        ``FEATS``, ``HEAD``, ``DEPREL``, ``DEPS`` and ``MISC``. ``WORD`` and
        ``LEMMA`` are passed through ``normalize_quotes``.
    """
    raw_conllu = Path(path).read_text(encoding="utf-8")

    return [
        {
            "WORD": normalize_quotes(token["form"]),
            "LEMMA": normalize_quotes(token["lemma"]),
            "POS_TAG": token.get("upostag"),
            "FEATS": token.get("feats"),
            "HEAD": token.get("head"),
            "DEPREL": token.get("deprel"),
            "DEPS": token.get("deps"),
            "MISC": token.get("misc"),
        }
        for sentence in parse(raw_conllu)
        for token in sentence
    ]

In [5]:
def align_ner_to_pos_dp_split(ner_data, pos_data, threshold=0.8, max_span=1):
    """Align NER tokens to POS tokens with a dynamic-programming table and merge them.

    Both inputs are first passed through ``split_punct_tokens``, and a dummy
    token is put in front of the POS tokens. A NER token counts as a match for
    ``span`` consecutive POS tokens when ``fuzzy_ratio`` between its stripped
    word and the ``normalize_for_matching`` text of those POS tokens is at
    least ``threshold``. The table keeps, per cell, the highest number of
    matches found, and the alignment is read back from the last cell.

    Args:
        ner_data: List of dicts with at least ``WORD`` and ``NER_TAG``.
        pos_data: List of dicts with ``WORD``, ``LEMMA``, ``POS_TAG``,
            ``FEATS``, ``HEAD``, ``DEPREL``, ``DEPS`` and ``MISC``.
        threshold: Minimum similarity ratio for a match.
        max_span: Default maximum number of POS tokens one NER token may cover;
            it can be raised per cell by the adaptive rule below.

    Returns:
        List of dicts, one per aligned pair, in token order. ``WORD`` is the
        concatenation of the matched POS words, ``NER_TAG`` comes from the NER
        token, ``LEMMA`` is the joined POS lemmas, and the remaining fields are
        the stringified values of the first matched POS token. NER tokens
        without a match are not part of the result; those recorded during
        backtracking are printed.
    """
    ner_data_split = split_punct_tokens(ner_data)
    pos_data_split = split_punct_tokens(pos_data)
    # Placeholder entry put in front of the POS tokens, so the real POS tokens
    # start at index 1 of pos_data_split.
    dummy_token = {
        'WORD': '__DUMMY__',
        'POS_TAG': 'X',
        'LEMMA': '__DUMMY__',
        'FEATS': '_',
        'HEAD': '_',
        'DEPREL': '_',
        'DEPS': '_',
        'MISC': '_'
    }
    pos_data_split = [dummy_token] + pos_data_split

    n = len(ner_data_split)
    m = len(pos_data_split)  # includes the dummy token

    # dp[i][j]: best number of matches found so far for that cell.
    # back[i][j]: where that value came from - a 2-tuple (i, j) for a skip,
    # a 3-tuple (i - 1, j, span) for a match.
    dp = [[0] * (m + 1) for _ in range(n + 1)]
    back = [[None] * (m + 1) for _ in range(n + 1)]

    def get_adaptive_max_span(ner_word, pos_start_idx):
        # For a NER word that is alphanumeric (ignoring spaces) and at most 10
        # characters long, count the consecutive POS words starting at
        # pos_start_idx (at most 6 are inspected) that are alphanumeric and at
        # most 3 characters long. Two or more of them allow a span of up to 5;
        # otherwise the default max_span applies.
        base_span = max_span
        if ner_word.replace(' ', '').isalnum() and len(ner_word) <= 10:
            consecutive_short = 0
            for k in range(pos_start_idx, min(pos_start_idx + 6, m)):
                pos_word = str(pos_data_split[k]['WORD']).strip()
                if len(pos_word) <= 3 and (pos_word.isalnum() or pos_word.isdigit()):
                    consecutive_short += 1
                else:
                    break
            if consecutive_short >= 2:
                return min(consecutive_short, 5)
        return base_span

    for i in range(n + 1):
        for j in range(m + 1):
            # Skip NER token i - 1: carry the score over from the row above.
            if i > 0 and dp[i][j] < dp[i - 1][j]:
                dp[i][j] = dp[i - 1][j]
                back[i][j] = (i - 1, j)
            # Skip a POS token: carry the score over from the column to the left.
            if j > 0 and dp[i][j] < dp[i][j - 1]:
                dp[i][j] = dp[i][j - 1]
                back[i][j] = (i, j - 1)
            # Try to match NER token i - 1 against POS tokens j .. j + span - 1.
            if i > 0 and j < m:
                ner_word = str(ner_data_split[i - 1]['WORD']).strip()
                adaptive_max_span = get_adaptive_max_span(ner_word, j)
                for span in range(1, adaptive_max_span + 1):
                    if j + span > m:
                        break
                    candidate = normalize_for_matching(pos_data_split[j:j + span])
                    score = fuzzy_ratio(ner_word, candidate)
                    if score >= threshold:
                        # The result is stored in column j + span - 1 of row i.
                        # NOTE(review): for span == 1 this is the current cell
                        # dp[i][j] itself - confirm the indexing is intended.
                        match_score = dp[i - 1][j] + 1
                        if match_score > dp[i][j + span - 1]:
                            dp[i][j + span - 1] = match_score
                            back[i][j + span - 1] = (i - 1, j, span)
                        elif match_score == dp[i][j + span - 1] and span > 1:
                            # Tie: if the cell already points at a match,
                            # prefer the one covering more POS tokens.
                            if back[i][j + span - 1] and len(back[i][j + span - 1]) == 3:
                                current_span = back[i][j + span - 1][2]
                                if span > current_span:
                                    back[i][j + span - 1] = (i - 1, j, span)

    # Backtrack from the last cell until a cell without back pointer, row 0 or
    # column 0 is reached.
    i, j = n, m
    aligned_pairs = []
    unmatched_ner = []

    while i > 0 and j > 0:
        if back[i][j] is None:
            break
        prev = back[i][j]
        if len(prev) == 3:
            # Match step: pair the NER token with the POS tokens it covered.
            pi, pj, span = prev
            pos_tokens = pos_data_split[pj:pj + span]
            aligned_pairs.append((ner_data_split[pi], pos_tokens))
            i, j = pi, pj
        else:
            pi, pj = prev
            # A step up within the same column means NER token i - 1 was skipped.
            if pi == i - 1 and pj == j:
                unmatched_ner.append(ner_data_split[i - 1]['WORD'])
            i, j = pi, pj

    # Backtracking collected the pairs from last to first.
    aligned_pairs.reverse()
    data = []

    for ner_token, pos_tokens in aligned_pairs:
        combined_word = ''.join([t['WORD'] for t in pos_tokens])
        lemmas = [str(t['LEMMA']) for t in pos_tokens]
        # Several short alphanumeric POS tokens are glued together without
        # spaces; any other combination is joined with spaces.
        if len(pos_tokens) > 1 and all(
                len(str(t['WORD']).strip()) <= 3 and str(t['WORD']).strip().isalnum() for t in pos_tokens):
            combined_lemma = ''.join(lemmas)
        else:
            combined_lemma = ' '.join(lemmas)
        # All remaining annotation fields are taken from the first POS token.
        first_token = pos_tokens[0]
        combined = {
            "WORD": combined_word,
            "NER_TAG": ner_token["NER_TAG"],
            "POS_TAG": str(first_token['POS_TAG']),
            "LEMMA": combined_lemma,
            "FEATS": str(first_token['FEATS']),
            "HEAD": str(first_token['HEAD']),
            "DEPREL": str(first_token['DEPREL']),
            "DEPS": str(first_token['DEPS']),
            "MISC": str(first_token['MISC']),
        }
        data.append(combined)

    if unmatched_ner:
        print(f"Unmatched NER tokens ({len(unmatched_ner)}):", unmatched_ner)

    return data

In [6]:
def natural_key(filename):
    """Build a sort key that orders digit runs in ``filename`` numerically.

    The name is split into digit and non-digit chunks; digit chunks become
    integers and all other chunks are lower-cased.
    """
    key = []
    for chunk in re.split(r'(\d+)', filename):
        if chunk.isdigit():
            key.append(int(chunk))
        else:
            key.append(chunk.lower())
    return key


def extract_number_and_suffix(filename):
    """Extract the leading file number and an optional numeric suffix from ``filename``.

    Two naming schemes are recognized:

    * ``<num>.conllu`` or ``<num>_<suffix>.conllu``
    * ``<num>_data...headline<suffix>`` (the part after ``<num>`` is optional)

    Args:
        filename: File name to inspect (matched from its start).

    Returns:
        Tuple ``(number, suffix)``. ``suffix`` is 0 when absent, and ``(0, 0)``
        is returned when the name does not start with a number.
    """
    # The .conllu pattern is tried first: the headline pattern accepts any
    # name that starts with digits and would otherwise hide the .conllu suffix.
    match = re.match(r"(\d+)(?:_(\d+))?\.conllu", filename)
    if match is None:
        match = re.match(r"(\d+)(?:_data.*?headline(\d+))?", filename)
    if match is None:
        return (0, 0)

    main_num = int(match.group(1))
    suffix = int(match.group(2)) if match.group(2) else 0
    return (main_num, suffix)


def process_file_pair(ner_file, pos_file):
    """Load one NER file and one CoNLL-U file and align their tokens.

    Args:
        ner_file: Path of the NER token file.
        pos_file: Path of the matching ``.conllu`` file.

    Returns:
        The value returned by ``align_ner_to_pos_dp_split`` for the two files.
    """
    aligned = align_ner_to_pos_dp_split(get_ner_data(ner_file), get_pos_data(pos_file))
    print(f"Processed {ner_file.name} and {pos_file.name}")
    return aligned


In [8]:
def merge_both_datasets(ner_path, pos_path):
    """Align every NER/POS file pair of parts 1..10 and merge the results.

    For each ``<i>Part`` folder the ``*.txt`` files below ``ner_path`` and the
    ``*.conllu`` files below ``pos_path`` are sorted with ``natural_key`` and
    paired by position. Each pair is processed with ``process_file_pair``.

    Args:
        ner_path: ``pathlib.Path`` of the directory with the NER part folders.
        pos_path: ``pathlib.Path`` of the directory with the POS part folders.

    Returns:
        One ``pandas.DataFrame`` with the rows of all pairs, re-indexed from 0.

    Raises:
        ValueError: If no file pair was found at all.
    """
    frames = []
    for i in range(1, 11):
        ner_subdir = ner_path / f"{i}Part"
        pos_subdir = pos_path / f"{i}Part"
        ner_files = sorted(ner_subdir.glob("*.txt"), key=lambda f: natural_key(f.name))
        pos_files = sorted(pos_subdir.glob("*.conllu"), key=lambda f: natural_key(f.name))
        if len(ner_files) != len(pos_files):
            # zip() stops at the shorter list, and a missing file shifts every
            # later pair, so make the mismatch visible.
            print(f"Warning: {i}Part has {len(ner_files)} NER files but {len(pos_files)} POS files")
        for ner_file, pos_file in zip(ner_files, pos_files):
            result = process_file_pair(ner_file, pos_file)
            if isinstance(result, list):
                result = pd.DataFrame(result)
            frames.append(result)

    if not frames:
        raise ValueError(f"No NER/POS file pairs found under {ner_path} and {pos_path}")
    return pd.concat(frames, ignore_index=True)


def merge_both_datasets_with_threads(ner_path, pos_path, parts=(6,)):
    """Align the NER/POS file pairs of the selected parts in a thread pool.

    For each selected ``<n>Part`` folder the ``*.txt`` files below ``ner_path``
    and the ``*.conllu`` files below ``pos_path`` are sorted with
    ``natural_key`` and paired by position. All pairs are submitted to a
    ``ThreadPoolExecutor`` running ``process_file_pair``; the results are
    collected in submission order.

    Args:
        ner_path: ``pathlib.Path`` of the directory with the NER part folders.
        pos_path: ``pathlib.Path`` of the directory with the POS part folders.
        parts: Part numbers to process. Defaults to ``(6,)``, i.e. only the
            ``6Part`` folder; pass ``range(1, 11)`` to process parts 1 to 10.

    Returns:
        One ``pandas.DataFrame`` with the rows of all pairs, re-indexed from 0.

    Raises:
        ValueError: If no file pair was found at all.
    """
    file_pairs = []
    for part in parts:
        ner_subdir = ner_path / f"{part}Part"
        pos_subdir = pos_path / f"{part}Part"
        ner_files = sorted(ner_subdir.glob("*.txt"), key=lambda f: natural_key(f.name))
        pos_files = sorted(pos_subdir.glob("*.conllu"), key=lambda f: natural_key(f.name))
        if len(ner_files) != len(pos_files):
            # zip() stops at the shorter list, and a missing file shifts every
            # later pair, so make the mismatch visible.
            print(f"Warning: {part}Part has {len(ner_files)} NER files but {len(pos_files)} POS files")
        file_pairs.extend(zip(ner_files, pos_files))

    if not file_pairs:
        raise ValueError(f"No NER/POS file pairs found under {ner_path} and {pos_path}")

    # os.cpu_count() may return None when the count cannot be determined.
    max_workers = (os.cpu_count() or 1) * 4
    final_dataset = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_file_pair, ner_file, pos_file) for ner_file, pos_file in file_pairs]
        # Iterating the futures in submission order keeps the rows in file order.
        for future in futures:
            result = future.result()
            if isinstance(result, list):
                result = pd.DataFrame(result)
            final_dataset.append(result)
    return pd.concat(final_dataset, ignore_index=True)

In [9]:
# Directories holding the per-part NER (*.txt) and POS (*.conllu) files,
# relative to the working directory. Note that this rebinds `ner_path` and
# `pos_path`.
ner_path = Path("../../Corpus/Files/NER Files in Corpus")
pos_path = Path("../../Corpus/Files/POS Files in Corpus")

# Align the file pairs in a thread pool; one "Processed ..." line is printed per pair.
dataset = merge_both_datasets_with_threads(ner_path, pos_path)

Processed 9_data_lajme_rtsh_al_tech_headline.txt and 9.conllu
Processed 2_data_lajme_rtsh_al_tech_headline.txt and 2.conllu
Processed 14_data_lajme_rtsh_al_tech_headline.txt and 14.conllu
Processed 10_data_lajme_rtsh_al_tech_headline.txt and 10.conllu
Processed 16_data_lajme_rtsh_al_tech_headline.txt and 16.conllu
Processed 23_data_lajme_rtsh_al_tech_headline.txt and 23.conllu
Processed 6_data_lajme_rtsh_al_tech_headline.txt and 6.conllu
Processed 12_data_lajme_rtsh_al_tech_headline.txt and 12.conllu
Processed 19_data_lajme_rtsh_al_tech_headline.txt and 19.conllu
Processed 17_data_lajme_rtsh_al_tech_headline.txt and 17.conllu
Processed 22_data_lajme_rtsh_al_tech_headline.txt and 22.conllu
Processed 13_data_lajme_rtsh_al_tech_headline.txt and 13.conllu
Processed 5_data_lajme_rtsh_al_tech_headline.txt and 5.conllu
Processed 3_data_lajme_rtsh_al_tech_headline.txt and 3.conllu
Processed 24_data_lajme_rtsh_al_tech_headline.txt and 24.conllu
Processed 21_data_lajme_rtsh_al_tech_headline.txt 

In [10]:
# Write the merged frame to output.csv in the working directory, without the index column.
dataset.to_csv("output.csv", index=False)