# Imports

This cell contains all the required **libraries and dependencies**
needed for preprocessing, tokenization, and further analysis.

In [None]:
from pathlib import Path
from conllu import parse
import re
import regex
import pandas as pd


# General Functions

This cell defines the **helper functions** used throughout the notebook.
They provide reusable functionality to support preprocessing, tokenization,
and other tasks required in later steps.

In [None]:
def normalize_quotes(text):
    """Return ``text`` with typographic quotes replaced by ASCII quotes.

    Curly double quotes become ``"`` and curly single quotes become ``'``.
    The straight ASCII quotes map to themselves, so they pass through
    unchanged.
    """
    # A single translation table performs all substitutions in one pass.
    quote_table = str.maketrans({
        '"': '"',
        '“': '"',
        '”': '"',
        '‘': "'",
        '’': "'",
        "'": "'",
    })
    return text.translate(quote_table)


def is_punctuation(text):
    """Return True when ``text`` contains no alphanumeric character.

    An empty string also yields True, because there is no character that
    could disqualify it.
    """
    for ch in text:
        if ch.isalnum():
            return False
    return True


def is_number(word: str) -> bool:
    """Return True if ``word`` consists solely of one or more digits."""
    whole_match = re.fullmatch(r'\d+', word)
    return whole_match is not None


def clean_word(word: str) -> str:
    """Return ``word`` with every non-letter character removed.

    Uses the ``regex`` module's ``\\p{L}`` Unicode property class, so only
    characters in the letter categories are kept.
    """
    non_letter = regex.compile(r'[^\p{L}]')
    return non_letter.sub('', word)


def clean_number(word: str) -> str:
    """Return ``word`` reduced to its ASCII digits ``0``-``9``."""
    non_digit = re.compile(r'[^0-9]')
    return non_digit.sub('', word)

In [None]:
def count_sentences_pos(conllu_dir):
    """Count the sentences of all ``*.conllu`` files below ``conllu_dir``.

    Only the sub-directories ``1Part`` through ``9Part`` are inspected;
    missing ones are skipped. A file that cannot be read or parsed is
    reported on stdout and contributes zero to the total.

    Args:
        conllu_dir: ``pathlib.Path`` of the folder holding the part folders.

    Returns:
        The total number of parsed sentences.
    """
    def sentences_in(conllu_path):
        # Best effort: one broken file must not abort the whole count.
        try:
            with Path(conllu_path).open(encoding="utf-8") as handle:
                content = handle.read()
            return len(parse(content))
        except Exception as e:
            print(f"Error processing {conllu_path}: {e}")
            return 0

    grand_total = 0
    for part_dir in (conllu_dir / f"{i}Part" for i in range(1, 10)):
        if not (part_dir.exists() and part_dir.is_dir()):
            continue
        grand_total += sum(
            sentences_in(conllu_file) for conllu_file in part_dir.glob("*.conllu")
        )
    return grand_total


def count_sentences_ner(file_path):
    """Count the sentences in a blank-line-separated NER token file.

    A sentence is a run of lines between blank lines that contains at least
    one token line (a line with two or more whitespace-separated fields).
    This is the same rule ``read_ner_data_and_assign_ids`` uses, so both
    functions report the same number for the same file. Counting blank
    lines instead would miss a final sentence that has no trailing blank
    line and would over-count consecutive or leading blank lines.

    Args:
        file_path: Path (or path string) of the UTF-8 encoded NER file.

    Returns:
        The number of sentences, or 0 if the file cannot be read (the error
        is printed).
    """
    try:
        sentence_count = 0
        inside_sentence = False
        with Path(file_path).open(encoding="utf-8") as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # A blank line closes the current sentence, if any.
                    inside_sentence = False
                elif len(fields) >= 2 and not inside_sentence:
                    # First token line of a new sentence.
                    sentence_count += 1
                    inside_sentence = True
        return sentence_count
    except (OSError, ValueError) as e:
        # ValueError covers UnicodeDecodeError raised while decoding.
        print(f"Error processing {file_path}: {e}")
        return 0

In [None]:
def read_ner_data_and_assign_ids(file_path):
    """Read a blank-line-separated NER file into numbered sentences.

    Each non-blank line with at least two whitespace-separated fields yields
    a ``(token, label)`` pair built from its first and last field; lines with
    fewer fields are ignored. Blank lines end the current sentence.

    Args:
        file_path: Path (or path string) of the UTF-8 encoded NER file.

    Returns:
        A list of ``(sentence_id, [(token, label), ...])`` tuples with ids
        starting at 1, or an empty list if anything goes wrong (the error is
        printed).
    """
    try:
        with Path(file_path).open(encoding="utf-8") as handle:
            raw_lines = handle.readlines()

        collected = []
        pending = []

        for raw in raw_lines:
            fields = raw.split()
            if not fields:
                # Blank line: flush the sentence gathered so far, if any.
                if pending:
                    collected.append((len(collected) + 1, pending))
                    pending = []
                continue
            if len(fields) >= 2:
                pending.append((fields[0], fields[-1]))

        # The file may end without a closing blank line.
        if pending:
            collected.append((len(collected) + 1, pending))

        return collected
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return []


def _conllu_sort_key(path):
    """Order files by their leading ``<number>_`` prefix, then by name."""
    prefix = path.stem.split("_")[0]
    try:
        return (0, int(prefix), path.name)
    except ValueError:
        # Files without a numeric prefix go last, in name order.
        return (1, 0, path.name)


def read_pos_data_and_assign_ids(conllu_dir):
    """Read all CoNLL-U files below ``conllu_dir`` into numbered sentences.

    The sub-directories ``1Part`` through ``9Part`` are visited in that
    order; missing ones are skipped. Within a part, files are processed in
    ascending order of their numeric file-name prefix, the same ordering the
    text-file functions in this notebook use. ``Path.glob`` yields entries in
    no guaranteed order, so sorting is what keeps sentence ids stable across
    runs and machines.

    Args:
        conllu_dir: Folder (``Path`` or path string) holding the part folders.

    Returns:
        A list of ``(sentence_id, tokens)`` tuples with ids starting at 1.
        Each token is the tuple ``(form, lemma, upostag, feats, head, deprel,
        deps, misc)``, with quotes in form and lemma normalized.
    """
    conllu_dir = Path(conllu_dir)
    sentences = []
    sentence_id = 1

    for i in range(1, 10):
        subdir = conllu_dir / f"{i}Part"
        if not subdir.is_dir():
            continue

        for conllu_file in sorted(subdir.glob("*.conllu"), key=_conllu_sort_key):
            # Best effort per file: a broken file is reported and skipped,
            # but sentences already collected from it are kept.
            try:
                with conllu_file.open(encoding="utf-8") as f:
                    data = f.read()

                for sent in parse(data):
                    tokens = [
                        (
                            normalize_quotes(token['form']),
                            normalize_quotes(token['lemma']),
                            token['upostag'],
                            token['feats'],
                            token['head'],
                            token['deprel'],
                            token['deps'],
                            token['misc'],
                        )
                        for token in sent
                    ]
                    sentences.append((sentence_id, tokens))
                    sentence_id += 1
            except Exception as e:
                print(f"Error processing {conllu_file}: {e}")

    return sentences

In [None]:
# Input locations, given relative to the current working directory.
ner_path = Path("../../Corpus/korpusi.txt")  # NER corpus file
pos_path = Path("../../Conllu Files in Corpus/")  # folder searched for "<i>Part" sub-folders

In [None]:
# Count the sentences in each dataset; either helper returns 0 for an
# unreadable file after printing the error.
ner_sentences_count = count_sentences_ner(ner_path)
pos_sentences_count = count_sentences_pos(pos_path)

In [None]:
# Show both totals side by side so they can be compared.
print(f"NER sentences: {ner_sentences_count}")
print(f"POS sentences: {pos_sentences_count}")

In [None]:
# Load the NER corpus as (sentence_id, [(token, label), ...]) tuples and
# preview the first sentence.
# NOTE(review): the reader returns [] on error, in which case the [0] lookup
# below raises IndexError.
ner_sentences = read_ner_data_and_assign_ids(ner_path)
print(ner_sentences[0])

In [None]:
# Load the CoNLL-U data as (sentence_id, tokens) tuples and preview the
# first sentence.
# NOTE(review): if no sentence was read, the [0] lookup below raises
# IndexError.
pos_sentences = read_pos_data_and_assign_ids(pos_path)
print(pos_sentences[0])

# Initial Tokenization

This cell performs an initial **tokenization** pass over the text files of one corpus part (the code below targets `7Part`).
Every file in that part is processed, even if its tokens do not align perfectly with the corpus.
The results are later used for an additional round of tokenization.

In [None]:
def read_ner_corpus(path):
    """Load ``[word, tag]`` pairs from a tab-separated corpus file.

    Each line is split on tabs and empty columns are dropped. Lines with
    fewer than two remaining columns are skipped. Spaces inside the word
    column are removed; the second remaining column is taken as the tag.

    Args:
        path: Path (or path string) of the UTF-8 encoded corpus file.

    Returns:
        A list of two-element lists ``[word, tag]`` in file order.
    """
    pairs = []

    with Path(path).open(encoding="utf-8") as corpus:
        for raw_line in corpus:
            columns = [col.strip() for col in raw_line.strip().split("\t")]
            columns = [col for col in columns if col]
            if len(columns) < 2:
                continue

            word = columns[0].replace(" ", "")
            if word:
                pairs.append([word, columns[1]])

    return pairs

def _split_on_punctuation(token):
    """Split ``token`` into alphanumeric/hyphen runs and single other characters."""
    pieces = []
    run = []
    for char in token:
        if char.isalnum() or char == '-':
            run.append(char)
        else:
            if run:
                pieces.append("".join(run))
                run = []
            pieces.append(char)
    if run:
        pieces.append("".join(run))
    return pieces


def read_text_files_and_match(text_dir: Path, ner_words, output_dir: Path, *,
                              part_name: str = "7Part",
                              output_part_name: str = "7Part2"):
    """Tokenize the text files of one corpus part and tag them from the corpus.

    Every ``*.txt`` file in ``text_dir / part_name`` is processed in ascending
    order of the integer before the first ``_`` in its name. Each non-empty
    line is split on whitespace, then every chunk is split further so that
    each character that is neither alphanumeric nor ``-`` becomes its own
    token. Tokens are compared case-insensitively with the next unconsumed
    corpus word: on a match the corpus tag is used and the corpus position
    advances; otherwise the token is tagged ``O`` and the position stays put.
    One ``token<TAB>tag`` line is written per token, with an empty line after
    each sentence, to ``output_dir / output_part_name / <same file name>``.

    Args:
        text_dir: Folder containing the part folders with raw text files.
        ner_words: Sequence of ``[word, tag]`` pairs in corpus order. It is
            not modified.
        output_dir: Folder below which the tagged files are written.
        part_name: Name of the part folder to read (default ``"7Part"``).
        output_part_name: Name of the folder to write to (default
            ``"7Part2"``).

    Returns:
        A new list holding the corpus pairs that were not consumed.

    Raises:
        ValueError: If a ``*.txt`` file name does not start with an integer
            prefix (raised while sorting the files).
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    pos_index = 0  # index of the next unconsumed entry in ner_words
    subdir = text_dir / part_name

    if subdir.is_dir():
        files = sorted(subdir.glob("*.txt"), key=lambda f: int(f.stem.split("_")[0]))

        for txt_file in files:
            print(f"\nProcessing {txt_file} ...")

            # Best effort per file: an error is reported and the next file
            # continues from wherever the corpus position ended up.
            try:
                with txt_file.open(encoding="utf-8") as f:
                    raw_sentences = [line.strip() for line in f if line.strip()]

                matched_lines = []

                for raw in raw_sentences:
                    all_tokens = [
                        piece
                        for chunk in raw.split()
                        for piece in _split_on_punctuation(chunk)
                    ]

                    for token in all_tokens:
                        if pos_index >= len(ner_words):
                            # Corpus exhausted: everything left is untagged.
                            matched_lines.append(f"{token}\tO")
                            continue

                        corpus_word, tag = ner_words[pos_index]
                        if token.lower() == corpus_word.lower():
                            matched_lines.append(f"{token}\t{tag}")
                            pos_index += 1
                        else:
                            matched_lines.append(f"{token}\tO")
                            print(f"Mismatch: token '{token}' vs corpus '{corpus_word}' at position {pos_index}")

                    # Blank line marks the end of the sentence.
                    matched_lines.append("")

                out_subdir = output_dir / output_part_name
                out_subdir.mkdir(parents=True, exist_ok=True)
                out_file = out_subdir / txt_file.name
                with out_file.open("w", encoding="utf-8") as f:
                    f.write("\n".join(matched_lines))

                print(f" -> Wrote {len(matched_lines)} lines to {out_file}")

            except Exception as e:
                print(f"Error processing {txt_file}: {e}")

    # Slicing returns a fresh list, so the caller's list is never aliased.
    return list(ner_words[pos_index:])

In [None]:
# Inputs and output for the initial tokenization pass (paths are relative to
# the current working directory).
ner_path = Path("../../Corpus/korpusi.txt")
text_dir = Path("../../Text Files in Corpus/")
output_dir = Path("../../Matched Text Files/")

# Load the corpus as [word, tag] pairs, then tag the text files; `remaining`
# holds the corpus pairs that were not consumed.
ner_words = read_ner_corpus(ner_path)
remaining = read_text_files_and_match(text_dir, ner_words, output_dir)

print(f"\nDone. Remaining corpus tokens: {len(remaining)}")

# Final Tokenization

This cell performs the **final round of tokenization**.
Each token extracted from the corpus is written to its respective file.
This ensures a complete and organized representation of the data.

In [None]:
def get_corpus_data(path):
    """Load quote-normalized ``[word, tag]`` pairs from a tab-separated file.

    Blank lines are skipped. Columns are split on tabs, stripped, and empty
    ones dropped; lines with fewer than two remaining columns are ignored and
    only the first two columns are kept.

    Args:
        path: Path (or path string) of the UTF-8 encoded file.

    Returns:
        A list of two-element lists ``[word, tag]`` in file order.
    """
    pairs = []
    with Path(path).open(encoding="utf-8") as source:
        for raw_line in source:
            if raw_line.strip() == "":
                continue

            # Collapse "<TAB><TAB>'" into one tab, dropping the apostrophe.
            # NOTE(review): presumably repairs a stray quote after the
            # separator in some input files - confirm.
            repaired = raw_line.replace("\t\t'", "\t")

            fields = []
            for chunk in repaired.split("\t"):
                chunk = chunk.strip()
                if chunk:
                    fields.append(normalize_quotes(chunk))

            if len(fields) >= 2:
                pairs.append(fields[:2])
    return pairs

def fix_the_ner_data_per_file(text_dir: Path, corpus_tokens, output_dir: Path):
    """Rewrite each tagged text file with the corpus' own tokens and tags.

    The folders ``1Part`` through ``10Part`` below ``text_dir`` are visited
    in order, and within each the ``*.txt`` files in ascending order of the
    integer before the first ``_`` in their name. The words of a file are
    walked in parallel with ``corpus_tokens``: a word that equals the current
    corpus word (ignoring case) consumes it; otherwise consecutive words are
    concatenated until they equal the corpus word, and the accumulated
    fragment is discarded as soon as it stops being a prefix of that word.
    For every consumed corpus token a ``word<TAB><TAB>tag`` line is written
    to ``output_dir / <i>Part / <same file name>``.

    Processing stops at the first file that yields no match at all, after
    printing a diagnostic.

    Args:
        text_dir: Folder containing the ``<i>Part`` folders to read.
        corpus_tokens: Sequence of ``[word, tag]`` pairs in corpus order.
        output_dir: Folder below which the rewritten files are created.

    Returns:
        None.

    Raises:
        ValueError: If a ``*.txt`` file name does not start with an integer
            prefix (raised while sorting the files).
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    corpus_index = 0  # position of the next unconsumed corpus token

    for i in range(1, 11):
        subdir = text_dir / f"{i}Part"
        if not subdir.is_dir():
            continue

        files = sorted(subdir.glob("*.txt"), key=lambda f: int(f.stem.split("_")[0]))
        for file in files:
            fixed_lines = []
            recon_word = ""  # fragments accumulated towards the corpus word

            for ner_word_info in get_corpus_data(file):
                if corpus_index >= len(corpus_tokens):
                    break

                ner_word = normalize_quotes(ner_word_info[0])
                corpus_word, corpus_tag = corpus_tokens[corpus_index]
                corpus_word = normalize_quotes(corpus_word)
                target = corpus_word.lower()

                if ner_word.lower() == target:
                    matched = True
                else:
                    recon_word += ner_word
                    matched = recon_word.lower() == target

                if matched:
                    fixed_lines.append(f"{corpus_word}\t\t{corpus_tag}")
                    corpus_index += 1
                    recon_word = ""
                elif not target.startswith(recon_word.lower()):
                    # The prefix test ignores case, like the equality tests
                    # above, so a fragment that differs only in case is kept.
                    recon_word = ""

            if not fixed_lines:
                if corpus_index < len(corpus_tokens):
                    corpus_word, corpus_tag = corpus_tokens[corpus_index]
                    print(f"STOPPED: No matches found in file {file.name}. Next corpus word: {corpus_word}")
                else:
                    print(f"STOPPED: No matches found in file {file.name}. Corpus exhausted.")
                return

            out_subdir = output_dir / f"{i}Part"
            out_subdir.mkdir(parents=True, exist_ok=True)
            out_file = out_subdir / file.name
            with out_file.open("w", encoding="utf-8") as f:
                f.write("\n".join(fixed_lines))

            print(f"Wrote {len(fixed_lines)} lines to {out_file}")

    print(f"Done. Remaining corpus tokens: {len(corpus_tokens) - corpus_index}")


In [None]:
# Inputs and output for the final tokenization pass (paths are relative to
# the current working directory).
ner_path = Path("../../Corpus/korpusi.txt")
# NOTE(review): this folder differs from the "../../Matched Text Files/"
# output folder of the initial tokenization cell - confirm it is intended.
text_dir = Path("../../Matched Text Files2/")
output_dir = Path("../../Replaced Text Files/")

In [None]:
# Reload the corpus as quote-normalized [word, tag] pairs and rewrite every
# file below text_dir into output_dir.
ner_words = get_corpus_data(ner_path)
fix_the_ner_data_per_file(text_dir, ner_words, output_dir)

# Dataset Matching

This cell handles the **alignment and matching** of datasets.
It ensures that the data sources are properly synchronized
before moving on to tokenization and further processing.