In [1]:
from collections import Counter
import os
from tqdm import tqdm
from proiel_ud_maps import pos_map, translate_morph

def convert_file(infile, outfile, global_pos, global_feats):
    """Rewrite one CoNLL-U file, translating its POS and morphology columns.

    Blank lines, comment lines (starting with ``#``) and rows that do not
    have exactly 10 tab-separated columns are copied to the output unchanged.
    For every other row, the 4th column (index 3) is replaced by its
    ``pos_map`` entry and the 6th column (index 5) by the result of
    ``translate_morph``.

    Args:
        infile: Path of the UTF-8 file to read.
        outfile: Path of the UTF-8 file to write; overwritten if it exists.
        global_pos: Counter-like mapping, incremented once for every row whose
            POS tag has no (truthy) entry in ``pos_map``; such rows are
            written with the tag ``"X"``.
        global_feats: Forwarded untouched to ``translate_morph``
            (presumably tallies unknown features -- confirm against that
            helper, which is defined outside this file).

    Raises:
        ValueError: If ``infile`` and ``outfile`` resolve to the same path.
            Opening the output in ``"w"`` mode would truncate the input
            before a single line had been read.
    """
    if os.path.realpath(infile) == os.path.realpath(outfile):
        raise ValueError(
            f"infile and outfile are the same file ({infile!r}); "
            "in-place conversion would truncate the input"
        )

    with open(infile, encoding="utf-8") as fin, open(outfile, "w", encoding="utf-8") as fout:
        for line in fin:
            if not line.strip() or line.startswith("#"):
                fout.write(line)
                continue

            # Remove only the line terminator.  A bare strip() would also eat
            # leading/trailing tabs, so a row whose first or last column is
            # empty would lose a column and be passed through unconverted.
            fields = line.rstrip("\r\n").split("\t")
            if len(fields) != 10:  # skip malformed
                fout.write(line)
                continue

            # POS: unknown tags are tallied and replaced by the fallback "X"
            short_pos = fields[3]
            upos = pos_map.get(short_pos)
            if not upos:
                global_pos[short_pos] += 1
                upos = "X"
            fields[3] = upos

            # Morph
            fields[5] = translate_morph(fields[5], global_feats)

            fout.write("\t".join(fields) + "\n")


def convert_folder(infolder, outfolder):
    """Convert every ``.conllu`` file in ``infolder`` and print a summary.

    ``outfolder`` is created if it does not exist. Each file whose name ends
    in ``.conllu`` is passed to ``convert_file`` and written to ``outfolder``
    under the same name. Two counters shared across all files collect the
    unknown POS tags and unknown features, which are printed at the end,
    most frequent first.

    Args:
        infolder: Directory containing the input files.
        outfolder: Directory that receives the converted files.
    """
    os.makedirs(outfolder, exist_ok=True)
    global_pos, global_feats = Counter(), Counter()

    # The progress bar counts every directory entry, not only the .conllu ones.
    for entry in tqdm(os.listdir(infolder)):
        if not entry.endswith(".conllu"):
            continue
        convert_file(
            os.path.join(infolder, entry),
            os.path.join(outfolder, entry),
            global_pos,
            global_feats,
        )

    # Final report
    unknown_pos_total = sum(global_pos.values())
    unknown_feat_total = sum(global_feats.values())

    print("\n=== Global Conversion Report ===")
    print(f"Unknown POS tags: {unknown_pos_total}")
    for tag, count in global_pos.most_common():
        print(f"  POS {tag}: {count}")

    print(f"\nUnknown features: {unknown_feat_total}")
    for feat, count in global_feats.most_common():
        print(f"  Feature {feat}: {count}")

# Convert every .conllu file found in "conllu_proiel", write the results to
# "conllu_ud" under the same file names, then print the unknown-tag report.
convert_folder("conllu_proiel", "conllu_ud")

100%|██████████| 1999/1999 [01:31<00:00, 21.84it/s]


=== Global Conversion Report ===
Unknown POS tags: 0

Unknown features: 0





In [2]:
import os

def merge_conllu_to_tsv(input_folder, output_file="merged.tsv", include_comments=False):
    """
    Concatenate all UD .conllu files in a folder into a single .tsv file.

    Files whose names end in ``.conllu`` or ``.conllu.ud`` are appended line
    by line, in sorted file-name order so that repeated runs produce the same
    output. Blank lines are always dropped, so sentence boundaries are not
    kept in the merged file.

    Args:
        input_folder: Directory to scan (not recursive).
        output_file: Path of the merged file; overwritten if it exists.
        include_comments: When False (default), lines starting with ``#``
            are skipped.
    """
    suffixes = (".conllu.ud", ".conllu")
    out_real = os.path.realpath(output_file)

    with open(output_file, "w", encoding="utf-8") as out:
        # os.listdir returns entries in arbitrary order; sort for reproducibility.
        for fname in sorted(os.listdir(input_folder)):
            if not fname.endswith(suffixes):
                continue
            in_path = os.path.join(input_folder, fname)
            # Never read the file we are currently writing (possible when the
            # output lives inside input_folder and has a matching suffix).
            if os.path.realpath(in_path) == out_real:
                continue
            with open(in_path, "r", encoding="utf-8") as f:
                for line in f:
                    if not include_comments and line.startswith("#"):
                        continue
                    if not line.strip():  # only non-empty lines
                        continue
                    # A file may end without a newline; terminate the row so it
                    # is not fused with the first row of the next file.
                    out.write(line if line.endswith("\n") else line + "\n")
    print(f"Merged all .conllu files in '{input_folder}' into '{output_file}'")

# Flatten the converted files in "conllu_ud" into "merged_ud.tsv"; comment
# lines and blank lines are left out of the merged file.
merge_conllu_to_tsv("conllu_ud", "merged_ud.tsv", include_comments=False)

Merged all .conllu files in 'conllu_ud' into 'merged_ud.tsv'
