In [3]:
import os

def collect_and_inspect_data(folder_path):
    """
    Step 1: Scan each file and check basic integrity.

    Walks ``folder_path`` recursively, collects every file whose name ends in
    ``.txt`` or ``.tsv`` and prints a warning for each non-blank line that has
    no tab between its token and label columns.

    Args:
        folder_path: Root directory to scan.

    Returns:
        List of matching file paths. Directories are visited and file names
        listed in sorted order, so the result is reproducible across runs
        and filesystems.

    Raises:
        UnicodeDecodeError: If a matching file is not valid UTF-8.
    """
    all_files = []
    for root, dirs, files in os.walk(folder_path):
        # os.walk yields entries in arbitrary filesystem order; sorting `dirs`
        # in place fixes the descent order, sorting `files` fixes the listing.
        dirs.sort()
        for name in sorted(files):
            if not name.endswith((".txt", ".tsv")):
                continue
            file_path = os.path.join(root, name)
            # Stream the file instead of loading every line into memory.
            with open(file_path, "r", encoding="utf-8") as f:
                for line_no, line in enumerate(f, start=1):
                    # Test the stripped line: a lone leading/trailing tab
                    # (empty token or empty tag) disappears once the line is
                    # stripped before splitting, so it must be reported too.
                    content = line.strip()
                    if content and "\t" not in content:
                        print(f"Warning: Missing tab delimiter in {file_path} line {line_no}")
            all_files.append(file_path)
    return all_files

def convert_files_to_token_tag_pairs(file_list):
    """
    Step 2: Turn annotated files into sentences of (token, tag) pairs.

    Each file is read as UTF-8. Every line is stripped of surrounding
    whitespace; a blank line closes the sentence being built, and a line with
    at least two tab-separated fields contributes ``(first_field,
    last_field)``. Lines with fewer than two fields are skipped without ending
    the sentence. A sentence never spans two files.

    Args:
        file_list: Iterable of paths to the files to parse.

    Returns:
        List of sentences, each a non-empty list of ``(token, tag)`` tuples.
    """
    sentences = []
    for path in file_list:
        pending = []
        with open(path, "r", encoding="utf-8") as handle:
            for raw in handle:
                text = raw.strip()
                if text:
                    fields = text.split("\t")
                    if len(fields) > 1:
                        # Middle columns (if any) are ignored: first is the
                        # token, last is the tag.
                        pending.append((fields[0], fields[-1]))
                elif pending:
                    # Blank line: the sentence collected so far is complete.
                    sentences.append(pending)
                    pending = []

        # A file that does not end with a blank line still has an open sentence.
        if pending:
            sentences.append(pending)

    return sentences

# Example usage: gather the files of each split, then parse them into sentences.
if __name__ == "__main__":
    # All three folders are scanned first, in train/test/eval order ...
    train_files, test_files, eval_files = [
        collect_and_inspect_data(split_dir)
        for split_dir in ("path/to/train", "path/to/test", "path/to/eval")
    ]

    # ... and only then are the collected files converted, in the same order.
    train_data, test_data, eval_data = [
        convert_files_to_token_tag_pairs(split_files)
        for split_files in (train_files, test_files, eval_files)
    ]

    # Show one parsed sentence as a sanity check (or a placeholder if empty).
    first_sentence = train_data[0] if train_data else "No data"
    print("Example first sentence in train_data:", first_sentence)

Example first sentence in train_data: No data
