# Import required libraries

In [7]:
import json
from typing import List, Dict, Any
from datetime import datetime

In [None]:
def convert_conll_to_labelstudio(conll_file_path: str) -> List[Dict[str, Any]]:
    """
    Convert CoNLL format file to Label Studio JSON format.
    
    Args:
        conll_file_path (str): Path to the input CoNLL file
        
    Returns:
        List[Dict[str, Any]]: List of Label Studio compatible JSON objects
    """
    sentences = []
    current_sentence = []
    label_id = 1

    with open(conll_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if line.strip() == '' or line.startswith('-DOCSTART-'):
                if current_sentence:
                    sentences.append(current_sentence)
                    current_sentence = []
            else:
                current_sentence.append(line.strip().split())

    if current_sentence:
        sentences.append(current_sentence)

    output = []
    for idx, sentence in enumerate(sentences, start=1):
        text = ' '.join(token[0] for token in sentence)
        annotations = []
        start = 0

        for token in sentence:
            word, _, _, label = token
            end = start + len(word)
            if label != 'O':
                annotations.append({
                    "value": {
                        "start": start,
                        "end": end,
                        "text": word,
                        "labels": [label.split('-')[-1]]
                    },
                    "id": f"label_{label_id}",
                    "from_name": "label",
                    "to_name": "text",
                    "type": "labels",
                    "origin": "manual"
                })
                label_id += 1
            start = end + 1  # +1 for the space

        current_time = datetime.utcnow().isoformat() + "Z"
        output.append({
            "id": idx,
            "annotations": [{
                "id": idx,
                "completed_by": 1,
                "result": annotations,
                "was_cancelled": False,
                "ground_truth": False,
                "created_at": current_time,
                "updated_at": current_time,
                "lead_time": 41.132,
                "prediction": {},
                "result_count": 0,
                "task": idx,
                "project": 1,
                "updated_by": 1
            }],
            "file_upload": "conll-sentences.txt",
            "drafts": [],
            "predictions": [],
            "data": {"text": text},
            "meta": {},
            "created_at": current_time,
            "updated_at": current_time,
            "inner_id": idx,
            "total_annotations": 1,
            "cancelled_annotations": 0,
            "total_predictions": 0,
            "project": 1,
            "updated_by": 1
        })

    return output

# Main execution block

In [9]:
# Locate your conll files
if __name__ == "__main__":
    conll_file_path = r"d:\data\train.conll"  # Replace with your actual file path
    result = convert_conll_to_labelstudio(conll_file_path)
    
    output_file_path = "labelstudio_output.json"
    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    
    print(f"Conversion completed. Output saved to {output_file_path}")

  current_time = datetime.utcnow().isoformat() + "Z"


Conversion completed. Output saved to labelstudio_output.json
