# Import required libraries

In [1]:
import json 
from typing import List, Dict, Any
from datetime import datetime
import uuid

# Defining Important Functions

In [2]:
def convert_conll_to_labelstudio(conll_file_path: str) -> List[Dict[str, Any]]:
    """
    Converts a CoNLL format file to Label Studio JSON format for named entity recognition tasks.
    
    Args:
        conll_file_path (str): Path to the input CoNLL file
        
    Returns:
        List[Dict[str, Any]]: List of Label Studio compatible JSON objects
    """
    sentences = []
    current_sentence = []
    label_id = 1

    with open(conll_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if line.strip() == '' or line.startswith('-DOCSTART-'):
                if current_sentence:
                    sentences.append(current_sentence)
                    current_sentence = []
            else:
                current_sentence.append(line.strip().split())

    if current_sentence:
        sentences.append(current_sentence)

    output = []
    for idx, sentence in enumerate(sentences, start=1):
        text = ' '.join(token[0] for token in sentence)
        annotations = []
        start = 0
        current_label = None
        label_tokens = []
        label_start = 0

        for token in sentence:
            word, _, _, label = token
            end = start + len(word)
            
            # Check if we're continuing an entity or need to start a new one
            if label.startswith("B-") or (label != "O" and label.split("-")[1] != current_label):
                # Append current entity annotation if exists
                if label_tokens:
                    annotations.append({
                        "value": {
                            "start": label_start,
                            "end": start - 1,
                            "text": ' '.join(label_tokens),
                            "labels": [current_label]
                        },
                        "id": f"label_{uuid.uuid4()}",
                        "from_name": "label",
                        "to_name": "text",
                        "type": "labels",
                        "origin": "manual"
                    })
                    label_tokens = []

                # Start new entity
                current_label = label.split("-")[1]
                label_tokens = [word]
                label_start = start

            elif label.startswith("I-") and label.split("-")[1] == current_label:
                # Continue the current entity
                label_tokens.append(word)

            else:
                # End current entity if there's one
                if label_tokens:
                    annotations.append({
                        "value": {
                            "start": label_start,
                            "end": start - 1,
                            "text": ' '.join(label_tokens),
                            "labels": [current_label]
                        },
                        "id": f"label_{uuid.uuid4()}",
                        "from_name": "label",
                        "to_name": "text",
                        "type": "labels",
                        "origin": "manual"
                    })
                    label_tokens = []
                current_label = None

            start = end + 1  # +1 for the space

        # Add any remaining entity at the end of the sentence
        if label_tokens:
            annotations.append({
                "value": {
                    "start": label_start,
                    "end": start - 1,
                    "text": ' '.join(label_tokens),
                    "labels": [current_label]
                },
                "id": f"label_{uuid.uuid4()}",
                "from_name": "label",
                "to_name": "text",
                "type": "labels",
                "origin": "manual"
            })

        current_time = datetime.utcnow().isoformat() + "Z"
        output.append({
            "id": idx,
            "annotations": [{
                "id": idx,
                "completed_by": 1,
                "result": annotations,
                "was_cancelled": False,
                "ground_truth": False,
                "created_at": current_time,
                "updated_at": current_time,
                "lead_time": 41.132,
                "prediction": {},
                "result_count": len(annotations),
                "task": idx,
                "project": 1,
                "updated_by": 1
            }],
            "data": {"text": text},
            "meta": {},
            "created_at": current_time,
            "updated_at": current_time,
            "inner_id": idx,
            "total_annotations": 1,
            "cancelled_annotations": 0,
            "total_predictions": 0,
            "project": 1,
            "updated_by": 1
        })

    return output

# Script execution entry point

In [3]:
# Locate your conll files
if __name__ == "__main__":
    conll_file_path = r"c:\Users\Sakib Ahmed\Downloads\Projekt 8 Dec 6 2024.conll"  # Replace with your actual file path
    result = convert_conll_to_labelstudio(conll_file_path)
    
    output_file_path = "labelstudio_output.json"
    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    
    print(f"Conversion completed. Output saved to {output_file_path}")

  current_time = datetime.utcnow().isoformat() + "Z"


Conversion completed. Output saved to labelstudio_output.json
