In [7]:
import json
import re

In [8]:
def convert_to_conll(sentences, annotations):
    conll_format = []
    
    for sentence, ann in zip(sentences, annotations):
        tokens = sentence.split()  # Tokenize sentence by whitespace
        token_labels = ['O'] * len(tokens)
        
        for entity in ann:
            label = entity['label']
            entity_tokens = entity['word'].split()  # Split entity word into tokens if it's multi-word
            start_idx = sentence[:entity['start']].count(' ')
            
            # Label tokens with B- and I- prefixes
            token_labels[start_idx] = f"B-{label}"
            for i in range(1, len(entity_tokens)):
                token_labels[start_idx + i] = f"I-{label}"
        
        # Add each token and its label to the conll_format list
        for token, label in zip(tokens, token_labels):
            conll_format.append(f"{token}\t{label}")
        conll_format.append("")  # Blank line to separate sentences
    
    return "\n".join(conll_format)

In [9]:
# Sample input
sentences = [
    "Acrylic acid, an organic compound with the formula C₃H₄O₂, is a colorless liquid known for its pungent odor. Commonly polymerized to produce superabsorbent polymers, it often utilizes catalysts like potassium persulfate. Key properties include a boiling point of 141 °C and high reactivity, making it essential in various industrial applications.",
    "Salicylic acid, an organic compound with the formula C₇H₆O₃, is a colorless crystalline substance known for its medicinal properties. Commonly used in acne treatments, it often involves catalysts like sulfuric acid during synthesis. Key properties include a melting point of 158 °C and effective anti-inflammatory effects, making it widely utilized.",
    "Hyaluronic acid, an organic compound with the formula C₁₄H₂₁N₁₁O₁₁, is a key component in connective tissues and skin hydration. Often synthesized using catalysts like sodium hydroxide, it boasts impressive properties, including a high molecular weight of 1,000,000 Da, making it vital in dermatology and cosmetic applications.",
    "Butene, an organic compound with the formula C₄H₈, is a colorless gas used in the production of polymers and gasoline. Catalysts such as palladium are often employed in its hydrogenation processes. Key properties include a boiling point of -1 °C and its role as a crucial feedstock in petrochemical industries.",
    "Glycolic acid, an organic compound with the formula C₂H₄O₃, is a colorless, odorless liquid widely used in skincare products for its exfoliating properties. Catalysts like sulfuric acid are commonly used in its synthesis. Key properties include a melting point of 76 °C and its effectiveness in promoting skin rejuvenation.",
    "Ethyl alcohol, an organic compound with the formula C₂H₅OH, is a colorless liquid commonly used as a solvent and in beverages. It is often produced via fermentation, utilizing catalysts like yeast. Key properties include a boiling point of 78.37 °C and its role as a key ingredient in many industries.",
    "Benzene, an organic compound with the formula C₆H₆, is a colorless liquid known for its sweet odor and flammability. It serves as a precursor in various chemical syntheses, often utilizing catalysts like aluminum chloride in electrophilic aromatic substitutions. Key properties include a boiling point of 80.1 °C, making it essential in industry.",
    "Acetone, an organic compound with the formula C₃H₆O, is a colorless, volatile liquid commonly used as a solvent in laboratories and industries. It is often produced via the dehydrogenation of isopropanol using catalysts like zinc oxide. Key properties include a boiling point of 56.05 °C, making it highly effective in various applications.",
    "Benzoic acid, an organic compound with the formula C₇H₆O₂, is a white crystalline solid known for its preservative properties. It is commonly synthesized using catalysts like sulfuric acid in the reaction of toluene and carbon dioxide. Key properties include a melting point of 122 °C, making it useful in food preservation.",
    "Phenol, an organic compound with the formula C₆H₅OH, is a colorless, viscous liquid known for its antiseptic properties. It is typically produced via the catalytic hydroxylation of benzene using catalysts like sulfuric acid. Key properties include a melting point of 40.7 °C, making it significant in pharmaceuticals and chemical manufacturing."
]

annotations = [
        [
            {"label": "organic-chemicals", "word": "Acrylic acid", "start": 0, "end": 12},
            {"label": "organic-chemicals", "word": "C₃H₄O₂", "start": 35, "end": 40},
            {"label": "catalyst", "word": "potassium persulfate", "start": 106, "end": 125},
            {"label": "property", "word": "141 °C", "start": 162, "end": 168}
        ],
        [
            {"label": "organic-chemicals", "word": "Salicylic acid", "start": 0, "end": 14},
            {"label": "organic-chemicals", "word": "C₇H₆O₃", "start": 37, "end": 42},
            {"label": "catalyst", "word": "sulfuric acid", "start": 113, "end": 126},
            {"label": "property", "word": "158 °C", "start": 168, "end": 173}
        ],
        [
            {"label": "organic-chemicals", "word": "Hyaluronic acid", "start": 0, "end": 14},
            {"label": "organic-chemicals", "word": "C₁₄H₂₁N₁₁O₁₁", "start": 37, "end": 44},
            {"label": "catalyst", "word": "sodium hydroxide", "start": 106, "end": 122},
            {"label": "property", "word": "1,000,000 Da", "start": 179, "end": 192}
        ],
        [
            {"label": "organic-chemicals", "word": "Butene", "start": 0, "end": 6},
            {"label": "organic-chemicals", "word": "C₄H₈", "start": 36, "end": 40},
            {"label": "catalyst", "word": "palladium", "start": 100, "end": 109},
            {"label": "property", "word": "-1 °C", "start": 172, "end": 176}
        ],
        [
            {"label": "organic-chemicals", "word": "Glycolic acid", "start": 0, "end": 13},
            {"label": "organic-chemicals", "word": "C₂H₄O₃", "start": 36, "end": 41},
            {"label": "catalyst", "word": "sulfuric acid", "start": 110, "end": 123},
            {"label": "property", "word": "76 °C", "start": 179, "end": 184}
        ],
        [
            {"label": "organic-chemicals", "word": "Ethyl alcohol", "start": 0, "end": 12},
            {"label": "organic-chemicals", "word": "C₂H₅OH", "start": 34, "end": 40},
            {"label": "catalyst", "word": "yeast", "start": 115, "end": 120},
            {"label": "property", "word": "78.37 °C", "start": 177, "end": 185}
        ],
        [
            {"label": "organic-chemicals", "word": "Benzene", "start": 0, "end": 7},
            {"label": "organic-chemicals", "word": "C₆H₆", "start": 34, "end": 39},
            {"label": "catalyst", "word": "aluminum chloride", "start": 110, "end": 126},
            {"label": "property", "word": "80.1 °C", "start": 177, "end": 183}
        ],
        [
            {"label": "organic-chemicals", "word": "Acetone", "start": 0, "end": 7},
            {"label": "organic-chemicals", "word": "C₃H₆O", "start": 35, "end": 40},
            {"label": "catalyst", "word": "zinc oxide", "start": 114, "end": 124},
            {"label": "property", "word": "56.05 °C", "start": 179, "end": 186}
        ],
        [
            {"label": "organic-chemicals", "word": "Benzoic acid", "start": 0, "end": 12},
            {"label": "organic-chemicals", "word": "C₇H₆O₂", "start": 35, "end": 40},
            {"label": "catalyst", "word": "sulfuric acid", "start": 104, "end": 117},
            {"label": "property", "word": "122 °C", "start": 180, "end": 186}
        ],
        [
            {"label": "organic-chemicals", "word": "Phenol", "start": 0, "end": 6},
            {"label": "organic-chemicals", "word": "C₆H₅OH", "start": 30, "end": 36},
            {"label": "catalyst", "word": "sulfuric acid", "start": 124, "end": 137},
            {"label": "property", "word": "40.7 °C", "start": 188, "end": 194}
        ]  
]

In [10]:
# Function to update annotation start/end indices using regex
def update_annotations(sentences, annotations):
    updated_annotations = []
    
    for sentence, annotation_list in zip(sentences, annotations):
        updated_annotation = []
        
        for annotation in annotation_list:
            word = annotation['word']
            start_pos = annotation['start']
            end_pos = annotation['end']
            
            # Find the word using regex
            match = re.search(re.escape(word), sentence)
            if match:
                # Correct the start and end positions if necessary
                corrected_start = match.start()
                corrected_end = match.end()
                
                # Update the annotation with the corrected positions
                updated_annotation.append({
                    "label": annotation["label"],
                    "word": word,
                    "start": corrected_start,
                    "end": corrected_end
                })
            else:
                # In case the word is not found, keep the original annotation (though this shouldn't happen)
                updated_annotation.append(annotation)
        
        updated_annotations.append(updated_annotation)
    
    return updated_annotations


In [11]:
# Update the annotations with correct start and end positions
updated_new_annotations = update_annotations(sentences, annotations)

# Print the updated annotations
for updated in updated_new_annotations:
    for annotation in updated:
        print(annotation)
        

{'label': 'organic-chemicals', 'word': 'Acrylic acid', 'start': 0, 'end': 12}
{'label': 'organic-chemicals', 'word': 'C₃H₄O₂', 'start': 51, 'end': 57}
{'label': 'catalyst', 'word': 'potassium persulfate', 'start': 199, 'end': 219}
{'label': 'property', 'word': '141 °C', 'start': 263, 'end': 269}
{'label': 'organic-chemicals', 'word': 'Salicylic acid', 'start': 0, 'end': 14}
{'label': 'organic-chemicals', 'word': 'C₇H₆O₃', 'start': 53, 'end': 59}
{'label': 'catalyst', 'word': 'sulfuric acid', 'start': 201, 'end': 214}
{'label': 'property', 'word': '158 °C', 'start': 275, 'end': 281}
{'label': 'organic-chemicals', 'word': 'Hyaluronic acid', 'start': 0, 'end': 15}
{'label': 'organic-chemicals', 'word': 'C₁₄H₂₁N₁₁O₁₁', 'start': 54, 'end': 66}
{'label': 'catalyst', 'word': 'sodium hydroxide', 'start': 168, 'end': 184}
{'label': 'property', 'word': '1,000,000 Da', 'start': 256, 'end': 268}
{'label': 'organic-chemicals', 'word': 'Butene', 'start': 0, 'end': 6}
{'label': 'organic-chemicals', '

In [15]:
def convert_annotations(annotations):
    
    entities = []
    current_entity = None

    for annotation in annotations:
        label = annotation['label']
        start = annotation['start']
        end = annotation['end']

        if current_entity is None or current_entity['label'] != label:
            current_entity = {'start': start, 'end': end, 'label': f"B-{label}"}
            entities.append(current_entity)
        else:
            current_entity['end'] = end
            entities.append({'start': start, 'end': end, 'label': f"I-{label}"})

    return {'labels': {'entities': entities}}

i = 0 
for updated in updated_new_annotations:
    result = json.dumps(convert_annotations(updated))
    lab = '{"text": '+'"' + sentences[i] + '"' +','+ result
    print(lab)
    i+=1

{"text": Acrylic acid, an organic compound with the formula C₃H₄O₂, is a colorless liquid known for its pungent odor. Commonly polymerized to produce superabsorbent polymers, it often utilizes catalysts like potassium persulfate. Key properties include a boiling point of 141 °C and high reactivity, making it essential in various industrial applications.,{"labels": {"entities": [{"start": 0, "end": 12, "label": "B-organic-chemicals"}, {"start": 51, "end": 57, "label": "B-organic-chemicals"}, {"start": 199, "end": 219, "label": "B-catalyst"}, {"start": 263, "end": 269, "label": "B-property"}]}}
{"text": Salicylic acid, an organic compound with the formula C₇H₆O₃, is a colorless crystalline substance known for its medicinal properties. Commonly used in acne treatments, it often involves catalysts like sulfuric acid during synthesis. Key properties include a melting point of 158 °C and effective anti-inflammatory effects, making it widely utilized.,{"labels": {"entities": [{"start": 0, "en

In [6]:
# Convert to CoNLL format
conll_output = convert_to_conll(sentences, updated_new_annotations)
print(conll_output)

Acrylic	B-organic-chemicals
acid,	I-organic-chemicals
an	O
organic	O
compound	O
with	O
the	O
formula	O
C₃H₄O₂,	B-organic-chemicals
is	O
a	O
colorless	O
liquid	O
known	O
for	O
its	O
pungent	O
odor.	O
Commonly	O
polymerized	O
to	O
produce	O
superabsorbent	O
polymers,	O
it	O
often	O
utilizes	O
catalysts	O
like	O
potassium	B-catalyst
persulfate.	I-catalyst
Key	O
properties	O
include	O
a	O
boiling	O
point	O
of	O
141	B-property values
°C	I-property values
and	O
high	O
reactivity,	O
making	O
it	O
essential	O
in	O
various	O
industrial	O
applications.	O

Salicylic	B-organic-chemicals
acid,	I-organic-chemicals
an	O
organic	O
compound	O
with	O
the	O
formula	O
C₇H₆O₃,	B-organic-chemicals
is	O
a	O
colorless	O
crystalline	O
substance	O
known	O
for	O
its	O
medicinal	O
properties.	O
Commonly	O
used	O
in	O
acne	O
treatments,	O
it	O
often	O
involves	O
catalysts	O
like	O
sulfuric	B-catalyst
acid	I-catalyst
during	O
synthesis.	O
Key	O
properties	O
include	O
a	O
melting	O
point	O
of	O
158	B-property values

In [7]:
# Save to a file
with open('data.conll', 'w', encoding='utf-8') as f:
    f.write(conll_output)

print("CoNLL formatted data saved to 'output.conll'.")

CoNLL formatted data saved to 'output.conll'.
