
# MTL-Bioinformatics-2016 - Preparing Data for BioBERT Training

This notebook automates the process of:
- Loading all datasets from MTL-Bioinformatics-2016
- Converting the data into a format compatible with Hugging Face Transformers
- Creating train, dev, and test splits ready for BioBERT fine-tuning

The splits of all loaded datasets are merged into a single combined train/dev/test corpus; the per-dataset splits remain available in `all_datasets` if you prefer to train on individual datasets.


In [50]:

import os
import pandas as pd
from collections import defaultdict

# Base folder for datasets: every sub-directory found here is treated as one
# dataset by the loading loop further down.
# NOTE(review): absolute, machine-specific path - adjust before running on
# another machine.
base_folder = '/media/smartdragon/WORK/6th Semester/22AIE315 - Natural Language Processing/Project/MTL-Bioinformatics-2016_1/data/67'


# Function to read CoNLL files into sentences
def read_conll_file(filepath):
    """Parse a CoNLL-style file into a list of sentences.

    The file is read as UTF-8. Every line that contains at least one
    non-whitespace character is split on whitespace and stored as a list of
    fields; lines that are empty or whitespace-only terminate the current
    sentence. A final sentence that is not followed by a blank line is kept.

    Args:
        filepath: Path of the file to read.

    Returns:
        A list of sentences, each sentence being a list of field lists
        (one field list per non-blank line).
    """
    sentences = []
    current = []
    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            # split() without arguments ignores leading/trailing whitespace
            # and yields [] for a blank line.
            fields = raw_line.split()
            if fields:
                current.append(fields)
            elif current:
                # Blank line after content: close the sentence being built.
                sentences.append(current)
                current = []
    # Flush a trailing sentence when the file does not end with a blank line.
    if current:
        sentences.append(current)
    return sentences

# Convert sentences into Hugging Face-friendly format
def convert_to_hf_format(sentences):
    """Turn parsed sentences into ``{"tokens": [...], "tags": [...]}`` records.

    Args:
        sentences: Iterable of sentences, each a sequence of two-element
            rows that unpack as ``(token, tag)``.

    Returns:
        A list with one dict per sentence, holding the tokens and the tags
        as two parallel lists in their original order.

    Raises:
        ValueError: If a row does not unpack into exactly two values.
    """
    examples = []
    for rows in sentences:
        words = []
        labels = []
        # Unpack each row once and fill both parallel lists together.
        for word, label in rows:
            words.append(word)
            labels.append(label)
        examples.append({"tokens": words, "tags": labels})
    return examples


In [51]:

# Per-dataset container: dataset name -> {"train": [...], "dev": [...], "test": [...]}
all_datasets = defaultdict(lambda: {"train": [], "dev": [], "test": []})

# Loop through datasets in the data folder.
# os.listdir() returns entries in arbitrary order; sorting makes the dataset
# order (and therefore the order of anything built by iterating all_datasets)
# reproducible across machines and filesystems.
for dataset in sorted(os.listdir(base_folder)):
    dataset_folder = os.path.join(base_folder, dataset)
    if not os.path.isdir(dataset_folder):
        continue  # plain files next to the dataset folders are ignored

    train_file = os.path.join(dataset_folder, 'train.tsv')
    test_file = os.path.join(dataset_folder, 'test.tsv')
    dev_file = os.path.join(dataset_folder, 'devel.tsv')

    # A missing split file yields an empty split instead of an error.
    train_data = read_conll_file(train_file) if os.path.exists(train_file) else []
    test_data = read_conll_file(test_file) if os.path.exists(test_file) else []
    dev_data = read_conll_file(dev_file) if os.path.exists(dev_file) else []

    all_datasets[dataset]["train"] = convert_to_hf_format(train_data)
    all_datasets[dataset]["dev"] = convert_to_hf_format(dev_data)
    all_datasets[dataset]["test"] = convert_to_hf_format(test_data)

    # Counts are numbers of sentences per split.
    print(f"{dataset} - Train: {len(train_data)}, Dev: {len(dev_data)}, Test: {len(test_data)}")


NCBI-disease-IOBES - Train: 5424, Dev: 923, Test: 940


In [52]:

# Merge the per-dataset splits into three flat lists, following the
# iteration order of all_datasets.
combined_train, combined_dev, combined_test = [], [], []

for dataset, splits in all_datasets.items():
    # In-place concatenation: each combined list grows by this dataset's examples.
    combined_train += splits['train']
    combined_dev += splits['dev']
    combined_test += splits['test']

print(f"Combined dataset size - Train: {len(combined_train)}, Dev: {len(combined_dev)}, Test: {len(combined_test)}")


Combined dataset size - Train: 5424, Dev: 923, Test: 940


In [53]:

import json

# Destination directory for the exported JSONL files. It is created (together
# with any missing parent directories) if absent; an existing directory is
# accepted as-is because of exist_ok=True.
# NOTE(review): absolute, machine-specific path - adjust before running on
# another machine.
output_folder = '/media/smartdragon/WORK/6th Semester/22AIE315 - Natural Language Processing/Project/New_Json_Files'
os.makedirs(output_folder, exist_ok=True)

def save_jsonl(data, filepath):
    """Write ``data`` to ``filepath`` in JSON Lines format.

    Each element of ``data`` is serialised with ``json.dumps`` and written on
    its own line. The file is opened in write mode with UTF-8 encoding, so
    any existing content is replaced.

    Args:
        data: Iterable of JSON-serialisable objects.
        filepath: Path of the file to (over)write.
    """
    with open(filepath, 'w', encoding='utf-8') as sink:
        # One serialised record per line, each terminated by a newline.
        sink.writelines(json.dumps(record) + '\n' for record in data)

# Export each combined split to its own JSONL file inside output_folder.
exports = [
    (combined_train, 'combined_train_67.jsonl'),
    (combined_dev, 'combined_dev_67.jsonl'),
    (combined_test, 'combined_test_67.jsonl'),
]
for examples, filename in exports:
    save_jsonl(examples, os.path.join(output_folder, filename))

print("Saved combined datasets for Hugging Face training.")


Saved combined datasets for Hugging Face training.


In [54]:
# Flatten the sentence-level training split into one token/tag pair per record.
# The file is read fully into memory first and then overwritten in place.

# Load dataset again
file_path = "/media/smartdragon/WORK/6th Semester/22AIE315 - Natural Language Processing/Project/New_Json_Files/combined_train_67.jsonl"

# Read JSONL file into a list. Blank lines are skipped because json.loads()
# raises on an empty string.
data = []
with open(file_path, "r", encoding="utf-8") as file:
    for line in file:
        record = line.strip()
        if record:
            data.append(json.loads(record))

# Convert to one token per label
processed_data = []
for entry in data:
    tokens = entry["tokens"]
    tags = entry["tags"]

    # zip() stops at the shorter list; tokens and tags are expected to be
    # the same length.
    for token, tag in zip(tokens, tags):
        processed_data.append({"tokens": [token], "tags": [tag]})

# Convert to DataFrame (used only for the preview below)
df = pd.DataFrame(processed_data)

# Display the processed dataset
print(df.head())

# Save the processed data back to the same JSONL file. The encoding is given
# explicitly so the write side matches the UTF-8 read above instead of
# depending on the platform default.
output_file = file_path
with open(output_file, "w", encoding="utf-8") as f:
    for entry in processed_data:
        f.write(json.dumps(entry) + "\n")


             tokens tags
0  [Identification]  [O]
1              [of]  [O]
2            [APC2]  [O]
3               [,]  [O]
4               [a]  [O]


In [55]:
# Flatten the sentence-level test split into one token/tag pair per record.
# The file is read fully into memory first and then overwritten in place.

# Load dataset again
file_path = "/media/smartdragon/WORK/6th Semester/22AIE315 - Natural Language Processing/Project/New_Json_Files/combined_test_67.jsonl"

# Read JSONL file into a list. Blank lines are skipped because json.loads()
# raises on an empty string.
data = []
with open(file_path, "r", encoding="utf-8") as file:
    for line in file:
        record = line.strip()
        if record:
            data.append(json.loads(record))

# Convert to one token per label
processed_data = []
for entry in data:
    tokens = entry["tokens"]
    tags = entry["tags"]

    # zip() stops at the shorter list; tokens and tags are expected to be
    # the same length.
    for token, tag in zip(tokens, tags):
        processed_data.append({"tokens": [token], "tags": [tag]})

# Convert to DataFrame (used only for the preview below)
df = pd.DataFrame(processed_data)

# Display the processed dataset
print(df.head())

# Save the processed data back to the same JSONL file. The encoding is given
# explicitly so the write side matches the UTF-8 read above instead of
# depending on the platform default.
output_file = file_path
with open(output_file, "w", encoding="utf-8") as f:
    for entry in processed_data:
        f.write(json.dumps(entry) + "\n")


         tokens tags
0  [Clustering]  [O]
1          [of]  [O]
2    [missense]  [O]
3   [mutations]  [O]
4          [in]  [O]


In [56]:
# Flatten the sentence-level dev split into one token/tag pair per record.
# The file is read fully into memory first and then overwritten in place.

# Load dataset again
file_path = "/media/smartdragon/WORK/6th Semester/22AIE315 - Natural Language Processing/Project/New_Json_Files/combined_dev_67.jsonl"

# Read JSONL file into a list. Blank lines are skipped because json.loads()
# raises on an empty string.
data = []
with open(file_path, "r", encoding="utf-8") as file:
    for line in file:
        record = line.strip()
        if record:
            data.append(json.loads(record))

# Convert to one token per label
processed_data = []
for entry in data:
    tokens = entry["tokens"]
    tags = entry["tags"]

    # zip() stops at the shorter list; tokens and tags are expected to be
    # the same length.
    for token, tag in zip(tokens, tags):
        processed_data.append({"tokens": [token], "tags": [tag]})

# Convert to DataFrame (used only for the preview below)
df = pd.DataFrame(processed_data)

# Display the processed dataset
print(df.head())

# Save the processed data back to the same JSONL file. The encoding is given
# explicitly so the write side matches the UTF-8 read above instead of
# depending on the platform default.
output_file = file_path
with open(output_file, "w", encoding="utf-8") as f:
    for entry in processed_data:
        f.write(json.dumps(entry) + "\n")


       tokens tags
0     [BRCA1]  [O]
1        [is]  [O]
2  [secreted]  [O]
3       [and]  [O]
4  [exhibits]  [O]
