In [None]:
import pandas as pd
# import train_test_split from sklearn
from sklearn.model_selection import train_test_split

# Load the raw, semicolon-separated dataset.
data = pd.read_csv('../datasets/train_raw.csv', sep=';')

# Hold out 30% of the rows, then halve the hold-out into test and valid
# (roughly 70/15/15). The fixed seed keeps the split reproducible.
train, test = train_test_split(data, test_size=0.3, random_state=42)
test, valid = train_test_split(test, test_size=0.5, random_state=42)

# Persist every split as a header-less, tab-separated file.
for split_frame, split_path in (
    (train, '../datasets/train.csv'),
    (test, '../datasets/test.csv'),
    (valid, '../datasets/valid.csv'),
):
    split_frame.to_csv(split_path, sep='\t', index=False, header=False)

In [9]:
import ast
import csv
import re
from pathlib import Path

# Line-feed character, built via chr() instead of an escape sequence.
NEWLINE = chr(10)

# Directory of this file when run as a script; in an interactive session
# (no __file__ defined) fall back to the current working directory.
if '__file__' in globals():
    BASE_DIR = Path(__file__).resolve().parent
else:
    BASE_DIR = Path().resolve()

# Relative location of the project root.
PROJECT_ROOT = '..'

# Load CSV/TSV lines and keep only non-empty ones

def read_raw_lines(path: Path) -> list[str]:
    """Return the stripped, non-empty lines of a text file.

    The file is decoded with the first encoding that succeeds out of
    UTF-8 with BOM, plain UTF-8 and cp1251.

    Args:
        path: File to read.

    Returns:
        Every line with surrounding whitespace removed; lines that are
        empty after stripping are dropped.

    Raises:
        UnicodeDecodeError: If none of the candidate encodings can decode
            the file.
    """
    candidate_encodings = ('utf-8-sig', 'utf-8', 'cp1251')
    for codec in candidate_encodings:
        try:
            content = path.read_text(encoding=codec)
        except UnicodeDecodeError:
            # Wrong guess: try the next encoding.
            continue
        stripped_lines = (raw.strip() for raw in content.splitlines())
        return [entry for entry in stripped_lines if entry]
    raise UnicodeDecodeError('utf-8', b'', 0, 1, 'Unable to decode input file')

# Convert raw lines into (sample, annotation) pairs even when header/delimiter differ

def parse_dataset(path: Path) -> list[dict[str, str]]:
    """Parse a delimited dataset file into sample/annotation records.

    The delimiter is detected from the first non-empty line. A header row
    is recognised when it contains both a ``sample`` and an ``annotation``
    column (case-insensitive); otherwise the first two columns are used.

    Args:
        path: Delimited text file to parse.

    Returns:
        One ``{'sample': ..., 'annotation': ...}`` dict per usable row, with
        both values stripped of surrounding whitespace. Rows that are too
        short to hold both columns are skipped.

    Raises:
        UnicodeDecodeError: Propagated from ``read_raw_lines`` when the file
            cannot be decoded.
    """
    lines = read_raw_lines(path)
    if not lines:
        return []

    # Probe for a tab before ';' and ','. Tabs almost never occur inside
    # natural-language text, whereas ';' and ',' do, so in a tab-separated
    # file a ';' in the first sample must not win the detection.
    delimiter = next((d for d in ('\t', ';', ',') if d in lines[0]), ';')
    rows = list(csv.reader(lines, delimiter=delimiter))
    if not rows:
        return []

    header = [cell.strip().lower() for cell in rows[0]]
    if 'sample' in header and 'annotation' in header:
        idx_sample = header.index('sample')
        idx_annotation = header.index('annotation')
        data_rows = rows[1:]
    else:
        # No recognisable header: assume (sample, annotation) column order.
        idx_sample, idx_annotation = 0, 1
        data_rows = rows

    last_needed = max(idx_sample, idx_annotation)
    # NOTE(review): stripping the sample could shift character offsets if
    # the annotations were computed on unstripped text - confirm upstream.
    return [
        {
            'sample': row[idx_sample].strip(),
            'annotation': row[idx_annotation].strip(),
        }
        for row in data_rows
        if len(row) > last_needed
    ]

# Parse annotation tuples stored as strings in the CSV

def read_spans(span_text: str) -> list[tuple[int, int, str]]:
    """Decode a stringified list of ``(start, end, label)`` annotations.

    Args:
        span_text: Python-literal text of the span list. A falsy value or
            the text ``nan`` (any casing) means "no annotations".

    Returns:
        The spans with ``start`` and ``end`` coerced to ``int``; the label
        is passed through unchanged.
    """
    is_missing = not span_text or str(span_text).lower() == 'nan'
    if is_missing:
        return []
    return [
        (int(begin), int(finish), tag)
        for begin, finish, tag in ast.literal_eval(span_text)
    ]

# Paint character-level BIO tags that we can later map to tokens

def build_char_tags(text: str, spans: list[tuple[int, int, str]]) -> list[str]:
    """Assign a BIO tag to every character of ``text``.

    Args:
        text: The sample the spans refer to.
        spans: ``(start, end, label)`` triples with ``end`` exclusive. A
            label may carry a ``PREFIX-`` part (split at the first dash);
            a label without a dash is treated as the entity type itself.

    Returns:
        A list as long as ``text``. Characters outside every span are
        ``'O'``. The first character of a span is ``I-<type>`` when the
        label prefix is ``I`` and ``B-<type>`` otherwise; the remaining
        characters are ``I-<type>``. Spans are clamped to the text bounds,
        and later spans overwrite earlier ones where they overlap.
    """
    text_len = len(text)
    tags = ['O'] * text_len
    for raw_start, raw_end, label in spans:
        # Empty and explicit outside labels paint nothing.
        if not label or label == 'O':
            continue
        prefix, entity_type = label.split('-', 1) if '-' in label else ('B', label)

        first = max(0, int(raw_start))
        stop = min(text_len, int(raw_end))
        if first >= stop:
            continue

        inside_tag = 'I-' + entity_type
        # Only an explicit I- prefix continues an entity; anything else opens one.
        tags[first] = inside_tag if prefix == 'I' else 'B-' + entity_type
        tags[first + 1:stop] = [inside_tag] * (stop - first - 1)
    return tags

# Collapse character labels into token-level BIO rows mirroring the example dataset

def sample_to_bio_rows(text: str, char_tags: list[str]) -> list[str]:
    """Turn character-level tags into ``"<token> <tag>"`` rows.

    Tokens are maximal runs of non-whitespace characters. Each token takes
    the first non-``'O'`` tag found among its characters, or ``'O'`` when
    there is none (including when ``char_tags`` is shorter than ``text``).

    Args:
        text: The sample to tokenise.
        char_tags: One tag per character of ``text``.

    Returns:
        One row per token, in order of appearance.
    """
    bio_rows = []
    for token_match in re.finditer(r'\S+', text):
        begin, finish = token_match.span()
        # Slicing clamps to the tag list length, so short lists are safe.
        token_tag = next(
            (candidate for candidate in char_tags[begin:finish] if candidate != 'O'),
            'O',
        )
        bio_rows.append(f'{token_match.group()} {token_tag}')
    return bio_rows

# Convert the entire CSV to a CoNLL-style file with blank lines between samples

def convert_csv_to_conll(input_path, output_path):
    """Convert an annotated CSV/TSV dataset into a CoNLL-style text file.

    Each sample becomes a run of ``"<token> <tag>"`` lines, and samples are
    separated by one blank line. The file always ends with exactly one
    newline.

    Args:
        input_path: Path (``str`` or ``Path``) of the delimited dataset.
        output_path: Path (``str`` or ``Path``) of the file to write. Missing
            parent directories are created.

    Returns:
        ``output_path`` as a ``Path``.
    """
    input_path = Path(input_path)
    output_path = Path(output_path)

    lines = []
    for row in parse_dataset(input_path):
        text = row.get('sample', '')
        spans = read_spans(row.get('annotation'))
        char_tags = build_char_tags(text, spans)
        lines.extend(sample_to_bio_rows(text, char_tags))
        lines.append('')  # blank line terminates the sample

    # Collapse trailing blank lines so the file ends with a single newline.
    output_text = NEWLINE.join(lines).rstrip(NEWLINE) + NEWLINE

    # write_text() does not create directories; do it here so a fresh
    # checkout without the output folder does not fail.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(output_text, encoding='utf-8')
    return output_path

# Convert every split in turn; the file names differ only by split name.
for split_name in ('train', 'valid', 'test'):
    input_csv = f'../datasets/{split_name}.csv'
    output_conll = f'../datasets/conll/{split_name}.txt'
    output_path = convert_csv_to_conll(input_csv, output_conll)
