# Importing necessary modules

In [1]:
import re
from collections import defaultdict, Counter

# Defining the core functions

In [10]:
class ConllEditor:
    """Load, inspect, edit and save a CoNLL-style annotation file.

    The file is held in memory as ``self.data``: a list of whitespace-stripped
    lines. A *token line* is a run of whitespace-separated columns whose first
    column is the token and whose last column is the label. An empty string
    separates sentences, and lines starting with ``-DOCSTART-`` are document
    markers that are never treated as tokens.

    All editing methods work on ``self.data`` only; nothing is written to disk
    until :meth:`save` is called.
    """

    _DOCSTART = "-DOCSTART-"
    # Tried in order when the requested encoding cannot decode the file.
    # NOTE: latin1 can decode any byte sequence, so in practice the first
    # fallback always succeeds and the later entries are a safety net only.
    _FALLBACK_ENCODINGS = ('latin1', 'iso-8859-1', 'cp1252')

    def __init__(self, file_path, encoding='utf-8'):
        """Read ``file_path`` into memory.

        Args:
            file_path: Path of the CoNLL file to load.
            encoding: Encoding to try first. If decoding fails, the fallback
                encodings are tried in order and ``self.encoding`` records the
                one that worked (it is reused by :meth:`save`).

        Raises:
            UnicodeDecodeError: If no candidate encoding can decode the file.
        """
        self.file_path = file_path
        self.encoding = encoding
        try:
            self.data = self._load_data()
        except UnicodeDecodeError as first_error:
            last_error = first_error
            for enc in self._FALLBACK_ENCODINGS:
                self.encoding = enc
                try:
                    self.data = self._load_data()
                except UnicodeDecodeError as err:
                    last_error = err
                    continue
                print(f"Successfully loaded file using {enc} encoding")
                break
            else:
                tried = ', '.join((encoding,) + self._FALLBACK_ENCODINGS)
                # UnicodeDecodeError needs its five positional fields; a bare
                # message would itself fail with TypeError.
                raise UnicodeDecodeError(
                    last_error.encoding,
                    last_error.object,
                    last_error.start,
                    last_error.end,
                    f"Could not read file with any of these encodings: {tried}",
                ) from last_error

    def _load_data(self):
        """Return the file's lines, each stripped of surrounding whitespace."""
        with open(self.file_path, 'r', encoding=self.encoding) as f:
            return [line.strip() for line in f]

    # ------------------------------------------------------------------
    # Line-level helpers
    # ------------------------------------------------------------------
    @classmethod
    def _is_token_line(cls, line):
        """Return True for a non-empty line that is not a document marker."""
        return bool(line) and not line.startswith(cls._DOCSTART)

    @staticmethod
    def _label_of(line):
        """Return the label, i.e. the last whitespace-separated column."""
        return line.split()[-1]

    @staticmethod
    def _with_label(line, new_label):
        """Return ``line`` with its last column replaced by ``new_label``.

        Everything before the label (including the original separators, e.g.
        tabs) is kept exactly as it was.
        """
        stripped = line.rstrip()
        old_label = stripped.split()[-1]
        return stripped[:len(stripped) - len(old_label)] + new_label

    def _relabel(self, mapper):
        """Apply ``mapper(old_label) -> new_label`` to every token line."""
        updated = []
        for line in self.data:
            # A label column only exists when there are at least two columns.
            if self._is_token_line(line) and len(line.split()) >= 2:
                old_label = self._label_of(line)
                new_label = mapper(old_label)
                if new_label != old_label:
                    line = self._with_label(line, new_label)
            updated.append(line)
        self.data = updated

    def _count_sentences(self, predicate):
        """Count sentences with at least one token line satisfying ``predicate``.

        Blank lines and document markers both end the current sentence.
        """
        count = 0
        hit = False
        for line in self.data:
            if self._is_token_line(line):
                hit = hit or predicate(line)
            else:
                if hit:
                    count += 1
                hit = False
        # The last sentence may not be followed by a blank line.
        if hit:
            count += 1
        return count

    def _filter_sentences(self, keep):
        """Drop every sentence for which ``keep(sentence_lines)`` is false.

        Replaces ``self.data`` and returns ``(sentences_deleted,
        tokens_deleted)``. Document markers are always kept and are followed
        by a blank line so they stay separated from the next sentence.
        """
        kept = []
        sentences_deleted = 0
        tokens_deleted = 0
        sentence = []
        # The trailing None flushes a final sentence that has no blank line
        # after it, without inventing a separator the input did not have.
        for line in self.data + [None]:
            if line is not None and self._is_token_line(line):
                sentence.append(line)
                continue
            if sentence:
                if keep(sentence):
                    kept.extend(sentence)
                    if line is not None:
                        kept.append("")
                else:
                    sentences_deleted += 1
                    tokens_deleted += len(sentence)
                sentence = []
            if line:  # only a -DOCSTART- marker is truthy here
                kept.append(line)
                kept.append("")
        self.data = kept
        return sentences_deleted, tokens_deleted

    # ------------------------------------------------------------------
    # Inspection
    # ------------------------------------------------------------------
    def view_annotations(self):
        """Print every token line followed by the total number of tokens."""
        annotations = [line for line in self.data if self._is_token_line(line)]
        for annotation in annotations:
            print(annotation)
        print(f"\nTotal number of tokens: {len(annotations)}")

    def label_stats(self):
        """Print the count of each label, grouped by entity type.

        ``O`` is listed first; other labels are ordered by entity type and then
        by their ``B``/``I`` prefix.
        """
        label_counter = Counter(
            self._label_of(line) for line in self.data if self._is_token_line(line)
        )
        total_labels = sum(label_counter.values())
        unique_labels = len(label_counter)

        def sort_key(label):
            if label == 'O':
                return ('', '')  # empty strings sort before any entity name
            prefix, dash, entity = label.partition('-')
            if not dash:
                return (label, '')  # label without a B-/I- style prefix
            return (entity, prefix)  # entity type first, then B/I prefix

        for label, count in sorted(label_counter.items(), key=lambda x: sort_key(x[0])):
            print(f"Label: {label}, Count: {count}")

        print(f"\nTotal number of labels found: {total_labels}")
        print(f"Total number of unique tags: {unique_labels}")

    def search_by_label(self, label):
        """Print every token line whose label is exactly ``label``.

        Also prints the number of matching tokens and the number of sentences
        that contain at least one of them. Printed line numbers are 1-based
        positions in ``self.data``.
        """
        def has_label(line):
            return self._label_of(line) == label

        matches = [
            (index, line)
            for index, line in enumerate(self.data)
            if self._is_token_line(line) and has_label(line)
        ]
        sentence_count = self._count_sentences(has_label)

        for index, match in matches:
            print(f"Line {index + 1}: {match}")

        print(f"\nNumber of tokens found with label '{label}': {len(matches)}")
        print(f"Number of sentences containing label '{label}': {sentence_count}")

    def search_by_token(self, token):
        """Print every token line whose token column contains ``token``.

        The match is a substring test against the first column only, so the
        other columns (e.g. ``-X-`` or the label) never cause a hit.
        """
        def has_token(line):
            return token in line.split()[0]

        matches = [
            (index, line)
            for index, line in enumerate(self.data)
            if self._is_token_line(line) and has_token(line)
        ]
        sentence_count = self._count_sentences(has_token)

        for index, match in matches:
            print(f"Line {index + 1}: {match}")

        print(f"\nNumber of tokens found with '{token}': {len(matches)}")
        print(f"Number of sentences containing '{token}': {sentence_count}")

    # ------------------------------------------------------------------
    # Editing
    # ------------------------------------------------------------------
    def remove_label(self, label_to_remove):
        """Re-tag every token labelled exactly ``label_to_remove`` as ``O``."""
        self._relabel(lambda label: 'O' if label == label_to_remove else label)
        print(f"Label '{label_to_remove}' removed.")

    def merge_labels(self, labels_to_merge, new_label):
        """Replace every label listed in ``labels_to_merge`` with ``new_label``."""
        targets = set(labels_to_merge)
        self._relabel(lambda label: new_label if label in targets else label)
        print(f"Labels {labels_to_merge} merged into '{new_label}'.")

    def rename_labels(self, label_mapping):
        """Rename labels according to ``label_mapping`` (old label -> new label).

        Labels that are not keys of the mapping are left untouched.
        """
        self._relabel(lambda label: label_mapping.get(label, label))
        print(f"Labels renamed according to {label_mapping}.")

    def delete_sentences_with_label(self, label_to_delete):
        """Delete every sentence containing a token labelled ``label_to_delete``."""
        sentences_deleted, tokens_deleted = self._filter_sentences(
            lambda sentence: all(
                self._label_of(line) != label_to_delete for line in sentence
            )
        )
        print(f"Sentences containing the label '{label_to_delete}' have been deleted.")
        print(f"Number of sentences deleted: {sentences_deleted}")
        print(f"Number of tokens deleted: {tokens_deleted}")

    def delete_sentences_without_annotations(self):
        """Delete every sentence whose tokens are all labelled ``O``."""
        sentences_deleted, tokens_deleted = self._filter_sentences(
            lambda sentence: any(self._label_of(line) != "O" for line in sentence)
        )
        print(f"Sentences without annotations have been deleted successfully.")
        print(f"Number of sentences deleted: {sentences_deleted}")
        print(f"Number of tokens deleted: {tokens_deleted}")

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------
    def save(self, output_path):
        """Write ``self.data`` to ``output_path``, one entry per line.

        The encoding the file was loaded with is tried first; if it cannot
        represent the text, UTF-8 is used instead.

        Raises:
            UnicodeEncodeError: If the text cannot be encoded with either
                encoding.
        """
        text = "\n".join(self.data) + "\n"
        try:
            with open(output_path, 'w', encoding=self.encoding) as f:
                f.write(text)
            print(f"Updated file saved to {output_path} using {self.encoding} encoding")
        except UnicodeEncodeError:
            # The original encoding cannot represent the text; fall back to UTF-8.
            try:
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(text)
                print(f"Updated file saved to {output_path} using utf-8 encoding")
            except UnicodeEncodeError as e:
                # Rebuild the exception with all five required fields.
                raise UnicodeEncodeError(
                    e.encoding,
                    e.object,
                    e.start,
                    e.end,
                    f"Failed to save file with both {self.encoding} and utf-8 encodings: {str(e)}",
                ) from e

In [11]:
# Initialize the editor with the CoNLL file path.
# NOTE: this is a machine-specific absolute path; replace it with the location
# of your own .conll file before running the notebook.
editor = ConllEditor(r'c:\Users\Sakib Ahmed\Downloads\Projekt 5 Dez 04 2024.conll')

In [4]:
# 1. View annotations: print every token line and the total token count
editor.view_annotations()

Leukozyten -X- _ B-BIOMARKER
4.3-10.8 -X- _ B-REFERENCE
G -X- _ B-UNIT
/ -X- _ O
l -X- _ B-VALUE
8.18 -X- _ I-VALUE
Erythrozyten -X- _ B-BIOMARKER
3.8-5.2 -X- _ B-REFERENCE
T -X- _ O
/ -X- _ B-VALUE
l -X- _ I-VALUE
4.56 -X- _ I-VALUE
Hämoglobin -X- _ B-BIOMARKER
120-160 -X- _ B-REFERENCE
g -X- _ B-UNIT
/ -X- _ I-UNIT
l -X- _ I-UNIT
139.0 -X- _ B-VALUE
Hämatokrit -X- _ B-BIOMARKER
0,35-0,47 -X- _ B-REFERENCE
l -X- _ B-UNIT
/ -X- _ I-UNIT
l -X- _ O
0.412 -X- _ B-VALUE
MCV -X- _ B-BIOMARKER
80-95 -X- _ B-REFERENCE
fl -X- _ B-UNIT
90.4 -X- _ B-VALUE
MCH -X- _ B-BIOMARKER
27-32 -X- _ B-REFERENCE
pg -X- _ B-UNIT
30.5 -X- _ B-VALUE
MCHC -X- _ B-BIOMARKER
310-360 -X- _ B-REFERENCE
g -X- _ B-UNIT
/ -X- _ I-UNIT
l -X- _ I-UNIT
337 -X- _ B-VALUE
Thrombozyten -X- _ B-BIOMARKER
150-450 -X- _ B-REFERENCE
G -X- _ B-UNIT
/ -X- _ I-UNIT
182 -X- _ B-VALUE
Mittleres -X- _ B-BIOMARKER
Plättchen -X- _ I-BIOMARKER
Volumen -X- _ I-BIOMARKER
6,4-9,7 -X- _ B-REFERENCE
fl -X- _ B-UNIT
11.1+ -X- _ B-VALUE
RDW -X

In [12]:
# 2. Label statistics: print the count of each label plus overall totals
editor.label_stats()

Label: O, Count: 37
Label: B-BIOMARKER, Count: 124
Label: I-BIOMARKER, Count: 146
Label: B-COMMENT, Count: 3
Label: B-REFERENCE, Count: 112
Label: I-REFERENCE, Count: 41
Label: B-UNIT, Count: 92
Label: I-UNIT, Count: 103
Label: B-VALUE, Count: 107
Label: I-VALUE, Count: 8

Total number of labels found: 773
Total number of unique tags: 10


In [None]:
# 3. Search annotations with a specific label
#    (prints matching lines, then token and sentence counts)
editor.search_by_label('B-PER')

Line 3: Hulls -X- _ B-PER
Line 188: Areces -X- _ B-PER
Line 195: Juan -X- _ B-PER
Line 442: Jaime -X- _ B-PER
Line 446: Chevenement -X- _ B-PER
Line 680: Luis -X- _ B-PER
Line 749: Conchita -X- _ B-PER
Line 764: Martina -X- _ B-PER
Line 809: SuÃ¡rez -X- _ B-PER
...
Line 109121: Samani -X- _ B-PER

Number of tokens found with label 'B-PER': 1669
Number of sentences containing label 'B-PER': 1079


In [None]:
# 4. Search annotations containing a specific token
editor.search_by_token("Florida")

Line 26203: Florida -X- _ B-LOC

Number of tokens found with 'Florida': 1
Number of sentences containing 'Florida': 1


In [10]:
# 5. Remove a specific label (matching tokens are re-tagged as 'O')
editor.remove_label('B-PER')

Label 'B-PER' removed.


In [11]:
# 6. Merge multiple labels into one (each listed label is replaced by the new label)
editor.merge_labels(['B-MISC', 'I-MISC', 'B-ORG'], 'C-MISC')

Labels ['B-MISC', 'I-MISC', 'B-ORG'] merged into 'C-MISC'.


In [12]:
# Re-check the label statistics after removing and merging labels
editor.label_stats()

Label: O, Count: 236241
Label: C-MISC, Count: 12775
Label: B-LOC, Count: 4913
Label: I-ORG, Count: 4992
Label: I-LOC, Count: 1891
Label: I-PER, Count: 3903

Total number of labels found: 264715


In [13]:
# 7. Rename labels using an old-label -> new-label mapping (a Python dict)
editor.rename_labels({
    'I-PER':'A-MISC',
    'B-LOC':'A-LOC'
})

Labels renamed according to {'I-PER': 'A-MISC', 'B-LOC': 'A-LOC'}.


In [14]:
# Re-check the label statistics after renaming labels
editor.label_stats()

Label: O, Count: 236241
Label: C-MISC, Count: 12775
Label: A-LOC, Count: 4913
Label: I-ORG, Count: 4992
Label: I-LOC, Count: 1891
Label: A-MISC, Count: 3903

Total number of labels found: 264715


In [15]:
# 8. Delete every sentence that contains a specific label
editor.delete_sentences_with_label("I-LOC")

Sentences containing the label 'I-LOC' have been deleted.
Number of sentences deleted: 413
Number of tokens deleted: 14029


In [16]:
# 9. Delete every sentence that has no annotations (all tokens tagged 'O')
editor.delete_sentences_without_annotations()


Sentences without annotations have been deleted successfully.
Number of sentences deleted: 2123
Number of tokens deleted: 31337
      


In [17]:
# Save the updated CoNLL file (written with the encoding the file was loaded
# with; UTF-8 is used if that encoding cannot represent the text)
editor.save('updated_conll_file.conll')


Updated file saved to updated_conll_file.conll
      


# Some additional scripts

### Non-ASCII lines remover

In [None]:
import re

def clean_lines_with_gaps(file_path, output_path='output.conll'):
    """Copy a CoNLL file, dropping every line that contains non-ASCII text.

    Blank lines (the sentence separators, i.e. the "gaps") are kept so the
    sentence structure of the remaining lines is preserved.

    Args:
        file_path: Path of the UTF-8 CoNLL file to read. Undecodable bytes are
            replaced with U+FFFD, which is non-ASCII, so those lines are
            dropped as well.
        output_path: Where the cleaned copy is written. Defaults to
            ``'output.conll'`` in the current working directory.
    """
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        # An empty (blank) line is trivially ASCII, so one test covers both
        # "keep separators" and "keep ASCII-only token lines".
        cleaned_lines = [line for line in file if line.strip().isascii()]

    # Save cleaned lines to a new file
    with open(output_path, 'w', encoding='utf-8') as file:
        file.writelines(cleaned_lines)

    print("Non-ASCII tokens removed successfully")

# Use the path to your .conll file here; the cleaned copy is written to
# 'output.conll' in the current working directory
clean_lines_with_gaps(r'updated_conll_file.conll')

### Duplicates Remover

In [None]:
import os
import hashlib
import glob
from io import StringIO

# Configuration
DATA_DIR = './data'  # base directory, relative to the current working directory
OUTPUT_DIR = os.path.join(DATA_DIR, 'removed_duplicates')  # where result files are written
ALLOWED_EXTENSIONS = {'conll'}  # accepted input extensions, lower-case, without the dot

# Create the output directory (and any missing parents) as soon as this code
# runs; exist_ok=True makes this a no-op when it already exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

def allowed_file(filename):
    """Return True when the text after the last dot is an allowed extension.

    The comparison is case-insensitive; a name without any dot is rejected.
    """
    _, dot, extension = filename.rpartition('.')
    if not dot:
        return False
    return extension.lower() in ALLOWED_EXTENSIONS

def get_next_file_number():
    """Return the next free sequence number for ``cleaned_<n>.conll`` files.

    Scans ``OUTPUT_DIR`` for files named ``cleaned_*.conll`` and returns one
    more than the highest number found, or 1 when there is none. Files whose
    name does not carry a plain decimal number (e.g. ``cleaned_backup.conll``)
    are ignored instead of aborting with ``ValueError``.
    """
    prefix = 'cleaned_'
    numbers = []
    for path in glob.glob(os.path.join(OUTPUT_DIR, 'cleaned_*.conll')):
        # Work on the file name only, so underscores or dots in the directory
        # part of the path cannot affect the parsing.
        candidate = os.path.basename(path)[len(prefix):].split('.')[0]
        if candidate.isascii() and candidate.isdigit():
            numbers.append(int(candidate))
    return max(numbers, default=0) + 1

def parse_conll_content(content):
    """Split raw CoNLL text into sentences.

    Each returned item is one sentence: its stripped, non-empty lines joined
    with a newline. Blank lines end a sentence; ``-DOCSTART-`` lines are
    ignored entirely (they neither end nor join a sentence).
    """
    stripped_lines = (raw.strip() for raw in content.split('\n'))

    blocks = []
    pending = []
    for text in stripped_lines:
        if text.startswith('-DOCSTART-'):
            continue  # document marker: not part of any sentence
        if text:
            pending.append(text)
            continue
        # Blank line: close the sentence collected so far, if any.
        if pending:
            blocks.append('\n'.join(pending))
            pending = []

    # The input may end without a trailing blank line.
    if pending:
        blocks.append('\n'.join(pending))

    return blocks

def remove_duplicates(content):
    """Remove duplicate sentences and report the ones that were dropped.

    Sentences are compared by their exact text, so two sentences are
    duplicates only when every line (token and label columns) is identical.
    The first occurrence is kept; later ones are moved to the "removed" output.

    Args:
        content: Raw CoNLL text.

    Returns:
        A 5-tuple ``(cleaned_text, removed_text, total_sentences,
        unique_sentences, removed_sentences)``. Both texts start with a
        ``-DOCSTART-`` header; each removed sentence is preceded by a comment
        line giving the 1-based positions of the original and of the duplicate.
    """
    header = '-DOCSTART- -X- O O\n\n'

    # Parse into sentences
    sentences = parse_conll_content(content)

    # Keyed by the sentence text itself: exact comparison with no digest step,
    # so it cannot collide and does not depend on MD5 being available.
    first_seen = {}  # sentence text -> 1-based position of first occurrence
    unique_sentences = []
    removed_sentences = []

    for current_pos, sentence in enumerate(sentences, start=1):
        original_pos = first_seen.setdefault(sentence, current_pos)
        if original_pos == current_pos:
            unique_sentences.append(sentence)
        else:
            removed_sentences.append((sentence, original_pos, current_pos))

    cleaned_text = header + ''.join(
        sentence + '\n\n' for sentence in unique_sentences
    )
    removed_text = header + ''.join(
        f"# Duplicate of sentence #{original_pos}, found at position #{current_pos}\n"
        + sentence + '\n\n'
        for sentence, original_pos, current_pos in removed_sentences
    )

    return (
        cleaned_text,
        removed_text,
        len(sentences),
        len(unique_sentences),
        len(removed_sentences)
    )

def main(file_path):
    """Deduplicate the sentences of one .conll file and report the result.

    Writes two numbered files into ``OUTPUT_DIR``: the cleaned text and the
    removed duplicates. Problems are reported on stdout rather than raised.
    """
    if not allowed_file(file_path):
        print('Invalid file type. Only .conll files are allowed')
        return

    try:
        with open(file_path, 'r', encoding='utf-8') as source:
            raw_text = source.read()

        (cleaned_text, removed_text,
         total, kept, dropped) = remove_duplicates(raw_text)

        # Both output files share the same zero-padded sequence number.
        suffix = f"{get_next_file_number():04d}"
        cleaned_path = os.path.join(OUTPUT_DIR, f"cleaned_{suffix}.conll")
        removed_path = os.path.join(OUTPUT_DIR, f"removed_sentences_{suffix}.conll")

        outputs = ((cleaned_path, cleaned_text), (removed_path, removed_text))
        for target, text in outputs:
            with open(target, 'w', encoding='utf-8') as sink:
                sink.write(text)

        report = (
            f'File deduplicated successfully:',
            f'Cleaned file saved to: {cleaned_path}',
            f'Removed sentences file saved to: {removed_path}',
            f'Statistics:',
            f'Original sentences: {total}',
            f'Unique sentences: {kept}',
            f'Duplicates removed: {dropped}',
        )
        for message in report:
            print(message)

    except UnicodeDecodeError:
        print('Invalid file encoding. File must be UTF-8 encoded')
    except Exception as e:
        print(f'An error occurred: {str(e)}')

if __name__ == '__main__':
    # Replace the path below with the actual path to your .conll file
    # (the current value is a machine-specific example)
    main(r'c:\Users\Sakib Ahmed\Downloads\project-20-at-2024-10-31-22-29-f2d4705b.conll')
