In [1]:
from rdflib import Namespace, Graph
from rdflib.namespace import NamespaceManager, OWL, RDF, RDFS, XSD
import re
from decimal import Decimal

In [2]:
def append_file(source_file, target_file):
    """Append the full text of ``source_file`` to the end of ``target_file``.

    Both files are handled as UTF-8 text, matching the encoding that
    ``process_lines`` uses for its input and output, so the result does not
    depend on the platform's default locale encoding.

    Args:
        source_file: Path of the file whose content is read.
        target_file: Path of the file to extend; it is created if missing.
    """
    # Append mode only needs write access, so plain 'a' is enough.
    with open(source_file, 'r', encoding='utf-8') as src, \
            open(target_file, 'a', encoding='utf-8') as tgt:
        tgt.write(src.read())

In [3]:
# Turtle header lines written at the top of every processed file.
# One declaration per list element, each terminated by a newline.
prefixes = [
    "@prefix owl: <http://www.w3.org/2002/07/owl#> .\n",
    "@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .\n",
    "@prefix xml: <http://www.w3.org/XML/1998/namespace> .\n",
    "@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .\n",
    "@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n",
    # The comma after this entry matters: without it Python concatenates it
    # with the @base string into a single list element.
    "@prefix ex: <http://example.org/ontology#> . \n",
    "@base <http://example.org/ontology/> .\n",
]

In [4]:
def escape_special_chars(match):
    """Return the matched text with special characters backslash-escaped.

    Intended as a replacement callback for ``re.sub``: it receives a match
    object and returns the whole matched string with a backslash inserted in
    front of each of ``& . ^ $ * + ? { } [ ] | ( )``.

    Args:
        match: A regex match object; the full match (group 0) is processed.

    Returns:
        The matched text with the listed characters escaped.
    """
    token = match.group()

    # Character class (as a capture group) of everything that gets escaped.
    special_chars = r'([&\.\^\$\*\+\?\{\}\[\]\|()])'

    # The negative lookbehind leaves alone any character that already has a
    # backslash directly in front of it, so existing escapes are not doubled.
    unescaped_special = re.compile(r'(?<!\\)' + special_chars)
    return unescaped_special.sub(r'\\\1', token)

# def process_text(text):
#     # Define the pattern to match words starting with 'news:'
#     pattern = r'\bnews:\S*'
#     # Apply the substitution using the escape_special_chars function
#     processed_text = re.sub(pattern, escape_special_chars, text)
#     return processed_text

In [15]:
# Function to clean and convert percentage values to decimal
def clean_percentage_to_decimal(match):
    """Rewrite a percentage-valued ``xsd:decimal`` literal as a plain decimal.

    Intended as a replacement callback for ``re.sub``. Group 1 of ``match``
    is the quoted lexical value of the literal.

    Args:
        match: A regex match object whose group 1 is the quoted value and
            whose full match is the complete ``"..."^^xsd:decimal`` literal.

    Returns:
        A new ``"<value>"^^xsd:decimal`` literal with the percentage divided
        by 100 (e.g. ``"20%"`` becomes ``"0.2"``), or the original matched
        text unchanged when the value contains no ``%``.

    Raises:
        decimal.InvalidOperation: If the text left after removing ``%`` is
            not a number ``Decimal`` can parse.
    """
    value = match.group(1).strip('"')  # Drop the surrounding quotes
    if "%" not in value:
        # Nothing to convert; keep the literal exactly as it was written.
        return match.group(0)

    decimal_value = Decimal(value.replace('%', '')) / 100
    # Use fixed-point formatting: str() on a Decimal switches to scientific
    # notation for small magnitudes (e.g. '1E-7'), and exponent notation is
    # not a valid xsd:decimal lexical form.
    return f'"{decimal_value:f}"^^xsd:decimal'

In [16]:
def process_text(line):
    """Convert percentage values in ``xsd:decimal`` literals on one line.

    Every ``"..."^^xsd:decimal`` literal found in ``line`` is passed to
    ``clean_percentage_to_decimal``, and its return value replaces the
    literal.

    Args:
        line: A single line of text to scan.

    Returns:
        The line with each matched literal replaced by the callback's result.
    """
    # [^"]* keeps each match inside one pair of quotes. A lazy '.*?' would
    # start at the first quote on the line and could swallow earlier string
    # literals (e.g. '"label" ; ex:rate "20%"^^xsd:decimal') into group 1.
    pattern = r'("[^"]*")\^\^xsd:decimal'
    return re.sub(pattern, clean_percentage_to_decimal, line)

In [17]:
def process_lines(input_file, output_file):
    """Rewrite ``input_file`` into ``output_file`` with a fixed prefix header.

    The input is read completely (UTF-8) and closed before the output is
    opened. Each input line is stripped of surrounding whitespace; lines that
    then start with ``@`` or ``<`` are dropped, and every remaining line is
    passed through ``process_text`` and terminated with a newline. The output
    (UTF-8, overwritten) consists of the module-level ``prefixes`` entries
    followed by the processed lines.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the text file to write.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        raw_lines = source.readlines()

    # Keep only lines that do not begin with '@' or '<' once stripped, and
    # normalise each kept line to end in exactly one newline.
    body = [
        process_text(text) + '\n'
        for text in (raw.strip() for raw in raw_lines)
        if not text.startswith(('@', '<'))
    ]

    # Header first, then the processed content.
    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.writelines(prefixes + body)