In [1]:
import pandas as pd
from xml.etree import ElementTree as ET
from pathlib import Path

def check_xml_well_formed(xml_path):
    """
    Verify that the XML file at ``xml_path`` parses cleanly.

    On success a confirmation line is printed and True is returned.
    On failure the parser's reported line/column and its message are
    printed, then the original ParseError is re-raised for the caller.
    """
    try:
        # Only success or failure of the parse matters; the tree is discarded.
        ET.parse(xml_path)
    except ET.ParseError as e:
        print(f"✗ XML parsing error in '{xml_path}':")
        print(f"  Line {e.position[0]}, Column {e.position[1]}")
        print(f"  Error: {str(e)}")
        raise
    else:
        print(f"✓ XML file '{xml_path}' is well-formed")
        return True

# Check XML well-formedness before attempting to read
xml_file = 'ICTRP-Results.xml'
if Path(xml_file).exists():
    check_xml_well_formed(xml_file)
    # Read the XML file into a DataFrame using etree parser
    df = pd.read_xml(xml_file, parser='etree')

    # Display basic information about the DataFrame.
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would append a stray "None" line.
    print("\nDataFrame Info:")
    df.info()
else:
    # NOTE: `df` stays undefined on this path, so any later cell that uses it
    # will fail with NameError until the file is present and this cell re-run.
    print(f"File not found: {xml_file}")

✓ XML file 'ICTRP-Results.xml' is well-formed
           Export_date  Internal_Number                 TrialID  \
0  03/19/2025 12:20:45         14746198  ChiCTR2500098323\n       
1  03/19/2025 12:20:45         14745527       NCT06864702\n       
2  03/19/2025 12:20:45         14745953  ChiCTR2500098075\n       
3  03/19/2025 12:20:45         14706466   TCTR20250302004\n       
4  03/19/2025 12:20:45         14705480        KCT0010243\n       

     Last_Refreshed_on                                       Public_title  \
0  10 March 2025\n      A prospective, multi-center study to character...   
1  10 March 2025\n      The Construction and Effect Verification of a ...   
2  10 March 2025\n      Diagnosis and treatment of obstructive sleep a...   
3   3 March 2025\n      Accuracy of radiographic fracture detection pr...   
4   3 March 2025\n      Verification of clinical efficacy of artificia...   

                                    Scientific_title  \
0  A prospective, multi-center s

In [8]:
from collections import Counter
# Concatenate every Countries entry into one string (missing values count
# as empty strings), then tally how often each character occurs
chars = ''.join(df['Countries'].fillna(''))

char_counts = Counter(chars)
char_counts

Counter({' ': 10764,
         'a': 3108,
         'n': 2661,
         '\n': 2526,
         'i': 2296,
         'e': 1417,
         'h': 1147,
         't': 1111,
         'C': 1097,
         'd': 926,
         'r': 699,
         'l': 573,
         'o': 546,
         ';': 446,
         'p': 414,
         's': 400,
         'S': 388,
         'U': 347,
         'g': 337,
         'y': 336,
         'u': 333,
         'I': 331,
         'm': 300,
         'K': 268,
         'c': 265,
         'T': 160,
         'J': 152,
         'R': 140,
         'w': 139,
         'b': 137,
         'f': 125,
         ',': 118,
         'G': 107,
         'A': 107,
         'F': 101,
         'k': 96,
         'E': 78,
         'N': 76,
         'z': 56,
         'B': 53,
         'P': 36,
         'H': 36,
         'D': 16,
         'M': 14,
         'x': 11,
         'Z': 11,
         '(': 10,
         ')': 10,
         'L': 8,
         'v': 8,
         'V': 5,
         'W': 4,
         'q': 2,
     

In [11]:
# Find all parenthetical text in the Countries column.
# A regex extracts each "(...)" group in a single pass. Pairing the i-th "("
# with the i-th ")" by position gives truncated or empty slices as soon as
# the parentheses are nested or unbalanced. For nested text only the
# innermost group is returned.
import re

re.findall(r'\([^()]*\)', chars)

['(Islamic Republic of)',
 '(Islamic Republic of)',
 '(Islamic Republic of)',
 '(Islamic Republic of)',
 '(Islamic Republic of)',
 '(except Japan)',
 '(except Japan)',
 '(Islamic Republic of)',
 '(Islamic Republic of)',
 '(except Japan)']

In [5]:
import pandas as pd
from pathlib import Path

def find_unused_character(file_path, start=32, end=127):
    """
    Find the characters in a code-point range that never occur in a text file.

    Args:
        file_path: Path to the file to scan (read as UTF-8 text).
        start: First code point to consider, inclusive (default 32 = space).
        end: Code point at which to stop, exclusive (default 127, so the
            last candidate is 126 = '~', the last printable ASCII character).

    Returns:
        A set containing every character in ``range(start, end)`` that does
        not appear in the file. The set is empty when all of them are used.
    """
    # Read the entire file as text
    text = Path(file_path).read_text(encoding='utf-8')

    # Collect the characters present once, instead of rescanning the whole
    # text for every candidate character.
    present = set(text)
    unused = {chr(code) for code in range(start, end)} - present

    # Report in code-point order so the output is reproducible
    # (set iteration order is arbitrary).
    for ch in sorted(unused):
        print(f"Found unused character: '{ch}' (ASCII: {ord(ch)})")

    return unused

# Look for characters that never occur in the CSV export
csv_file = 'IctrpResults.csv'
if not Path(csv_file).exists():
    print(f"File not found: {csv_file}")
else:
    unused_char = find_unused_character(csv_file)

Found unused character: '$' (ASCII: 36)
