In [1]:
import sys
from enum import Enum
from pydantic import BaseModel, Field
from pydantic_ai import Agent

# Define a simple enum and model to test with
class Rating(str, Enum):
    POSITIVE = "positive"
    NEGATIVE = "negative"
    NEUTRAL = "neutral"

class SentimentAnalysis(BaseModel):
    sentiment: Rating = Field(..., description="The sentiment of the text")
    confidence: float = Field(..., description="Confidence score between 0 and 1")

# Test text
test_text = "I really enjoyed this movie. The acting was superb!"

# Print version info
print(f"Python version: {sys.version}")
print(f"Pydantic-AI version: {getattr(Agent, '__version__', 'unknown')}")
print("\nTesting different agent initialization methods...\n")

# Method 1: Standard response_model approach
try:
    print("Testing Method 1: response_model parameter...")
    agent1 = Agent(
        model="gpt-4o-mini",
        system_prompt="You are a sentiment analysis assistant.",
        response_model=SentimentAnalysis,
    )
    result1 = agent1.run(f"Analyze the sentiment: {test_text}")
    print(f"✅ SUCCESS: {result1.data}")
except Exception as e:
    print(f"❌ FAILED: {str(e)}")

# Method 2: Type parameter approach
try:
    print("\nTesting Method 2: Using Agent[Model] syntax...")
    agent2 = Agent[SentimentAnalysis](
        model="gpt-4o-mini",
        system_prompt="You are a sentiment analysis assistant.",
    )
    result2 = agent2.run(f"Analyze the sentiment: {test_text}")
    print(f"✅ SUCCESS: {result2.data}")
except Exception as e:
    print(f"❌ FAILED: {str(e)}")

# Method 3: model_name and schema approach (older versions)
try:
    print("\nTesting Method 3: Using model_name and schema parameters...")
    agent3 = Agent(
        model="gpt-4o-mini",
        system_prompt="You are a sentiment analysis assistant.",
        model_name=SentimentAnalysis.__name__,
        schema=SentimentAnalysis.model_json_schema()
    )
    result3 = agent3.run(f"Analyze the sentiment: {test_text}")
    print(f"✅ SUCCESS: {result3.data}")
except Exception as e:
    print(f"❌ FAILED: {str(e)}")

# Method 4: Try import create_agent if available
try:
    print("\nTesting Method 4: Using create_agent function...")
    try:
        from pydantic_ai import create_agent
        agent4 = create_agent(
            model="gpt-4o-mini",
            system_prompt="You are a sentiment analysis assistant.",
            output_model=SentimentAnalysis
        )
        result4 = agent4.run(f"Analyze the sentiment: {test_text}")
        print(f"✅ SUCCESS: {result4.data}")
    except ImportError:
        print("❌ FAILED: create_agent function not available in this version")
except Exception as e:
    print(f"❌ FAILED: {str(e)}")

print("\nTest complete. Use the method that succeeded.")

Python version: 3.12.9 (main, Feb  4 2025, 00:00:00) [GCC 14.2.1 20250110 (Red Hat 14.2.1-7)]
Pydantic-AI version: unknown

Testing different agent initialization methods...

Testing Method 1: response_model parameter...
❌ FAILED: Agent.__init__() got an unexpected keyword argument 'response_model'

Testing Method 2: Using Agent[Model] syntax...
❌ FAILED: 'coroutine' object has no attribute 'data'

Testing Method 3: Using model_name and schema parameters...
❌ FAILED: Agent.__init__() got an unexpected keyword argument 'model_name'

Testing Method 4: Using create_agent function...
❌ FAILED: create_agent function not available in this version

Test complete. Use the method that succeeded.


In [7]:
from pydantic_ai import Agent

testing_agent = Agent(
    model="gpt-4o-mini",
    system_prompt="You are a sentiment analysis assistant.",
    result_type=Rating,
)

test_text = "I really enjoyed this movie. The acting was not bad!"

result = await testing_agent.run(f"Analyze the sentiment: {test_text}")

print(result.data)

Rating.POSITIVE


In [12]:
df['Public_title'].head()

0    Construction and Efficacy Evaluation of an Art...
1    Construction of a new diagnostic model for spo...
2    The Research and Development and Translation o...
3    Auxiliary diagnosis and treatment framework of...
4    Multi-center study on the effectiveness of art...
Name: Public_title, dtype: object

In [9]:
import pandas as pd
from xml.etree import ElementTree as ET
from pathlib import Path

def check_xml_well_formed(xml_path):
    """
    Check if an XML file is well-formed.
    Returns True if well-formed, raises ParseError with details if not.
    """
    try:
        tree = ET.parse(xml_path)
        print(f"✓ XML file '{xml_path}' is well-formed")
        return True
    except ET.ParseError as e:
        print(f"✗ XML parsing error in '{xml_path}':")
        print(f"  Line {e.position[0]}, Column {e.position[1]}")
        print(f"  Error: {str(e)}")
        raise

# Check XML well-formedness before attempting to read
xml_file = 'ICTRP-Results.xml'
if Path(xml_file).exists():
    check_xml_well_formed(xml_file)
    # Read the XML file into a DataFrame using etree parser
    df = pd.read_xml(xml_file, parser='etree')

    # Display basic information about the DataFrame
    print("\nDataFrame Info:")
    print(df.info()) 
else:
    print(f"File not found: {xml_file}")

  from pandas._libs import (


✓ XML file 'ICTRP-Results.xml' is well-formed

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2718 entries, 0 to 2717
Data columns (total 49 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Export_date                    2718 non-null   object
 1   Internal_Number                2718 non-null   int64 
 2   TrialID                        2718 non-null   object
 3   Last_Refreshed_on              2718 non-null   object
 4   Public_title                   2718 non-null   object
 5   Scientific_title               2717 non-null   object
 6   Primary_sponsor                2717 non-null   object
 7   Retrospective_flag             2718 non-null   object
 8   Date_registration3             2718 non-null   int64 
 9   Date_registration              2718 non-null   object
 10  Source_Register                2718 non-null   object
 11  web_address                    2718 non-null   object
 12 

In [None]:
import openai
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from typing import Literal, Optional
from enum import Enum

# Load environment variables
load_dotenv()

# Configure OpenAI client
client = openai.OpenAI()

# Define sponsor categories as an enum
class SponsorType(str, Enum):
    UNIVERSITY = 'university'
    HOSPITAL = 'hospital'
    GOVERNMENT = 'government institution'
    FOUNDATION = 'foundation'
    COMPANY = 'private company'
    INDIVIDUAL = 'individual (person)'
    RESEARCH = 'research center'
    CLINIC = 'private clinic'
    NON_PROFIT = 'non-profit health institution'
    UNCERTAIN = 'uncertain'

# Define AI request/response models
class ClassificationRequest(BaseModel):
    sponsor: str
    model: str = Field(default="gpt-4o-mini")
    temperature: float = Field(default=0)
    max_tokens: int = Field(default=20)

class Classifier:

    def __init__(self, categoryType, thing_to_classify):
        self.categories = categoryType
        self.thing_to_classify = thing_to_classify
        self.client = openai.OpenAI()
        
        # Define ClassificationResponse here, with access to self.categories
        class ClassificationResponse(BaseModel):
            category: categoryType
        
        # Store it as an attribute of the instance
        self.ClassificationResponse = ClassificationResponse

    def classify(self, request: ClassificationRequest) -> BaseModel:
        prompt = f"""Classify the following {self.thing_to_classify} into exactly one of these categories:
{[t.value for t in self.categories]}

<input> {request.sponsor} </input>

Return only the category name, nothing else."""

        response = self.client.chat.completions.create(
            model=request.model,
            messages=[
                {"role": "system", "content": f"You are a helpful assistant that classifies {self.thing_to_classify} into categories."},
                {"role": "user", "content": prompt}
            ],
            temperature=request.temperature,
            max_tokens=request.max_tokens
        )
        
        try:
            category = response.choices[0].message.content.strip().lower()
            return self.ClassificationResponse(
                category=SponsorType(category)
            )
        except:
            return self.ClassificationResponse(
                category=SponsorType.UNCERTAIN
            )

# Initialize classifier
sponsor_classifier = Classifier(SponsorType, "study sponsors")

# Get first 5 rows and classify sponsors
first_5 = df.head(5).copy()
first_5['Sponsor_Type'] = first_5['Primary_sponsor'].apply(
    lambda x: sponsor_classifier.classify(
        ClassificationRequest(sponsor=x)
    ).category.value
)

# Display results
print("\nClassification Results:")
print(first_5[['Primary_sponsor', 'Sponsor_Type']])



Classification Results:
                                     Primary_sponsor Sponsor_Type
0  First Affiliated Hospital of Sun Yat-sen Unive...     hospital
1                            Zhujiang Hospital\n         hospital
2                 Shanghai Children's Hospital\n         hospital
3                                          N/A\n        uncertain
4                     Inha University Hospital\n         hospital


In [15]:
# Check if countries are only separated by semicolons
# First, let's look at the Countries column
countries_sample = df.Countries.dropna().head(10)
print("Sample of Countries column:")
print(countries_sample)

# Check for different separators in the Countries column
import re

# Function to check for common separators
def check_separators(text):
    separators = {
        'semicolon': ';',
        'comma': ',',
        'pipe': '|',
        'tab': '\t',
        'newline': '\n'
    }
    
    results = {}
    for name, sep in separators.items():
        if pd.isna(text):
            results[name] = False
        else:
            results[name] = sep in text
    
    return results

# Apply the function to each non-null value in the Countries column
separator_results = df.Countries.dropna().apply(check_separators)

# Count occurrences of each separator
separator_counts = {
    sep: sum(result[sep] for result in separator_results) 
    for sep in ['semicolon', 'comma', 'pipe', 'tab', 'newline']
}

print("\nSeparator counts in Countries column:")
for sep, count in separator_counts.items():
    print(f"{sep}: {count}")

# Check if semicolon is the only separator used
only_semicolons = all(
    result['semicolon'] and not any(result[sep] for sep in ['comma', 'pipe', 'tab', 'newline'])
    for result in separator_results
)

print(f"\nAre countries only separated by semicolons? {only_semicolons}")




Sample of Countries column:
0    China\n    
1    China\n    
2    China\n    
3    China\n    
4    China\n    
5    China\n    
6    China\n    
7    China\n    
8    Spain\n    
9    China\n    
Name: Countries, dtype: object

Separator counts in Countries column:
semicolon: 138
comma: 108
pipe: 0
tab: 0
newline: 2564

Are countries only separated by semicolons? False


In [3]:
# Check for countries separated by newlines
# Make sure we're not just detecting newlines at the end of strings
newline_separated = df.Countries.dropna().apply(lambda x: '\n' in x.strip())
newline_count = newline_separated.sum()

print(f"\nNumber of rows with countries separated by newlines: {newline_count}")

if newline_count > 0:
    print("\nExamples of countries with newline separators:")
    newline_examples = df[newline_separated].Countries.head(5)
    for i, example in enumerate(newline_examples):
        print(f"\nExample {i+1}:")
        print(example)



Number of rows with countries separated by newlines: 0


In [5]:
# Check if all TrialIDs are unique
if 'TrialID' in df.columns:
    total_trials = len(df)
    unique_trials = df['TrialID'].nunique()
    duplicate_count = total_trials - unique_trials
    
    print(f"\nTotal number of trials: {total_trials}")
    print(f"Number of unique trial IDs: {unique_trials}")
    print(f"Number of duplicate trial IDs: {duplicate_count}")
    
    if duplicate_count > 0:
        print("\nExamples of duplicate trial IDs:")
        # Find trial IDs that appear more than once
        duplicate_ids = df['TrialID'].value_counts()
        duplicate_ids = duplicate_ids[duplicate_ids > 1]
        
        # Display the first 5 duplicate IDs and their counts
        for trial_id, count in duplicate_ids.head(5).items():
            print(f"Trial ID: {trial_id}, appears {count} times")
else:
    print("\nNo 'TrialID' column found in the dataframe")



Total number of trials: 2681
Number of unique trial IDs: 2681
Number of duplicate trial IDs: 0


In [43]:
df.loc[1]['Countries']

'China\n    '

In [55]:
# First, prepare the countries data - convert to lowercase for case-insensitive matching
countries_series = df['Countries'].dropna().str.replace(r'[;,]', '|', regex=True).str.split('|', expand=False)

# Get all unique countries
all_countries = set()
for country_list in countries_series:
    if isinstance(country_list, list):  # Check if it's a list (not NaN)
        all_countries.update([country.strip().lower() for country in country_list if country.strip()])

# Create a copy of the dataframe
df_copy = df.copy()

# Initialize all country columns as False
for country in all_countries:
    df_copy[f'has_{country.replace(" ", "_")}'] = False
    
# Fill in True values where appropriate
for i, row in df_copy.iterrows():
    if pd.notna(row['Countries']):
        countries_lower = row['Countries'].lower()
        for country in all_countries:
            if country.lower()+',' in countries_lower or country.lower()+';' in countries_lower or country.lower()+"\n" in countries_lower:
                df_copy.at[i, f'has_{country.replace(" ", "_")}'] = True

df_copy.head()

Unnamed: 0,Export_date,Internal_Number,TrialID,Last_Refreshed_on,Public_title,Scientific_title,Primary_sponsor,Retrospective_flag,Date_registration3,Date_registration,Source_Register,web_address,Recruitment_Status,other_records,Inclusion_agemin,Inclusion_agemax,Inclusion_gender,Date_enrollement,Target_size,Study_type,Study_design,Phase,Countries,Contact_Firstname,Contact_Lastname,Contact_Address,Contact_Email,Contact_Tel,Contact_Affiliation,Inclusion_Criteria,Exclusion_Criteria,Condition,Intervention,Primary_outcome,Source_Support,Ethics_review_status,Ethics_review_approval_date,Ethics_review_contact_name,Ethics_review_contact_address,Ethics_review_contact_phone,Ethics_review_contact_email,results_url_link,Secondary_outcome,Acronym,Secondary_ID,Secondary_Sponsor,results_yes_no,results_date_completed,results_date_posted,has_hong_kong,has_republic_of,has_chian,has_south_america,has_pakistan,has_romania,has_egypt,has_kenya,has_spain,has_croatia,has_france,has_chnia,has_lithuania,has_rwanda,has_argentina,has_portugal,has_beijing,has_america,has_latvia,has_united_states_of_america,has_north_america,has_norway,has_czechia,has_united_arab_emirates,has_israel,has_georgia,has_philippines,has_thailand,has_belgium,has_japan,has_usa,has_ukraine,has_england,has_northern_ireland,has_switzerland,has_cameroon,has_bolivia,has_colombia,has_south,has_canada,has_uganda,has_chile,has_slovakia,has_china,has_malaysia,has_australia,has_martinique,has_netherlands,has_russian_federation,has_nigeria,has_singapore,has_denmark,has_taiwan,has_luxembourg,has_cyprus,has_indonesia,has_europe,has_south_africa,has_turkey,has_serbia,has_united_kingdom,has_iran_(islamic_republic_of),has_new_zealand,has_brazil,has_germany,has_jordan,has_asia(except_japan),has_austria,has_syrian_arab_republic,has_italy,has_finland,has_saudi_arabia,has_india,has_slovenia,has_vietnam,has_united_states,has_greece,has_tanzania,has_bangladesh,has_lebanon,has_hungary,has_czech_republic,has_africa,has_wales,has_zambia,has_ireland,has_c,has_zimbabwe,has_poland,has_sweden,has_scotland,has_korea,has_ecuador,has_mexico
0,04/10/2025 15:38:13,14773158,ChiCTR2500099797\n,31 March 2025\n,Construction and Efficacy Evaluation of an Art...,Construction and Efficacy Evaluation of an Art...,The international peace maternity and child he...,No\n,20250328,2025-03-28\n,ChiCTR\n,https://www.chictr.org.cn/showproj.html?proj=2...,Recruiting\n,No\n,17\n,83\n,Female\n,2025-02-10\n,non-benign:122;benign:14068;\n,Observational study\n,Cross-sectional\n,N/A\n,China\n,Cao Wenjiao\n,,910 Hengshan Road\n,120660017@qq.com\n,+86 15801811479\n,The international peace maternity and child he...,Inclusion criteria: 1.No surgical contraindica...,Exclusion criteria: 1.Presence of concurrent m...,Uterine sarcoma and fibroids\n,non-benign:Uterine sarcoma and tumors of uncer...,risk factor;\n,Self-Funded\n,Approved\n,06/02/2025\n,jxzhangyanfei@163.com\n,Zhang YanFei\n,+86 21 64070434\n,jxzhangyanfei@163.com\n,,,,,,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,04/10/2025 15:38:13,14772949,ChiCTR2500099577\n,31 March 2025\n,Construction of a new diagnostic model for spo...,Construction of a new diagnostic model for spo...,Second Affiliated Hospital of Chongqing Medica...,Yes\n,20250326,2025-03-26\n,ChiCTR\n,https://www.chictr.org.cn/showproj.html?proj=2...,Not Recruiting\n,No\n,18\n,,Both\n,2025-04-16\n,SBP group:1000;non-SBP group:3000;\n,Observational study\n,Case-Control study\n,Retrospective study\n,China\n,Zhou Zhihang\n,,"No. 76 Linjiang Road, Yuzhong District, Chongq...",zhouzhihang@cqmu.edu.cn\n,+86 178 1518 3428\n,"department of Gastroenterology, Second Affilia...",Inclusion criteria: 1. Patients with thoracic ...,Exclusion criteria: 1.Non-cirrhotic pleural ef...,Spontaneous peritonitis;DC50.00 \n,SBP group:None;non-SBP group:nNone;\n,Routine examination of ascites;\n,Optional Topics\n,Approved\n,07/03/2025\n,,Fang Xiongying\n,+86 62888436\n,,,,,,,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,04/10/2025 15:38:13,14772873,ChiCTR2500099497\n,31 March 2025\n,The Research and Development and Translation o...,The Research and Development and Translation o...,"Shanghai Stomatological Hospital, Fudan Univer...",Yes\n,20250325,2025-03-25\n,ChiCTR\n,https://www.chictr.org.cn/showproj.html?proj=2...,Not Recruiting\n,No\n,,,Both\n,2025-04-01\n,Group A: Full-time retention using a Hawley re...,Interventional study\n,Parallel\n,N/A\n,China\n,Zhicheng Gong\n,,"356 East Beijing Road, Huangpu District, Shang...",387888827@qq.com\n,+86 13761806588\n,"Shanghai Stomatological Hospital, Fudan Univer...",Inclusion criteria: 1. The brackets have not b...,Exclusion criteria: 1.Orthodontic treatment ha...,Dental Malocclusion\n,Group A: Full-time retention using a Hawley re...,PAR Index;\n,Shanghai Municipal Health Commission's Seed Pr...,Approved\n,21/02/2025\n,huangwei523@126.com\n,Huang Yu\n,+86 21 63502165\n,huangwei523@126.com\n,,,,,,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,04/10/2025 15:38:13,14772879,ChiCTR2500099503\n,31 March 2025\n,Auxiliary diagnosis and treatment framework of...,Auxiliary diagnosis and treatment framework of...,Hainan General Hospital\n,Yes\n,20250325,2025-03-25\n,ChiCTR\n,https://www.chictr.org.cn/showproj.html?proj=2...,Not Recruiting\n,No\n,18\n,65\n,Both\n,2025-04-01\n,Observation group:1000;\n,Observational study\n,Sequential\n,N/A\n,China\n,Weiyuan Huang\n,,"No. 14, Xiuhua Road, Haikou City, Hainan Provi...",weiyuanhuang@163.com\n,+86 184 8997 8445\n,Hainan General Hospital\n,Inclusion criteria: 1. Patients with nasophary...,"Exclusion criteria: 1. Patient claustrophobia,...",Nasopharyngeal carcinoma\n,Observation group:None;\n,DWI parameters;DCE parameters;\n,"Department of Radiology, Hainan General Hospit...",Approved\n,05/03/2025\n,,Nan Chen\n,+86 187 8928 5086\n,,,,,,,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,04/10/2025 15:38:13,14772925,ChiCTR2500099551\n,31 March 2025\n,Multi-center study on the effectiveness of art...,Multi-center study on the effectiveness of art...,"Affiliated hangzhou first people's hospital, z...",Yes\n,20250325,2025-03-25\n,ChiCTR\n,https://www.chictr.org.cn/showproj.html?proj=2...,Not Recruiting\n,No\n,40\n,69\n,Female\n,2025-04-01\n,Target condition:30;Difficult condition:9970\n...,Diagnostic test\n,Diagnostic test for accuracy\n,N/A\n,China\n,Lingyun Bao\n,,"No.261 Huansha Road, Hangzhou, Zhejiang Provin...",hzs1bly@zju.edu.cn\n,+86 571 56007130\n,"Affiliated hangzhou first people's hospital, z...",Inclusion criteria: 1.10000 women aged 40-69 (...,"Exclusion criteria: 1. Pregnant, breastfeeding...",Breast cancer is one of the major malignant tu...,Gold Standard:Obtain final pathological result...,recall rate;detection rate;\n,Projects commissioned by enterprises and insti...,Approved\n,17/03/2025\n,1742208034@qq.com\n,Lu Yun\n,+86 571 56007507\n,1742208034@qq.com\n,,sensitivity;specificity;Positive predictive va...,,,,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [56]:
# Count the number of true elements for each country indicator column
print("Number of trials per country:")
country_counts = {}

for col in df_copy.columns:
    if col.startswith('has_'):
        # Sum the boolean values to get count of True values
        count = df_copy[col].sum()
        country_name = col[4:].replace('_', ' ')  # Remove 'has_' prefix and replace underscores
        country_counts[country_name] = count

# Sort by count in descending order and display
for country, count in sorted(country_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"{country}: {count}")
 
#sum of all counts  vs number of rows
print(f"Sum of all counts: {sum(country_counts.values())}")
print(f"Number of rows: {len(df_copy)}")



Number of trials per country:
china: 1033
united states: 184
india: 169
japan: 143
united kingdom: 117
korea: 105
republic of: 104
italy: 104
germany: 86
france: 77
spain: 72
turkey: 66
taiwan: 54
australia: 50
england: 43
canada: 36
netherlands: 34
thailand: 33
hong kong: 28
egypt: 23
switzerland: 22
singapore: 22
belgium: 19
brazil: 19
sweden: 19
norway: 15
denmark: 15
austria: 14
israel: 12
poland: 11
russian federation: 9
new zealand: 8
pakistan: 7
argentina: 7
portugal: 7
iran (islamic republic of): 7
america: 6
malaysia: 6
ireland: 6
greece: 5
mexico: 5
united states of america: 4
bangladesh: 4
africa: 4
wales: 4
c: 4
scotland: 4
ecuador: 4
romania: 3
czechia: 3
cyprus: 3
indonesia: 3
europe: 3
south africa: 3
jordan: 3
asia(except japan): 3
vietnam: 3
tanzania: 3
hungary: 3
lithuania: 2
latvia: 2
united arab emirates: 2
georgia: 2
philippines: 2
ukraine: 2
northern ireland: 2
bolivia: 2
uganda: 2
chile: 2
slovakia: 2
luxembourg: 2
finland: 2
saudi arabia: 2
slovenia: 2
czech rep

In [13]:
# Display all columns in the dataframe
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# Filter and display rows where has_C is True
df[df["has_C"]==True].head()

# Reset display options to default (optional)
# pd.reset_option('display.max_columns')
# pd.reset_option('display.width')

Unnamed: 0,Export_date,Internal_Number,TrialID,Last_Refreshed_on,Public_title,Scientific_title,Primary_sponsor,Retrospective_flag,Date_registration3,Date_registration,Source_Register,web_address,Recruitment_Status,other_records,Inclusion_agemin,Inclusion_agemax,Inclusion_gender,Date_enrollement,Target_size,Study_type,Study_design,Phase,Countries,Contact_Firstname,Contact_Lastname,Contact_Address,Contact_Email,Contact_Tel,Contact_Affiliation,Inclusion_Criteria,Exclusion_Criteria,Condition,Intervention,Primary_outcome,Secondary_outcome,Source_Support,Ethics_review_status,Ethics_review_approval_date,Ethics_review_contact_name,Ethics_review_contact_address,Ethics_review_contact_phone,Ethics_review_contact_email,results_url_link,Acronym,Secondary_ID,results_yes_no,results_date_completed,Secondary_Sponsor,results_date_posted,Country,has_Africa,has_America,has_Argentina,has_Asia(except_Japan),has_Australia,has_Austria,has_Bangladesh,has_Beijing,has_Belgium,has_Bolivia,has_Brazil,has_C,has_CHINA,has_Cameroon,has_Canada,has_Chian,has_Chile,has_China,has_Chnia,has_Colombia,has_Croatia,has_Cyprus,has_Czech_Republic,has_Czechia,has_Denmark,has_Ecuador,has_Egypt,has_England,has_Europe,has_Finland,has_France,has_Georgia,has_Germany,has_Greece,has_Hong_Kong,has_Hungary,has_India,has_Indonesia,has_Iran_(Islamic_Republic_of),has_Ireland,has_Israel,has_Italy,has_Japan,has_Jordan,has_Kenya,has_Korea,has_Latvia,has_Lebanon,has_Lithuania,has_Luxembourg,has_Malaysia,has_Martinique,has_Mexico,has_Netherlands,has_New_Zealand,has_Nigeria,has_North_America,has_Northern_Ireland,has_Norway,has_Pakistan,has_Philippines,has_Poland,has_Portugal,has_Republic_of,has_Romania,has_Russian_Federation,has_Rwanda,has_Saudi_Arabia,has_Scotland,has_Serbia,has_Singapore,has_Slovakia,has_Slovenia,has_South,has_South_Africa,has_South_America,has_Spain,has_Sweden,has_Switzerland,has_Syrian_Arab_Republic,has_Taiwan,has_Tanzania,has_Thailand,has_Turkey,has_USA,has_Uganda,has_Ukraine,has_United_Arab_Emirates,has_United_Kingdom,has_United_States,has_United_States_of_America,has_Vietnam,has_Wales,has_Zambia,has_Zimbabwe,has_china
0,03/19/2025 12:20:45,14746198.0,ChiCTR2500098323\n,10 March 2025\n,"A prospective, multi-center study to character...","A prospective, multi-center study to character...",First Affiliated Hospital of Sun Yat-sen Unive...,Yes\n,20250306.0,2025-03-06\n,ChiCTR\n,https://www.chictr.org.cn/showproj.html?proj=2...,Not Recruiting\n,No\n,18\n,,Both\n,2025-04-14\n,Target condition:234;Difficult condition:46\n,Diagnostic test\n,Diagnostic test for accuracy\n,N/A\n,China\n,Minhu Chen\n,,"No.58 Zhongshan Second Road, Yuexiu District, ...",chenminhu@mail.sysu.edu.cn\n,+86 138 0295 7089\n,"The First Affiliated Hospital,Sun Yat-sen Univ...",Inclusion criteria: (1) Patients Over 18 years...,Exclusion criteria: (1) Cannot undergo MRI exa...,Crohn's disease\n,Gold Standard:Masson’s trichrome staining of s...,Pathological score;Magnetic Resonance Enterogr...,Crohn's Disease Activity Index;IBD Montreal cl...,"MSD R&D(CHINA)CO.,LTD\n",Approved\n,23/12/2024\n,,Zhanyong Chen\n,+86 20 8733 8035\n,,,,,,,,,China,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,03/19/2025 12:20:45,14745527.0,NCT06864702\n,10 March 2025\n,The Construction and Effect Verification of a ...,The Construction and Effect Verification of a ...,Zhujiang Hospital\n,No\n,20250304.0,04/03/2025\n,ClinicalTrials.gov\n,https://clinicaltrials.gov/ct2/show/NCT0686470...,Recruiting\n,No\n,18 Years\n,85 Years\n,All\n,"December 20, 2023\n",220\n,Interventional\n,Allocation: N/A. Intervention model: Single Gr...,N/A\n,China\n,,,,,,,Inclusion Criteria:\r<br>\r<br> - Patients w...,,Artificial Intelligence (AI);Deep Learning;Lap...,Behavioral: Whether the patient received diagn...,F1 score\n,IoU\n,Please refer to primary and secondary sponsors...,,,,,,,,,2024-KY-152-02\n,,,,,China,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,03/19/2025 12:20:45,14745953.0,ChiCTR2500098075\n,10 March 2025\n,Diagnosis and treatment of obstructive sleep a...,Diagnosis and treatment of obstructive sleep a...,Shanghai Children's Hospital\n,Yes\n,20250303.0,2025-03-03\n,ChiCTR\n,https://www.chictr.org.cn/showproj.html?proj=2...,Not Recruiting\n,No\n,0\n,18\n,Both\n,2025-03-10\n,Target condition:1600;Difficult condition:0\n,Diagnostic test\n,Diagnostic test for accuracy\n,N/A\n,China\n,Xu Hongming\n,,"355 Luding Road, Putuo District, Shanghai, Chi...",xuhongming@188.com\n,+86 139 1708 6351\n,Shanghai Children's Hospital\n,Inclusion criteria: (1) Children aged 0-18 yea...,Exclusion criteria: (1) Children with adenoids...,Obstructive sleep apnea in children\n,Gold Standard:Mainly based on the clinical gui...,Nasopharyngoscopy image (Adenoid and tonsil gr...,,"Shanghai Jiao Tong University ""Star of Jiao To...",Approved\n,14/02/2025\n,hl@shchildren.com.cn\n,He Lei\n,+86 21 5297 6581\n,hl@shchildren.com.cn\n,,,,,,,,China,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
6,03/19/2025 12:20:45,14702355.0,ChiCTR2500097883\n,3 March 2025\n,A survey on the attitude level of nurses in te...,A survey on the attitude level of nurses in te...,"West China Hospital,Sichuan University\n",Yes\n,20250226.0,2025-02-26\n,ChiCTR\n,https://www.chictr.org.cn/showproj.html?proj=2...,Not Recruiting\n,No\n,18\n,70\n,Both\n,2025-03-01\n,Observation group:1000;\n,Observational study\n,Cross-sectional\n,N/A\n,China\n,Linli Guo\n,,"No.37 Guoxue Alley,Wuhou District,Chengdu City...",704367433@qq.com\n,+86 181 2339 3602\n,"West China Hospital,Sichuan University\n",Inclusion criteria: 1. Nursing staff who volun...,Exclusion criteria: 1. People who are unwillin...,NA\n,Observation group:NA;\n,General attitude towards artificial intelligen...,Artificial Intelligence Anxiety;\n,NA\n,Approved\n,11/10/2024\n,,Shaolin Deng\n,+86 28 8542 2654\n,,,,,,,,,England,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
7,03/19/2025 12:20:45,14744865.0,NCT06856096\n,10 March 2025\n,PET/MRI Artificial Intelligence Reconstruction...,PET/MRI Artificial Intelligence Reconstruction...,Xijing Hospital\n,No\n,20250226.0,26/02/2025\n,ClinicalTrials.gov\n,https://clinicaltrials.gov/ct2/show/NCT0685609...,Not recruiting\n,No\n,18 Years\n,75 Years\n,All\n,"February 10, 2025\n",65\n,Interventional\n,Allocation: N/A. Intervention model: Sequentia...,N/A\n,China\n,,,,,,,Inclusion Criteria:\r<br>\r<br> - \r<br>\r<b...,,Parkinson&#39;s Syndrome\n,Diagnostic Test: PET/MRI\n,Signal to Noise Ratio;Contrast to Noise Ratio\...,,Please refer to primary and secondary sponsors...,,,,,,,,,KY20252044\n,,,,,United Kingdom,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [8]:
from collections import Counter
# Get all characters from Countries column and count frequencies
chars = ''.join(df.Countries.fillna(''))


char_counts = Counter(chars)
char_counts

Counter({' ': 10764,
         'a': 3108,
         'n': 2661,
         '\n': 2526,
         'i': 2296,
         'e': 1417,
         'h': 1147,
         't': 1111,
         'C': 1097,
         'd': 926,
         'r': 699,
         'l': 573,
         'o': 546,
         ';': 446,
         'p': 414,
         's': 400,
         'S': 388,
         'U': 347,
         'g': 337,
         'y': 336,
         'u': 333,
         'I': 331,
         'm': 300,
         'K': 268,
         'c': 265,
         'T': 160,
         'J': 152,
         'R': 140,
         'w': 139,
         'b': 137,
         'f': 125,
         ',': 118,
         'G': 107,
         'A': 107,
         'F': 101,
         'k': 96,
         'E': 78,
         'N': 76,
         'z': 56,
         'B': 53,
         'P': 36,
         'H': 36,
         'D': 16,
         'M': 14,
         'x': 11,
         'Z': 11,
         '(': 10,
         ')': 10,
         'L': 8,
         'v': 8,
         'V': 5,
         'W': 4,
         'q': 2,
     

In [17]:
# all the options for Target_size that contain : or ;
df[df.Target_size.str.contains(r'[;:]')].Target_size.unique()

ValueError: Cannot mask with non-boolean array containing NA / NaN values

In [69]:
#give me a simple test dataframe
df_testing = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
df_testing.sample(frac=1)

Unnamed: 0,A,B
0,1,4
2,3,6
1,2,5


In [11]:
# Find all parenthetical text in the Countries column

[chars[i:j+1] for i, j in zip([i for i, c in enumerate(chars) if c == '('], 
                             [i for i, c in enumerate(chars) if c == ')'])]

['(Islamic Republic of)',
 '(Islamic Republic of)',
 '(Islamic Republic of)',
 '(Islamic Republic of)',
 '(Islamic Republic of)',
 '(except Japan)',
 '(except Japan)',
 '(Islamic Republic of)',
 '(Islamic Republic of)',
 '(except Japan)']

In [5]:
import pandas as pd
from pathlib import Path

def find_unused_character(file_path, start=32, end=127):
    """
    Find a character that's not present in the CSV file.
    
    Args:
        file_path: Path to the CSV file
        start: Starting ASCII value (default 32 = space)
        end: Ending ASCII value (default 127 = last printable ASCII)
    
    Returns:
        A character that's not present in the file, or None if all characters are used
    """
    # Read the entire file as text
    text = Path(file_path).read_text(encoding='utf-8')

    returns = {chr(i) for i in range(start, end) if chr(i) not in text}
    
    for i in returns:
        print(f"Found unused character: '{i}' (ASCII: {ord(i)})")
    
    return returns

# Find an unused character in the CSV
csv_file = 'IctrpResults.csv'
if Path(csv_file).exists():
    unused_char = find_unused_character(csv_file)
else:
    print(f"File not found: {csv_file}")

Found unused character: '$' (ASCII: 36)


In [21]:
# Print all target sizes containing exactly none of : or ; but containing a-z

target_sizes_with_letters = df[
    (~df.Target_size.str.contains(r'[;:]', na=False)) & 
    (df.Target_size.str.contains(r'[a-z]', na=False))
].Target_size.unique()

print("Target sizes containing letters but no ':' or ';':")
for size in target_sizes_with_letters:
    print(size)

Target sizes containing letters but no ':' or ';':


In [41]:
# Print all target sizes containing exactly none of : or ; but containing a-z

target_sizes_with_letters = df[
    (df.Target_size.str.contains(r'[;:]', na=False)) 
]

# print("Target sizes containing ':' or ';':")
# for size in target_sizes_with_letters:
#     print(size)

target_sizes_with_letters.total_size = target_sizes_with_letters.Target_size.apply(convert_to_int).astype('Int64')
target_sizes_with_letters[["total_size", "Target_size"]].head()

NameError: name 'string' is not defined

In [15]:

#check if all target sizes without ; or : can be converted to numbers
no_colons = df[~df.Target_size.str.contains(r'[;:]', na=False)]
yes_colons = df[df.Target_size.str.contains(r'[;:]', na=False)]
to_int = no_colons.Target_size

# Try converting each value to int and print any that fail
for val in to_int:
    try:
        int(val)
    except (ValueError, TypeError):
        if pd.notna(val): # Only print if not NaN
            print(f"Could not convert to integer: {val}")

no_colons.Target_size.head()
# no_colons.int_Target = pd.to_numeric(no_colons.Target_size, errors='coerce')
# no_colons[["int_Target", "Target_size"]].head()

def convert_to_int(val:str)->int:
    if not val: return None

    if ';' not in val:
        ret = int(val[:-1])
        return ret
    else:
        groups = val.split(';')
        accumulator = 0
        for group in groups:
            try:
                accumulator += int(group.split(':')[-1])
            except (ValueError, TypeError):
                pass
        return accumulator
    
yes_colons["total_size"] = yes_colons.Target_size.apply(convert_to_int).astype('Int64').head()

yes_colons[["Target_size", "total_size"]].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yes_colons["total_size"] = yes_colons.Target_size.apply(convert_to_int).astype('Int64').head()


Unnamed: 0,Target_size,total_size
0,Target condition:234;Difficult condition:46\n,280
2,Target condition:1600;Difficult condition:0\n,1600
6,Observation group:1000;\n,1000
11,AI Establishment:500;External Validation:100;\...,600
12,Thyroid function group:100;Normal control grou...,200


In [5]:
df[["Public_title"]].to_csv('public_titles.csv')