In [None]:
import pandas as pd
import numpy as np
import re
import string
import csv
import os
import xml.etree.ElementTree as ET

In [None]:
# Location of the XML files (placeholder value; replace with the real folder path).
directory = 'XML FILES'

In [None]:
# Path of the text file that lists the unique IDs, one per line
# (placeholder value; replace with the real file path).
file_path = 'TEXT FILE OF THE UNIQUE IDs for the PHASE 2 AND 3 NS RCTS'

In [None]:
# Read the unique IDs, one per line, with surrounding whitespace removed.
# Blank lines are skipped so they do not turn into empty IDs (and therefore
# empty rows) further down the notebook.
with open(file_path, 'r') as id_file:
    unique_ids = [
        stripped
        for stripped in (line.strip() for line in id_file)
        if stripped
    ]

In [None]:
# One-column frame holding the IDs that were read from the text file.
id_add = pd.DataFrame(data={'Unique_ID': unique_ids})

In [None]:
 # Empty template frame that fixes the names and order of the output columns.
 characteristics_df = pd.DataFrame(
     columns=[
         'Unique_ID',
         'Title',
         'Phase',
         'Objective',
         'End_date',
         'Sample_size',
         '1ry_endpoint',
         'Treatment',
         'LT_followup',
     ]
 )

In [None]:
# Build the working table: one row per ID from id_add, laid out with the
# template's columns; every column other than Unique_ID starts out missing.
# Rebuilding from id_add (instead of concatenating onto characteristics_df
# itself) keeps this cell idempotent: re-running it no longer appends a second
# copy of every ID. astype(object) keeps the columns able to hold the strings
# that are written into them later.
characteristics_df = id_add.reindex(columns=characteristics_df.columns).astype(object)

In [None]:
# Preview the first five rows of the freshly built table.
print(characteristics_df.head(n=5))

In [None]:
# Remove the '.xml' extension from each ID. The pattern is anchored to the end
# of the string so that only a trailing extension is stripped; a '.xml' that
# happens to occur in the middle of an ID is left intact.
characteristics_df['Unique_ID'] = characteristics_df['Unique_ID'].str.replace(
    r'\.xml$', '', regex=True
)

In [None]:
# Preview the first five rows after the '.xml' extension has been removed.
print(characteristics_df.head(n=5))

In [None]:
# Extract study characteristics from the per-study XML files.
#
# For every Unique_ID in characteristics_df, '<directory>/<Unique_ID>.xml' is
# parsed and the Title, Phase, Objective, End_date, Sample_size, 1ry_endpoint
# and LT_followup cells of that row are filled in. IDs without a matching file
# are skipped silently; files that fail to parse are reported and skipped.
# NOTE: the 'Treatment' column is not populated by this cell.

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# (DataFrame column, XML tag, placeholder written when the tag has no <value> child)
_VALUE_FIELDS = (
    ('Objective', 'Principal_Research_Question', 'Objective not found'),
    ('End_date', 'Research_End_Date', 'End_date not found'),
    ('Sample_size', 'Total_International_Sample_Size_Number', 'Sample Size not found'),
    ('1ry_endpoint', 'Primary_Outcome_Measure', 'Primary endpoint not found'),
    # Not quite right either but there isn't an exact 'box'
    ('LT_followup', 'Participants_Study_Duration', 'LT follow-up not found'),
)


def _phase_from_title(title):
    """Return the phase token that follows the word 'phase' in ``title``.

    The title is lower-cased and split on 'phase'; the text after the first
    occurrence is split on single spaces and the second piece (the first word
    when a space directly follows 'phase') is returned with punctuation
    removed. Returns 0 when the title does not contain 'phase', or when there
    is no second piece to read (previously this case raised IndexError).
    """
    parts = title.lower().split('phase')
    if len(parts) < 2:
        return 0
    pieces = parts[1].split(' ')
    if len(pieces) < 2:
        return 0
    return pieces[1].translate(_PUNCTUATION_TABLE)


def _randomised_title(root):
    """Return the first Study_Full_Title value mentioning randomised/randomized.

    Elements without a <value> child are skipped instead of raising
    AttributeError. Returns None when no such title exists.
    """
    for elem in root.iter('Study_Full_Title'):
        title = elem.findtext('./value')
        if title is None:
            continue
        lowered = title.lower()
        if 'randomised' in lowered or 'randomized' in lowered:
            return title
    return None


for index, base_id in characteristics_df['Unique_ID'].items():
    unique_id = base_id + '.xml'
    # A dedicated name is used so the global file_path (the ID list) is not
    # overwritten, and the folder comes from `directory`, the name that is
    # actually defined earlier in the notebook.
    xml_path = os.path.join(directory, unique_id)

    if not os.path.isfile(xml_path):
        continue

    try:
        root = ET.parse(xml_path).getroot()
    except ET.ParseError as e:
        print(f"Error parsing {xml_path}: {e}")
        continue

    # Title and Phase (the phase is read from the title text).
    title = _randomised_title(root)
    if title is None:
        print(f"No randomised title found for {unique_id}")
    else:
        characteristics_df.at[index, 'Title'] = title
        characteristics_df.at[index, 'Phase'] = _phase_from_title(title)

    # Remaining fields: take the first element with the given tag and store
    # the text of its <value> child, or the placeholder when that child is
    # missing. When the tag does not occur at all the cell is left untouched.
    for column, tag, placeholder in _VALUE_FIELDS:
        elem = next(root.iter(tag), None)
        if elem is None:
            continue
        value_elem = elem.find('./value')
        characteristics_df.at[index, column] = (
            value_elem.text if value_elem is not None else placeholder
        )

In [None]:
# Preview the first five rows once the XML fields have been extracted.
print(characteristics_df.head(n=5))

In [None]:
# Write the table as CSV text, without the index column.
# NOTE(review): the output name has no '.csv' extension; confirm this is intended.
characteristics_df.to_csv(path_or_buf='ns_hra', index=False)