In [56]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

# Extract nodule IDs and malignancy scores from a series of XML files

In [None]:
def process_xml(xml_path):
    """Collect nodule IDs and malignancy scores from one annotation XML file.

    Parameters
    ----------
    xml_path : str or file object
        Anything accepted by ``xml.etree.ElementTree.parse``.

    Returns
    -------
    list of dict
        One dict per ``readingSession`` element that contains at least one
        ``unblindedReadNodule`` with both a ``noduleID`` and a ``malignancy``
        element. Each dict has the keys:

        - ``'SeriesInstanceUid'``: text of the file's first
          ``SeriesInstanceUid`` element, or ``None`` if the file has none.
        - ``'Nodule ID'``: list of nodule ID texts.
        - ``'Malignancy'``: list of malignancy texts, parallel to ``'Nodule ID'``.
    """
    ns = '{http://www.nih.gov}'
    root = ET.parse(xml_path).getroot()

    # The series UID is a per-file value, so look it up once instead of once
    # per reading session. A file without the element yields None rather than
    # an AttributeError on ``.text``.
    uid_element = root.find('.//' + ns + 'SeriesInstanceUid')
    series_instance_uid = uid_element.text if uid_element is not None else None

    data_list = []

    for reading_session in root.findall('.//' + ns + 'readingSession'):
        nodule_ids = []
        malignancy_scores = []

        for nodule_element in reading_session.findall('.//' + ns + 'unblindedReadNodule'):
            nodule_id_element = nodule_element.find('.//' + ns + 'noduleID')
            malignancy_element = nodule_element.find('.//' + ns + 'malignancy')

            # Nodules lacking either element are skipped so the two lists
            # stay aligned index-for-index.
            if nodule_id_element is None or malignancy_element is None:
                continue
            nodule_ids.append(nodule_id_element.text)
            malignancy_scores.append(malignancy_element.text)

        # Sessions without any scored nodule contribute nothing.
        if nodule_ids:
            data_list.append({
                'SeriesInstanceUid': series_instance_uid,
                'Nodule ID': nodule_ids,
                'Malignancy': malignancy_scores
            })

    return data_list

# Root folder whose immediate sub-folders hold the XML annotation files.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running on another machine.
input_folder = 'E:/4-1/BME 400/lung/LIDC data/LIDC-XML-only/tcia-lidc-xml'
output_excel = 'malignancy_data_with_uid.xlsx'

# One record per reading session, gathered from every XML file found one
# level below ``input_folder``. Directory listings are sorted so the row order
# of the output is reproducible from run to run.
all_data_list = []
for folder_name in sorted(os.listdir(input_folder)):
    folder_path = os.path.join(input_folder, folder_name)
    if not os.path.isdir(folder_path):
        continue
    for xml_file in sorted(os.listdir(folder_path)):
        # Case-insensitive so files named ``*.XML`` are not silently skipped.
        if xml_file.lower().endswith('.xml'):
            xml_path = os.path.join(folder_path, xml_file)
            all_data_list.extend(process_xml(xml_path))

# Flatten to one row per nodule. Building the frame once from a flat list
# avoids creating a DataFrame per session, and the explicit column list keeps
# the result well-formed (empty, but with headers) when nothing was found;
# ``pd.concat`` on an empty list would raise instead.
nodule_rows = [
    (session['SeriesInstanceUid'], nodule_id, malignancy)
    for session in all_data_list
    for nodule_id, malignancy in zip(session['Nodule ID'], session['Malignancy'])
]
final_df = pd.DataFrame(
    nodule_rows,
    columns=['SeriesInstanceUid', 'Nodule ID', 'Malignancy'],
)

# Save DataFrame to an Excel file
final_df.to_excel(output_excel, index=False)

# Build the final LIDC malignancy CSV

In [8]:
# Per-nodule table; the matching step reads its 'seriesuid' column and several
# columns by position.
csv1_df = pd.read_csv('merged_data_with_seriesuid.csv')

# Table of (SeriesInstanceUid, Nodule ID, Malignancy) rows. The matching step
# compares 'Nodule ID' against string values, so both identifier columns are
# read as strings: without this, pandas would infer an all-numeric ID column
# as integers and no string comparison would ever match.
csv2_df = pd.read_excel(
    'malignancy_data_with_uid.xlsx',
    dtype={'SeriesInstanceUid': str, 'Nodule ID': str},
)


In [9]:
def _id_key(value):
    """Return a canonical string for a nodule ID, or None when it is missing.

    Values that pandas inferred as integral floats (e.g. ``5.0``) are rendered
    as ``'5'`` so they compare equal to the same ID stored as text or as an
    integer. Surrounding whitespace is ignored.
    """
    if pd.isna(value):
        return None
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value).strip()


# Index the malignancy table once by (series UID, nodule ID) instead of
# filtering the whole frame for every radiologist of every row.
# ``setdefault`` keeps the FIRST row for a duplicated key.
malignancy_lookup = {}
for uid, nodule_id, malignancy in zip(csv2_df['SeriesInstanceUid'],
                                      csv2_df['Nodule ID'],
                                      csv2_df['Malignancy']):
    key = _id_key(nodule_id)
    if key is not None:
        malignancy_lookup.setdefault((uid, key), malignancy)

# Positional layout of csv1_df as used below — presumably column 0 is the
# numeric subject number, columns 4-7 are diameter / x / y / slice, and
# columns 9-12 hold one nodule ID per radiologist; verify against the CSV.
seriesuid_col = csv1_df.columns.get_loc('seriesuid')

csv4_data = []
for i in range(len(csv1_df)):
    # Positional access throughout, so the loop does not depend on csv1_df
    # having a default 0..n-1 index.
    seriesID1 = csv1_df.iloc[i, seriesuid_col]
    nodule_id1 = csv1_df.iloc[i, 9:13].tolist()

    subject_id = "LIDC-IDRI-{:04d}".format(csv1_df.iloc[i, 0])  # Format subject ID
    csv4_row = [subject_id, seriesID1]
    # Cells are read one at a time so each value keeps its own column dtype.
    csv4_row.extend(csv1_df.iloc[i, col] for col in range(4, 8))

    for id1 in nodule_id1:
        key = _id_key(id1)
        # A missing ID, or one with no entry in the malignancy table,
        # is reported as 'N/A'.
        if key is None:
            csv4_row.append('N/A')
        else:
            csv4_row.append(malignancy_lookup.get((seriesID1, key), 'N/A'))

    csv4_data.append(csv4_row)


csv4_df = pd.DataFrame(csv4_data, columns=['Subject ID','Series Uid','diameter','x loc','y loc','slice no','Radiologist1', 'Radiologist2', 'Radiologist3', 'Radiologist4'])

# Save the csv4 DataFrame to a new CSV file
csv4_df.to_csv('LIDC_malignancy.csv', index=False)
            
    