In [21]:
import os
import xml.etree.ElementTree as ET
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from docx import Document
from nltk.tokenize import sent_tokenize
from nltk.probability import FreqDist
from heapq import nlargest
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from lxml import etree
from docx.shared import Pt
from docx.enum.style import WD_STYLE_TYPE
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.text import WD_BREAK

[nltk_data] Downloading package punkt to D:\anaconda3\lib\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     D:\anaconda3\lib\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
# One row per output field, in the order the fields appear in the result
# dict (save_data_to_word iterates the dict, so this is also report order).
# Each row is (result key, element path relative to the root, kind) where
# kind is:
#   'text'    - store the stripped element text (last matching element wins)
#   'summary' - store extractive_summarization() of the stripped text
#   'list'    - append the stripped text of every matching element
_FIELD_SPECS = (
    ('brief_summary', './brief_summary/textblock', 'text'),
    ('detailed_description', './detailed_description/textblock', 'summary'),
    ('brief_title', './brief_title', 'text'),
    ('conditions', './condition_browse/mesh_term', 'list'),
    ('eligibility_criteria', './eligibility/criteria/textblock', 'summary'),
    ('eligibility_gender', './eligibility/gender', 'text'),
    ('eligibility_maximum_age', './eligibility/maximum_age', 'text'),
    ('eligibility_minimum_age', './eligibility/minimum_age', 'text'),
    ('id_info_nct_id', './id_info/nct_id', 'text'),
    ('id_info_org_study_id', './id_info/org_study_id', 'text'),
    ('location_countries', './location_countries/country', 'list'),
    ('location_contact_email', './location/contact/email', 'text'),
    ('location_contact_last_name', './location/contact/last_name', 'text'),
    ('primary_outcome_description', './primary_outcome/description', 'text'),
    ('primary_outcome_measure', './primary_outcome/measure', 'text'),
    ('primary_outcome_time_frame', './primary_outcome/time_frame', 'text'),
    ('provided_document_url',
     './provided_document_section/provided_document/document_url', 'text'),
    ('secondary_outcome_description', './secondary_outcome/description', 'text'),
    ('secondary_outcome_measure', './secondary_outcome/measure', 'text'),
    ('secondary_outcome_time_frame', './secondary_outcome/time_frame', 'text'),
)


def extract_data(xml_file):
    """Parse one XML file and collect the report fields into a dict.

    Args:
        xml_file: Path (or file-like object) accepted by ``etree.parse``.

    Returns:
        dict keyed by the names in ``_FIELD_SPECS``. Scalar fields default to
        the placeholder string ``'NA'``; ``conditions`` and
        ``location_countries`` are lists (empty when nothing matched).
        ``detailed_description`` and ``eligibility_criteria`` hold the output
        of ``extractive_summarization`` rather than the raw text.

    Raises:
        Whatever ``etree.parse`` raises for an unreadable or malformed file.
    """
    root = etree.parse(xml_file).getroot()

    data = {
        key: ([] if kind == 'list' else 'NA')
        for key, _path, kind in _FIELD_SPECS
    }

    for key, path, kind in _FIELD_SPECS:
        for elem in root.findall(path):
            # elem.text is None for an empty element such as <gender/>;
            # calling .strip() on it directly would raise AttributeError.
            text = (elem.text or '').strip()
            if not text:
                # Treat a blank element like a missing one so the default
                # ('NA' or an empty list) is left in place.
                continue

            if kind == 'list':
                data[key].append(text)
            elif kind == 'summary':
                data[key] = extractive_summarization(text)
            else:
                data[key] = text

    return data

In [23]:
"""
"""
def extractive_summarization(text, num_sentences=1):
    """Pick the highest-scoring sentence(s) of ``text`` and return them joined.

    Word frequencies are counted over the lower-cased text, ignoring English
    stop words and tokens that are not purely alphanumeric. A sentence's score
    is the sum of the frequencies of its (lower-cased) tokens that were
    counted. The ``num_sentences`` best sentences are returned in their
    original order, separated by single spaces.

    Args:
        text: The text to summarize.
        num_sentences: How many sentences to keep (default 1).

    Returns:
        The selected sentences as one string.
    """
    sentence_list = sent_tokenize(text)
    lowered_tokens = word_tokenize(text.lower())
    english_stopwords = set(stopwords.words('english'))

    word_counts = FreqDist(
        token
        for token in lowered_tokens
        if token not in english_stopwords and token.isalnum()
    )

    # scores[k] is the score of sentence_list[k].
    scores = []
    for sentence in sentence_list:
        scores.append(sum(
            word_counts[token]
            for token in word_tokenize(sentence.lower())
            if token in word_counts
        ))

    best_positions = nlargest(
        num_sentences, range(len(sentence_list)), key=scores.__getitem__
    )
    # Restore document order before joining.
    return ' '.join(sentence_list[pos] for pos in sorted(best_positions))

In [24]:
def print_extracted_data(data):
    """Print each extracted field on its own line as ``<label> <value>``.

    Args:
        data: Mapping holding every key listed in the layout below. The
            ``conditions`` and ``location_countries`` entries must be
            iterables of strings; they are printed joined with ``', '``.

    Raises:
        KeyError: If ``data`` lacks one of the expected keys.
    """
    joined_keys = ('conditions', 'location_countries')
    layout = (
        ("Brief Summary:", 'brief_summary'),
        ("Detailed Description:", 'detailed_description'),
        ("Brief Title:", 'brief_title'),
        ("Conditions:", 'conditions'),
        ("Eligibility Criteria:", 'eligibility_criteria'),
        ("Eligibility Gender:", 'eligibility_gender'),
        ("Eligibility Maximum Age:", 'eligibility_maximum_age'),
        ("Eligibility Minimum Age:", 'eligibility_minimum_age'),
        ("ID Info NCT ID:", 'id_info_nct_id'),
        ("ID Info Org Study ID:", 'id_info_org_study_id'),
        ("Location Countries:", 'location_countries'),
        ("Location Contact Email:", 'location_contact_email'),
        ("Location Contact Last Name:", 'location_contact_last_name'),
        ("Primary Outcome Description:", 'primary_outcome_description'),
        ("Primary Outcome Measure:", 'primary_outcome_measure'),
        ("Primary Outcome Time Frame:", 'primary_outcome_time_frame'),
        ("Provided Document URL:", 'provided_document_url'),
        ("Secondary Outcome Description:", 'secondary_outcome_description'),
        ("Secondary Outcome Measure:", 'secondary_outcome_measure'),
        ("Secondary Outcome Time Frame:", 'secondary_outcome_time_frame'),
    )

    for label, key in layout:
        shown = data[key]
        if key in joined_keys:
            shown = ', '.join(shown)
        print(label, shown)

In [36]:
def save_data_to_word(doc, data, file_name):
    """Append one record to ``doc``: a title heading plus one section per field.

    Args:
        doc: Document-like object providing ``add_heading(text, level=...)``
            and ``add_paragraph(text)``.
        data: Mapping of field name to value. Values are strings, or
            lists/tuples of items (e.g. ``conditions``).
        file_name: Text used for the level-1 heading of this record.

    Each key becomes a level-2 heading (underscores replaced by spaces,
    title-cased) followed by a paragraph holding the value.
    """
    doc.add_heading(file_name, level=1)
    for key, value in data.items():
        doc.add_heading(key.replace('_', ' ').title(), level=2)
        if isinstance(value, (list, tuple)):
            # add_paragraph expects text, so render list fields the same way
            # print_extracted_data does; an empty list gets the 'NA'
            # placeholder used for every other missing field.
            value = ', '.join(str(item) for item in value) or 'NA'
        doc.add_paragraph(value)

In [42]:
def set_style(document, style_name, font_name, font_size):
    """Set the typeface and point size of one named style in ``document``.

    Args:
        document: Object whose ``styles`` mapping is indexed by style name.
        style_name: Name of the style to modify, e.g. ``'Normal'``.
        font_name: Typeface name to assign to the style's font.
        font_size: Size in points; wrapped in ``Pt`` before assignment.
    """
    target_font = document.styles[style_name].font
    target_font.name = font_name
    target_font.size = Pt(font_size)

In [50]:
def remove_empty_lines_and_newlines(doc):
    """Build a new document containing only the non-blank paragraphs of ``doc``.

    Every paragraph reachable through ``doc.paragraphs`` whose text is not
    blank is copied as plain text, with each run of whitespace (including
    line breaks) collapsed to a single space, and is given the source
    paragraph's style. The new document's 'Normal' style is set to
    Calibri 12 pt. Only the paragraph text and style are carried over.

    Args:
        doc: Source document exposing ``paragraphs``.

    Returns:
        The newly created document; ``doc`` itself is not modified.
    """
    compact_doc = Document()
    set_style(compact_doc, 'Normal', 'Calibri', 12)

    for source_paragraph in doc.paragraphs:
        # split() without a separator discards leading/trailing whitespace
        # and splits on whitespace runs, so a blank paragraph yields ''.
        collapsed = ' '.join(source_paragraph.text.split())
        if not collapsed:
            continue

        copied = compact_doc.add_paragraph(collapsed)
        copied.style = source_paragraph.style

    return compact_doc

In [51]:
# Build one Word report per NCT<yyyy>xxxx folder found in the working
# directory. Each report gathers every NCT<yyyy><xxxx>.xml file of that
# folder, in ascending numeric order, and is saved as <folder>.docx.
for yyyy in range(1000):
    folder_name = f'NCT{yyyy:04d}xxxx'
    # isdir (not exists): a plain file with this name cannot hold records.
    if not os.path.isdir(folder_name):
        continue

    doc = Document()
    set_style(doc, 'Normal', 'Calibri', 12)

    # List the folder once and test membership, instead of issuing 10,000
    # os.path.exists() probes per folder. Names must match exactly.
    present_names = set(os.listdir(folder_name))
    for xxxx in range(10000):
        file_name = f'NCT{yyyy:04d}{xxxx:04d}.xml'
        if file_name not in present_names:
            continue

        file_path = os.path.join(folder_name, file_name)
        data = extract_data(file_path)
        save_data_to_word(doc, data, file_name)

    # Drop blank paragraphs and collapse whitespace before writing to disk.
    doc = remove_empty_lines_and_newlines(doc)
    output_file_name = f'{folder_name}.docx'
    doc.save(output_file_name)
    print(f'Saved data for folder {folder_name} to {output_file_name}')

Saved data for folder NCT0000xxxx to NCT0000xxxx.docx
Saved data for folder NCT0001xxxx to NCT0001xxxx.docx
