# Setup

In [1]:
import sys

IN_COLAB = 'google.colab' in sys.modules
RUN_TRAINING_CELLS = IN_COLAB

EXPERIMENT_NAME = 'MedicalSummary/'
DRIVE_FOLDER_LOCATION = '/content/drive/My Drive/VivekaHackathon2024/' + EXPERIMENT_NAME

In [2]:
# Mounting google drive
if IN_COLAB:
    from google.colab import drive

    drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Environment Setup
## Setting up Google Drive as working directory and installing Packages

In [3]:
# Using my own Google Drive during the experiment to save all checkpoints and training logs.

if IN_COLAB:
    # Adapted from:  https://robertbrucecarter.com/writing/2020/06/setting-your-working-directory-to-google-drive-in-a-colab-notebook/
    import os

    def create_and_set_working_directory(path: str):
        # check if your project folder exists. if not, it will be created.
        if os.path.isdir(path) == False:
            os.makedirs(path)
            print(path + ' did not exist but was created.')

        # change the OS to use your project folder as the working directory
        os.chdir(path)

        print('Working directory changed to: \n' + path)

    create_and_set_working_directory(DRIVE_FOLDER_LOCATION)
    !pwd

Working directory changed to: 
/content/drive/My Drive/VivekaHackathon2024/MedicalSummary/
/content/drive/My Drive/VivekaHackathon2024/MedicalSummary


# Parsing the XML files to create suitable dataset

In [11]:
import xml.etree.ElementTree as ET

# This is an example function to extract relevant information from the PubMed Subset XML
def parse_xml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Extracting relevant information based on the provided XML structure
    title_element = root.find('.//article-title')
    abstract_element = root.find('.//abstract')

    title = title_element.text if title_element is not None else None
    # Join all paragraphs in the abstract
    abstract = ' '.join([p.text for p in abstract_element.findall('p')]) if abstract_element is not None else None

    # Return as dictionary
    return {
        'title': title,
        'abstract': abstract
    }

# Example usage
file_path = '/content/drive/MyDrive/VivekaHackathon2024/PMC000xxxxxx/PMC176545.xml'
data = parse_xml(file_path)
print(data)


{'title': 'The Transcriptome of the Intraerythrocytic Developmental Cycle of ', 'abstract': '\n'}


# Full Script to parse all the XML files from the PubMed Subset


In [None]:
import xml.etree.ElementTree as ET
import os
import glob
import pandas as pd

def parse_xml(file_path):
    try:
        tree = ET.parse(file_path)
        root = tree.getroot()
    except ET.ParseError as e:
        print(f"Error parsing {file_path}: {e}")
        return None

    # Extracting relevant information based on the provided XML structure
    title_element = root.find('.//article-title')
    abstract_element = root.find('.//abstract')
    body_elements = root.findall('.//body//p')

    title = title_element.text if title_element is not None else None

    if abstract_element is not None:
        abstract_paragraphs = [p.text for p in abstract_element.findall('p') if p.text is not None]
        abstract = ' '.join(abstract_paragraphs)
    else:
        abstract = None

    if body_elements:
        body_paragraphs = [p.text for p in body_elements if p.text is not None]
        body = ' '.join(body_paragraphs)
    else:
        body = None

    # Check if title and abstract are found
    if not title or not abstract:
        print(f"Missing title or abstract in file: {file_path}")

    # Return as dictionary
    return {
        'title': title,
        'abstract': abstract,
        'body': body
    }

def parse_all_xmls(directory):
    all_data = []
    for file_path in glob.glob(os.path.join(directory, '*.xml')):
        data = parse_xml(file_path)
        if data:
            all_data.append(data)
    return all_data

# Directory containing XML files
directory = '/content/drive/MyDrive/VivekaHackathon2024/PMC000xxxxxx'
all_data = parse_all_xmls(directory)

# Convert to DataFrame and save
df = pd.DataFrame(all_data)
df.to_csv('/content/drive/MyDrive/VivekaHackathon2024/medical_summaries.csv', index=False)

print(df.head())


# Code for replacing complex terms into simple terms in the summaries by using Consumer Health Vocabulary (CHV) dataset.


In [17]:
import pandas as pd

# Load the CHV file
chv_path = '/content/drive/MyDrive/VivekaHackathon2024/CHV-amia14-data.tsv'
chv_df = pd.read_csv(chv_path, sep='\t', header=0)

# Create a dictionary for quick lookup
chv_dict = pd.Series(chv_df['CONSUMER'].values, index=chv_df['PROFESSIONAL']).to_dict()

# Display some entries to ensure it is loaded correctly
print(list(chv_dict.items())[:10])


[('nausea', 'Morning sickness'), ('eidetic memory', 'photographic memory'), ('aliment', 'Nutrition'), ('Lynch syndrome', 'hereditary nonpolyposis colorectal cancer (HNPCC)'), ('xerophthalmia', 'Keratoconjunctivitis sicca (KCS)'), ('Guinea pepper', 'spice'), ('vision training', 'Vision therapy'), ('primary ovarian insufficiency', 'Premature Ovarian Failure'), ('alligator pepper', 'spice'), ('GM2 gangliosidosis', 'Tay–Sachs disease')]


# Replacing Professional Terms with Consumer Terms in Abstract/Summaries

In [34]:
import pandas as pd
import re

# Load the CHV file
chv_path = '/content/drive/MyDrive/VivekaHackathon2024/CHV-amia14-data.tsv'
chv_df = pd.read_csv(chv_path, sep='\t', header=0)

# Create a dictionary for quick lookup
chv_dict = pd.Series(chv_df['CONSUMER'].values, index=chv_df['PROFESSIONAL']).to_dict()

def simplify_text(text, chv_dict):
    # Lowercase the text
    # text = text.lower()
    # Tokenize the text into words
    words = text.split()
    # Replace each word if it exists in the CHV dictionary
    simplified_words = [chv_dict.get(word.lower(), word) for word in words]
    # Join the words back into a single string
    simplified_text = ' '.join(simplified_words)
    return simplified_text

# Load the DataFrame with titles and abstracts
df = pd.read_csv('/content/drive/MyDrive/VivekaHackathon2024/medical_summaries.csv')

# Apply the simplification to each abstract
df['simplified_abstract'] = df['abstract'].apply(lambda x: simplify_text(str(x), chv_dict) if pd.notnull(x) else x)

# Count the number of abstracts that were changed
num_changed = df[df['abstract'] != df['simplified_abstract']].shape[0]

# Save the updated DataFrame to a new CSV file
df.to_csv('/content/drive/MyDrive/VivekaHackathon2024/medical_summaries_simplified.csv', index=False)

print(f"Number of abstracts changed: {num_changed}")
print(df[['abstract', 'simplified_abstract']].head())


Number of abstracts changed: 2454
  abstract simplified_abstract
0      NaN                 NaN
1      NaN                 NaN
2      NaN                 NaN
3      NaN                 NaN
4      NaN                 NaN


In [35]:
import pandas as pd

# Assuming df is your DataFrame containing 'abstract' and 'simplified_abstract' columns
# Load or create df as needed

# Drop rows where both 'abstract' and 'simplified_abstract' are NaN
df_clean = df.dropna(subset=['abstract', 'simplified_abstract'], how='all')

# Reset the index of the cleaned DataFrame
df_clean.reset_index(drop=True, inplace=True)

# Save the cleaned DataFrame to a new CSV file
df_clean.to_csv('/content/drive/MyDrive/VivekaHackathon2024/medical_summaries_clean.csv', index=False)

# Print the cleaned DataFrame
print(df_clean[['abstract', 'simplified_abstract']].head())


                                            abstract  \
0  Asthma is a common disease and appears to be i...   
1  While many lessons have been learned from the ...   
2  The use of spatially referenced data in cancer...   
3  There has long been a recognition that place m...   
4  EB-GIS4HEALTH UK aims at building a UK-oriente...   

                                 simplified_abstract  
0  Asthma is a common disease and appears to be i...  
1  While many lessons have been learned from the ...  
2  The use of spatially referenced data in cancer...  
3  There has long been a recognition that place m...  
4  EB-GIS4HEALTH UK aims at building a UK-oriente...  


In [36]:
import pandas as pd

# Assuming df_clean is your cleaned DataFrame
df_clean = pd.read_csv('/content/drive/MyDrive/VivekaHackathon2024/medical_summaries_clean.csv')  # Load the cleaned DataFrame

# Count the number of rows in the cleaned DataFrame
num_rows = df_clean.shape[0]

print(f"Number of rows with valid content: {num_rows}")


Number of rows with valid content: 757


# Preparation for Training

In [37]:
import pandas as pd

# Load the DataFrame (if not already loaded)
df = pd.read_csv('/content/drive/MyDrive/VivekaHackathon2024/medical_summaries_clean.csv')

# Define the input and target columns for T5 fine-tuning
df['input_text'] = 'summarize: ' + df['title'].fillna('') + ' ' + df['abstract'].fillna('') + ' ' + df['body'].fillna('')
df['target_text'] = df['abstract'].fillna('')  # Using abstract as the summary for fine-tuning

# Save the prepared dataset to a CSV file
df[['input_text', 'target_text']].to_csv('/content/drive/MyDrive/VivekaHackathon2024/t5_training_data.csv', index=False)

print(df[['input_text', 'target_text']].head())


                                          input_text  \
0  summarize: Road-traffic pollution and asthma –...   
1  summarize: Current practices in the spatial an...   
2  summarize: Current practices in spatial analys...   
3  summarize: Current practices in cancer spatial...   
4  summarize: Research protocol: EB-GIS4HEALTH UK...   

                                         target_text  
0  Asthma is a common disease and appears to be i...  
1  While many lessons have been learned from the ...  
2  The use of spatially referenced data in cancer...  
3  There has long been a recognition that place m...  
4  EB-GIS4HEALTH UK aims at building a UK-oriente...  
