In [41]:
import numpy as np
import pandas as pd
import re

In [29]:
# EXTRACT RELEVANT INFORMATION
def preprocess_text(df):
    """Return a copy of ``df`` whose ``text`` column is flattened to one line.

    Missing values in ``text`` are replaced by a single space, and every
    newline and carriage-return character is replaced by a space.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``text`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned ``text`` column. The frame passed in is
        left unmodified, so re-running the calling cell is safe.
    """
    cleaned_text = (
        df['text']
        .fillna(' ')
        # regex=False: both patterns are literal characters, and the default
        # value of `regex` differs between pandas versions.
        .str.replace('\n', ' ', regex=False)
        .str.replace('\r', ' ', regex=False)
    )
    # assign() builds a new frame instead of writing into the caller's one.
    return df.assign(text=cleaned_text)


def extract_section(entry, left='History of Present Illness:', right='Past Medical History:'):
    """Return the text located between the ``left`` and ``right`` markers.

    Parameters
    ----------
    entry : object
        The note; it is converted with ``str()`` before searching.
    left : str
        Marker that opens the section (first occurrence is used).
    right : str
        Marker that closes the section (first occurrence *after* ``left``).

    Returns
    -------
    str
        The text strictly between the two markers, with surrounding
        whitespace preserved, or ``""`` when ``left`` is absent or no
        ``right`` marker follows it.
    """
    text = str(entry)
    try:
        start = text.index(left) + len(left)
        # Search for the closing marker only after the opening one. Searching
        # from the beginning would pick up an earlier occurrence of `right`
        # and produce an empty slice even though the section exists.
        end = text.index(right, start)
    except ValueError:
        # One of the markers is missing: no section to extract.
        return ""
    return text[start:end]


def extract_sex(entry, left='Sex:   '):
    """Return the single character that directly follows the ``left`` marker.

    ``entry`` is converted with ``str()`` first. The result is ``""`` when the
    marker does not occur, or when nothing follows its first occurrence.
    """
    text = str(entry)
    marker_pos = text.find(left)
    if marker_pos < 0:
        # Marker not present in this note.
        return ""
    value_pos = marker_pos + len(left)
    # Slicing (rather than indexing) yields "" if the marker ends the text.
    return text[value_pos:value_pos + 1]


def replace_all(text, dic):
    """Lower-case ``text`` and apply every regex substitution in ``dic``.

    Parameters
    ----------
    text : object
        Input value; it is converted with ``str()`` and lower-cased once
        before any substitution runs.
    dic : dict
        Maps a regular-expression pattern to its replacement string.
        Substitutions are applied sequentially in the mapping's iteration
        order, so a later pattern can match text inserted by an earlier one.

    Returns
    -------
    str
        The lower-cased, substituted text. A string is returned even when
        ``dic`` is empty.
    """
    # Normalise once, outside the loop: doing it per pattern repeats the work
    # for every entry and lower-cases earlier replacements but not the last.
    text = str(text).lower()
    for pattern, replacement in dic.items():
        text = re.sub(pattern, replacement, text)
    return text

def replace_gender(text):
    """Replace gender-specific pronouns and terms in ``text`` with neutral ones.

    Thin wrapper that passes ``text`` and the module-level
    ``gender_neutral_dict`` to ``replace_all``; ``replace_all`` lower-cases
    the text while substituting. ``gender_neutral_dict`` is looked up when
    this function is called, so it must be defined before the first call.
    """
    return replace_all(text, gender_neutral_dict)

In [30]:
# Load the gzip-compressed discharge-note table (the path points at the
# MIMIC-IV-Note 2.2 `discharge` file). Later cells use its `text` column.
discharge = pd.read_csv(
    './raw_data/mimic-iv-note-deidentified-free-text-clinical-notes-2.2/note/discharge.csv.gz',
    compression="gzip",
)

In [31]:
# Fill missing note text and flatten line breaks (see preprocess_text), then
# rebind `discharge` to the returned frame.
discharge = preprocess_text(discharge)

In [4]:
# Load the companion `discharge_detail` table from the same dataset folder.
# NOTE(review): `discharge_detail` is not referenced by any other cell visible
# here — confirm it is still needed before paying for the load.
discharge_detail = pd.read_csv(
    './raw_data/mimic-iv-note-deidentified-free-text-clinical-notes-2.2/note/discharge_detail.csv.gz',
    compression="gzip",
)

In [32]:
# Series of preprocessed note texts; every extraction below runs on this.
notes = discharge['text']

In [34]:
# Each result is a Series with one value per note (despite the `_df` suffix);
# notes lacking the expected markers yield an empty string.

# Single character following the 'Sex:   ' marker.
sex_df = notes.apply(extract_sex)

# Default markers: text between 'History of Present Illness:' and
# 'Past Medical History:'.
section_df = notes.apply(extract_section)

# Text between 'Chief Complaint:' and 'History of Present Illness:'.
chef_complaint_df = notes.apply(
    extract_section,
    left='Chief Complaint:',
    right='History of Present Illness:',
)

In [35]:
# Regex pattern -> neutral replacement, used by replace_gender().
# Every key is a lower-case whole-word pattern of the form r'\b<word>\b'.
# The pair order below is the dict's insertion order, which is the order in
# which the substitutions are applied.
gender_neutral_dict = {
    rf'\b{word}\b': neutral
    for word, neutral in (
        ('he', 'they'),
        ('she', 'they'),
        ('him', 'them'),
        # TODO(review): "her" can be an object ("saw her" -> "them") or a
        # possessive ("her pain" -> "their"); it is always mapped to "their".
        ('her', 'their'),
        ('his', 'their'),
        ('hers', 'theirs'),
        ('himself', 'themself'),
        ('herself', 'themself'),
        # Titles are removed (replaced by the empty string).
        ('mr', ''),
        ('mrs', ''),
        ('ms', ''),
        ('man', 'person'),
        ('woman', 'person'),
        ('male', 'person'),
        ('female', 'person'),
    )
}

In [36]:
# Pair every extracted text with the note's sex value and dump it to CSV.
# The frame has positional column labels 0 (text) and 1 (sex). to_csv() also
# writes the row index, and the later cells read the labels back as the
# strings '0' and '1'.
for extracted, csv_name in (
    (section_df, 'sections.csv'),
    (chef_complaint_df, 'chief_complaint.csv'),
):
    df = pd.DataFrame(list(zip(extracted, sex_df)))
    df.to_csv(csv_name)

In [42]:
# Column '0' is the extracted section text, column '1' the sex character.
# keep_default_na=False keeps empty fields as '' instead of NaN. Without it,
# notes with no detected section come back as NaN and replace_gender() (which
# applies str() and lower-cases) turns them into the literal text 'nan'.
# usecols drops the saved index column at read time, so the column assignment
# below operates on the frame itself rather than on a subselection of it.
sections = pd.read_csv("sections.csv", usecols=['0', '1'], keep_default_na=False)
sections['0'] = sections['0'].apply(replace_gender)
sections.to_csv('sections_processed.csv', index=False)

In [43]:
# Keep only the rows whose sex column ('1') is exactly 'M' or 'F'. Rows where
# no sex was detected, or where another character followed the marker, are
# dropped.
file = pd.read_csv('sections_processed.csv')
filtered_file = file.loc[lambda frame: frame['1'].isin(['M', 'F'])]
filtered_file.to_csv('sections_processed_filtered.csv', index=False)

In [44]:
# Same processing as for the sections file, applied to the chief-complaint
# extract; the variable name `sections` is reused for that data here.
# Column '0' is the extracted text, column '1' the sex character.
# keep_default_na=False keeps empty fields as '' instead of NaN. Without it,
# notes with no detected chief complaint come back as NaN and replace_gender()
# (which applies str() and lower-cases) turns them into the literal text 'nan'.
# usecols drops the saved index column at read time, so the column assignment
# below operates on the frame itself rather than on a subselection of it.
sections = pd.read_csv("chief_complaint.csv", usecols=['0', '1'], keep_default_na=False)
sections['0'] = sections['0'].apply(replace_gender)
sections.to_csv('chief_complaint_processed.csv', index=False)

In [45]:
# Keep only the chief-complaint rows whose sex column ('1') is exactly 'M' or
# 'F'. Rows where no sex was detected, or where another character followed
# the marker, are dropped.
file = pd.read_csv('chief_complaint_processed.csv')
filtered_file = file.loc[lambda frame: frame['1'].isin(['M', 'F'])]
filtered_file.to_csv('chief_complaint_processed_filtered.csv', index=False)