In [1]:
import os
import json
import pandas as pd
from datetime import datetime

# Directory containing the JSON files
directory = './Datasets/'



In [2]:
data = pd.read_json('./Datasets/000e7fac-5ea9-404d-8434-1229c16c29b0.json')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   type          75 non-null     object
 1   entry         75 non-null     object
 2   resourceType  75 non-null     object
dtypes: object(3)
memory usage: 1.9+ KB


In [4]:
data['entry'][1]

{'fullUrl': 'urn:uuid:a3e3bd43-ef3c-48fd-9851-5302d1aa84c6',
 'resource': {'id': 'a3e3bd43-ef3c-48fd-9851-5302d1aa84c6',
  'status': 'finished',
  'class': {'code': 'ambulatory'},
  'type': [{'coding': [{'system': 'http://snomed.info/sct',
      'code': '185345009'}],
    'text': 'Encounter for symptom'}],
  'patient': {'reference': 'urn:uuid:65d12976-9588-4cfa-a795-216302a2ece9'},
  'period': {'start': '2010-07-31T13:21:25-04:00',
   'end': '2010-07-31T13:21:25-04:00'},
  'reason': {'coding': [{'system': 'http://snomed.info/sct',
     'code': '43878008',
     'display': 'Streptococcal sore throat (disorder)'}]},
  'resourceType': 'Encounter'}}

In [5]:
pd.json_normalize(data['entry'][0])

Unnamed: 0,fullUrl,resource.id,resource.text.status,resource.text.div,resource.extension,resource.identifier,resource.name,resource.telecom,resource.gender,resource.birthDate,resource.address,resource.multipleBirthBoolean,resource.photo,resource.resourceType
0,urn:uuid:65d12976-9588-4cfa-a795-216302a2ece9,65d12976-9588-4cfa-a795-216302a2ece9,generated,"<div>Generated by <a href=""https://github.com/...",[{'url': 'http://hl7.org/fhir/StructureDefinit...,[{'system': 'https://github.com/synthetichealt...,"[{'use': 'official', 'family': 'Stracke702', '...",[{'extension': [{'url': 'http://standardhealth...,female,1997-06-23,[{'extension': [{'extension': [{'url': 'latitu...,False,"[{'contentType': 'image/png', 'data': 'iVBORw0...",Patient


In [12]:


# List to hold all patient details
patients = []

# Iterate through each file in the directory
for filename in os.listdir(directory):
    if filename.endswith('.json'):
        filepath = os.path.join(directory, filename)
        
        # Open and load the JSON file
        with open(filepath, 'r') as file:
            data = json.load(file)
            
            # Extract patient details from the current JSON file
            for entry in data.get('entry', []):
                # print(entry['re'])
                resource = entry.get('resource', {})
                name_info = resource.get('name', [{}])[0]
                address_info = resource.get('address', [{}])[0]
                marital_status_info = resource.get('maritalStatus', {}).get('coding', [{}])[0]
                            # Extract prefix information
                prefix_info = resource.get('prefix', [{}])[0]
                prefix = prefix_info.get('prefix', '')

                # Determine appropriate prefix based on gender
                gender = resource.get('gender', '').lower()
                if gender == 'male':
                    prefix = "Mr."
                else:
                    prefix = "Mrs."
                def remove_numbers_from_string(s):
                    return ''.join([char for char in s if not char.isdigit()])

                given_names = ' '.join([remove_numbers_from_string(name) for name in name_info.get('given', [])])
                family_name = remove_numbers_from_string(name_info.get('family', ''))
                ethnicity = ""
                race = ""

                for e in resource.get('extension', []):
                    if e.get("valueCodeableConcept"):
                        if e["valueCodeableConcept"].get("text") == "race":
                            race = e["valueCodeableConcept"]["coding"][0].get("display", "")
                        elif e["valueCodeableConcept"].get("text") == "ethnicity":
                            ethnicity = e["valueCodeableConcept"]["coding"][0].get("display", "")
                phone_number = ""
                for telecom in resource.get('telecom', []):
                    if telecom.get('system') == 'phone' and telecom.get('use') == 'home':
                        phone_number = telecom.get('value', '')
                    break
                marital_code = resource.get("maritalStatus", {}).get("coding", [{}])[0].get("code", "")
                if marital_code == "S":
                    marital_status = "Single"
                elif marital_code == "M":
                    marital_status = "Married"
                else:
                    marital_status = "Unknown"
                
                # Concatenate prefix, given names, and family name
                full_name = prefix + ' ' + given_names + ' ' + family_name

                # Create patient dictionary
                patient = {
                    'Patient ID': resource.get('id', ''),
                    'name': full_name,
                    'gender': resource.get('gender', ''),
                    'birth_date': resource.get('birthDate', ''),
                    'marital_status': marital_status,
                    'city': address_info.get('city', ''),
                    'state': address_info.get('state', ''),
                    'race': race,
                    'ethnicity': ethnicity,
                    'Phone_Number':phone_number,
                }

                patients.append(patient)

# Create a DataFrame from the list of patient details
df = pd.DataFrame(patients, columns=['Patient ID','name', 'gender', 'birth_date','Phone_Number','marital_status','race','ethnicity', 'city', 'state'])

# Correct the address concatenation
df['address'] = df['city'] + ', ' + df['state']

# # Add patient ID
# df['Patient ID'] = 
# df["Race"]=race
# Calculate age
def calculate_age(birth_date):
    if birth_date:
        birth_date = datetime.strptime(birth_date, '%Y-%m-%d')
        today = datetime.today()
        age = today.year - birth_date.year 
        return str(age)+' yrs'
    else:
        return None


df['age'] = df['birth_date'].apply(calculate_age)

# Rearrange columns to match the desired order
df = df[['Patient ID', 'name', 'gender', 'age','Phone_Number','race','ethnicity','address', 'marital_status']]

# Convert column names to uppercase and show only the first letter of each word
df.columns = [col.upper()[0] + col[1:].lower() for col in df.columns]


# Save the DataFrame to a CSV file
csv_file = 'patients_data.csv'
df.to_csv(csv_file, index=False)

print(f'CSV file created: {csv_file}')


CSV file created: patients_data.csv


In [13]:
df=pd.read_csv("patients_data.csv")
df
len(df)

2446

In [14]:
df=pd.read_csv("patients_data.csv")
# print(df)
cleaned_data = df.dropna(subset=[col for col in df.columns if col != 'phone_number'])
# Reset the index of the DataFrame
cleaned_data.reset_index(drop=True, inplace=True)

cleaned_data


Unnamed: 0,Patient id,Name,Gender,Age,Phone_number,Race,Ethnicity,Address,Marital_status
0,80bb41d3-ebb4-42e6-b087-6ee89b0463a9,Mrs. Zachary Willms,female,65 yrs,493.648.2471 x26272,White,Nonhispanic,"Shrewsbury, MA",Married
1,bb357dc7-2f8a-444e-ba57-8291b6f18bf0,Mrs. Sunny Pouros,female,71 yrs,(608) 524-4834,White,Nonhispanic,"Andover, MA",Single
2,c71d235f-9b58-4106-b744-c554da5e13fa,Mr. Sylvia Reilly,male,86 yrs,742.429.3479,White,Nonhispanic,"Malden, MA",Married
3,509474a0-baf3-48f2-a595-67b6e2b34c87,Mr. Tess DuBuque,male,33 yrs,1-569-036-1912 x83432,Other,Puerto_rican,"Boston, MA",Unknown
4,f10fd607-58ce-4be6-b1bd-a98898c8e0e5,Mrs. Victor Kuphal,female,89 yrs,(509) 809-9798 x03337,White,Nonhispanic,"Peabody, MA",Married
5,65dfe191-c7ef-4148-8f12-49f832525d4c,Mrs. Destinee Welch,female,11 yrs,(984) 384-1363 x31838,White,Nonhispanic,"Holliston, MA",Unknown
6,3a268300-21fb-4ef0-b794-02aa0e5a7e5f,Mrs. Robbie Kling,female,95 yrs,1-309-534-9052,White,Nonhispanic,"Leicester, MA",Married
7,1694897b-912e-401f-a135-aa59857a401e,Mr. Julien Barrows,male,17 yrs,653.189.9071 x849,White,Nonhispanic,"Hudson, MA",Unknown
8,117335bb-33ff-4897-ac27-af4b3f11135f,Mr. Bart Schaden,male,75 yrs,375-025-8548 x42903,White,Nonhispanic,"Worcester, MA",Married
9,a0dcd747-1d40-40fd-8e42-1f9395f034ca,Mrs. Evan Klocko,female,80 yrs,(386) 001-8200 x5002,White,Nonhispanic,"Lynn, MA",Single


In [9]:
len(cleaned_data)

39

In [16]:
# Define the name of the Excel file to save the cleaned data
cleaned_excel_file = 'patient_Details_formated.xlsx'

# Save the cleaned data to an Excel file
cleaned_data.to_excel(cleaned_excel_file, index=False)

# Print a confirmation message
print(f'Cleaned Excel file created: {cleaned_excel_file}')


Cleaned Excel file created: patient_Details_formated.xlsx
