In [1]:

import pandas as pd
import json

# Loading the data.
# Both CSV files live in the same Kaggle input directory; the directory is kept
# in a single constant so it only has to be edited in one place when the
# notebook is run against a different data location.
DATA_DIR = "/kaggle/input/medical-note-extraction-h-2-o-gen-ai-world-ny"

train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

**Checking out what an example note and its JSON look like**

In [2]:
# Show the raw text of the first training note.
print(train.loc[0, "Note"])

**Clinical Notes**

**Patient Information:**
- Age: 41 years old
- Gender: Male

**Visit Motivation:** Anemia evaluation

**Chief Complaints and Symptoms:**
The patient presents with complaints of anemia, fever, fatigue, difficulty breathing (dyspnea), vomiting, dizziness, blurred vision, wheezing, and pale skin. The patient reports that these symptoms have been progressively worsening over the past few weeks.

**History of Present Illness:**
- **Fever**: Intermittent low-grade fevers for approximately 2 weeks.
- **Fatigue**: Persistent and severe fatigue affecting daily activities.
- **Difficulty Breathing (Dyspnea)**: Experiencing shortness of breath, particularly with exertion. No history of chronic respiratory conditions.
- **Vomiting**: Occasional episodes of non-bilious vomiting for the past week.
- **Dizziness**: Episodes of dizziness, especially upon standing, which can be severe enough to cause near-fainting spells.
- **Blurred Vision**: New onset of blurred vision, with occas

In [3]:
# Decode the first note's JSON label string into a Python dict for display.
json.loads(train.loc[0, "json"])

{'patient_info': {'age': 41, 'gender': 'Male'},
 'visit_motivation': 'Anemia',
 'symptoms': ['fever',
  'fatigue',
  'difficulty_breathing',
  'vomiting',
  'dizziness',
  'blurred_vision',
  'wheezing',
  'pale_skin'],
 'vital_signs': {'heart_rate': {'value': 114, 'unit': 'bpm'},
  'oxygen_saturation': {'value': 98.4, 'unit': '%'},
  'cholesterol_level': {'value': 132.8, 'unit': 'mg/dL'},
  'glucose_level': {'value': 110.6, 'unit': 'mg/dL'}}}

In [4]:
# Decode every row's JSON label string into a Python dict.
extracted_json = train['json'].map(json.loads)

# Flatten the nested dicts into a table with one column per leaf field.
json_fields_df = pd.json_normalize(extracted_json)

# Put the flattened fields side by side with the original columns.
expanded_train = pd.concat([train, json_fields_df], axis="columns")

In [5]:
# Resulting normalized columns for analysis: the original columns followed by
# the flattened JSON fields, whose nested keys appear as dot-separated names.
print(expanded_train.columns)

Index(['ID', 'Note', 'json', 'visit_motivation', 'symptoms',
       'patient_info.age', 'patient_info.gender',
       'vital_signs.heart_rate.value', 'vital_signs.heart_rate.unit',
       'vital_signs.oxygen_saturation.value',
       'vital_signs.oxygen_saturation.unit',
       'vital_signs.cholesterol_level.value',
       'vital_signs.cholesterol_level.unit', 'vital_signs.glucose_level.value',
       'vital_signs.glucose_level.unit', 'vital_signs.temperature.value',
       'vital_signs.temperature.unit', 'vital_signs.respiratory_rate.value',
       'vital_signs.respiratory_rate.unit',
       'vital_signs.blood_pressure.systolic.value',
       'vital_signs.blood_pressure.systolic.unit',
       'vital_signs.blood_pressure.diastolic.value',
       'vital_signs.blood_pressure.diastolic.unit'],
      dtype='object')


<h2>Let's build a baseline that averages the numeric values and uses the most common categorical values</h2>
We will pack these values into a single constant JSON response and then measure how it scores on the training set.

In [6]:
from collections import Counter

# Constant-prediction baseline: the most common value for each categorical
# field and the arithmetic mean for each numeric field, taken over
# expanded_train.
categorical_columns = ['visit_motivation', 'patient_info.gender', 'symptoms']
numeric_columns = [
    'patient_info.age',
    'vital_signs.heart_rate.value',
    'vital_signs.oxygen_saturation.value',
    'vital_signs.cholesterol_level.value',
    'vital_signs.glucose_level.value',
]

# 'symptoms' holds a list per row, so explode it to one symptom per row before
# counting; the 10 most frequent symptoms become the constant symptom list.
symptoms_list = expanded_train['symptoms'].explode()
top_symptoms = symptoms_list.value_counts().nlargest(10).index.tolist()

# Single-valued categorical fields take their mode (the first one if tied).
common_values = {
    name: top_symptoms if name == 'symptoms' else expanded_train[name].mode()[0]
    for name in categorical_columns
}

# Series.mean() skips missing values by default, so each average is taken over
# the rows where the field is present.
average_values = {}
for column in numeric_columns:
    average_values[column] = expanded_train[column].mean()

# Unit string reported alongside each averaged vital sign.
vital_sign_units = {
    "heart_rate": "bpm",
    "oxygen_saturation": "%",
    "cholesterol_level": "mg/dL",
    "glucose_level": "mg/dL",
}

# Assemble the constant response in the same nested layout as the labels.
dummy_submission = {
    "patient_info": {
        "age": average_values['patient_info.age'],
        "gender": common_values['patient_info.gender'],
    },
    "visit_motivation": common_values['visit_motivation'],
    "symptoms": common_values['symptoms'],
    "vital_signs": {
        vital: {
            "value": average_values[f'vital_signs.{vital}.value'],
            "unit": unit,
        }
        for vital, unit in vital_sign_units.items()
    },
}

# Serialize once; the same string is reused as the prediction for every row.
dummy_submission_json = json.dumps(dummy_submission)
print(dummy_submission_json)


{"patient_info": {"age": 48.24995379781926, "gender": "Female"}, "visit_motivation": "Urinary Tract Infection (UTI)", "symptoms": ["difficulty_breathing", "fatigue", "cough", "fever", "chest_pain", "headache", "sore_throat", "runny_nose", "rash", "frequent_urination"], "vital_signs": {"heart_rate": {"value": 86.0034620045006, "unit": "bpm"}, "oxygen_saturation": {"value": 96.3006446991404, "unit": "%"}, "cholesterol_level": {"value": 161.64985920780927, "unit": "mg/dL"}, "glucose_level": {"value": 114.70387453874538, "unit": "mg/dL"}}}


In [7]:
from metric import score

In [8]:
# Baseline predictions: a copy of train in which every "json" cell is replaced
# by the single constant JSON string.
predictions = train.assign(json=dummy_submission_json)

## Measuring the performance
We can measure how well the baseline does using the provided scoring function. This is a model-free baseline that simply returns the same constant response for every note, built from aggregations over the training labels.

In [9]:
# Compare the true labels with the constant predictions using the provided
# metric. "ID" is passed as the third argument -- presumably the row-identifier
# column name; confirm against metric.score.
scored_columns = ["ID", "json"]
score(train[scored_columns], predictions[scored_columns], "ID")

0.4629849427194722

In [10]:
# Load the sample submission file; its "json" column is overwritten below and
# the frame is then saved as our submission.
sample_submission = pd.read_csv("/kaggle/input/medical-note-extraction-h-2-o-gen-ai-world-ny/sample_submission.csv")

In [11]:
# Use the same constant JSON string as the prediction for every row.
sample_submission = sample_submission.assign(json=dummy_submission_json)

In [12]:
# Write the submission file, leaving out the DataFrame index.
sample_submission.to_csv("submission.csv", index=False)