# Validation of NBMEDataProcessor Output

This notebook runs each `NBMEDataProcessor` stage in order and inspects its output. It requires `preprocess.py`, which must define the `NBMEDataProcessor` class, to be importable (for example, located in the notebook's working directory).

In [1]:
# 1. Set data directory and output directory
# Both locations can be overridden with the NBME_DATA_DIR / NBME_OUTPUT_DIR
# environment variables so the notebook is not tied to one machine's folder
# layout; when the variables are unset the original paths are used unchanged.
import os

data_dir = os.environ.get(
    "NBME_DATA_DIR",
    r"C:\Users\SIMON\Desktop\NLP\nbme-score-clinical-patient-notes",
)
output_dir = os.environ.get(
    "NBME_OUTPUT_DIR",
    r"C:\Users\SIMON\Desktop\NLP\processed",
)

In [2]:
# 2. Import processor and initialize
# NBMEDataProcessor comes from the preprocess.py module mentioned in the intro.
# The instance is built from the input/output locations chosen in step 1 and is
# the object every later step calls into.
from preprocess import NBMEDataProcessor
processor = NBMEDataProcessor(data_dir=data_dir, output_dir=output_dir)

In [3]:
# 3. Load data and check row counts
# The row counts are reported by load_data() itself in this cell's log output;
# the same log shows that feature-type identification already runs during loading.
processor.load_data()

Loading data files...
Loaded train data with 14300 rows
Loaded patient notes with 42146 rows
Loaded features with 143 rows
Loaded test data with 5 rows
Identifying feature types...
Identified 7 female-related features
Identified 11 male-related features
Identified 1 age-related features


In [4]:
# 4. Verify feature type identification
# Run the identification step explicitly, then report the size of each feature
# group the processor exposes.
processor.identify_feature_types()
feature_groups = (
    ('Number of female-related features:', processor.feature_female),
    ('Number of male-related features:', processor.feature_male),
    ('Number of age-related features:', processor.feature_year),
)
for group_label, group_members in feature_groups:
    print(group_label, len(group_members))

Identifying feature types...
Identified 7 female-related features
Identified 11 male-related features
Identified 1 age-related features
Number of female-related features: 7
Number of male-related features: 11
Number of age-related features: 1


In [5]:
# 5. Feature preprocessing example
# Keep the returned frame in `features` and preview its first rows as the
# cell's displayed result.
features = processor.preprocess_features()
features.head()

Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded


In [6]:
# 6. Parse annotations and display first rows
# In the preview, `annotation` and `location` are list-valued and
# `annotation_length` matches the number of annotation entries in each row shown.
train_parsed = processor.parse_annotations()
train_parsed.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,annotation_length
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],1
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],1
2,00016_002,0,16,2,[chest pressure],[203 217],1
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",2
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],1


In [7]:
# 7. Merge data and display first rows
# The preview shows the annotation rows extended with `feature_text` and
# `pn_history` columns. `train_merged` is reused in steps 17 and 18 as the
# "original" side of the offset comparison.
train_merged = processor.merge_data()
train_merged.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,annotation_length,feature_text,pn_history
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],1,Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],1,Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,[chest pressure],[203 217],1,Chest-pressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",2,Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],1,Lightheaded,HPI: 17yo M presents with palpitations. Patien...


In [8]:
# 8. Check annotation integrity
# NOTE(review): the result is stored in `checked_data` but is not inspected in
# any of the cells shown here, and the only visible output is the progress
# message — consider displaying a summary so this step actually validates something.
checked_data = processor.check_annotation_integrity()

Checking annotation integrity...


In [9]:
# 9. Standardize medical text example
# NOTE(review): in this cell's output the hyphens of `feature_text` appear
# replaced by the literal word "negative" (step 5 shows "Chest-pressure", here it
# is "Chestnegativepressure"). This looks unintended — confirm the hyphen
# handling in standardize_medical_text().
train_standardized = processor.standardize_medical_text()
train_standardized[['pn_history', 'feature_text']].head()

Standardizing medical text...
Medical text standardization completed


Unnamed: 0,pn_history,feature_text
0,HPI: 17yo M presents with palpitations. Patien...,Familynegativehistorynegativeofnegativemyocard...
1,HPI: 17yo M presents with palpitations. Patien...,Familynegativehistorynegativeofnegativethyroid...
2,HPI: 17yo M presents with palpitations. Patien...,Chestnegativepressure
3,HPI: 17yo M presents with palpitations. Patien...,Intermittentnegativesymptoms
4,HPI: 17yo M presents with palpitations. Patien...,Lightheaded


In [10]:
# 10. Correct offsets and display first rows
# NOTE(review): in this cell's output every span shown moves forward by 7 or 14
# characters, except row 3's second span, which goes from "176 183" (step 7
# output) to [90 97] and now lies inside its first span [77 98]. Verify how
# correct_offsets() handles rows with more than one span.
train_corrected = processor.correct_offsets()
train_corrected.head()

Correcting annotation offsets...
Offset correction completed


Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,annotation_length,feature_text,pn_history
0,00016_000,0,16,0,[dad with recent heart attcak],[[710 738]],1,Familynegativehistorynegativeofnegativemyocard...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[[682 707]],1,Familynegativehistorynegativeofnegativethyroid...,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,[chest pressure],[[210 224]],1,Chestnegativepressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,"[intermittent episodes, episode]","[[77 98], [90 97]]",2,Intermittentnegativesymptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,[felt as if he were going to pass out],[[229 265]],1,Lightheaded,HPI: 17yo M presents with palpitations. Patien...


In [11]:
# 11. Process spaces and display first rows
# NOTE(review): in this cell's output row 3 keeps only the span [77 98] although
# its `annotation_length` is still 2 and both annotation texts remain — confirm
# whether process_spaces() is meant to drop spans.
train_spaced = processor.process_spaces()
train_spaced.head()

Processing spaces in text data...
Space processing completed


Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,annotation_length,feature_text,pn_history
0,00016_000,0,16,0,[dad with recent heart attcak],[[710 738]],1,Familynegativehistorynegativeofnegativemyocard...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[[682 707]],1,Familynegativehistorynegativeofnegativethyroid...,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,[chest pressure],[[210 224]],1,Chestnegativepressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,"[intermittent episodes, episode]",[[77 98]],2,Intermittentnegativesymptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,[felt as if he were going to pass out],[[229 265]],1,Lightheaded,HPI: 17yo M presents with palpitations. Patien...


In [12]:
# 12. Create cross-validation folds and show distribution
# The log output reports GroupKFold with 5 folds; value_counts() on the `fold`
# column then shows how many rows were assigned to each fold.
train_folds = processor.create_folds()
print(train_folds['fold'].value_counts())

Creating 5 folds using GroupKFold...
Created 5 folds
4    2860
2    2860
1    2860
0    2860
3    2860
Name: fold, dtype: int64


In [13]:
# 13. Split into train/test sets if needed
# create_train_test_split() may return nothing (its log then reports that the
# split was skipped); in that case fall back to what the processor itself holds.
result = processor.create_train_test_split()
if result is None:
    train_final = getattr(processor, 'train_final', None)
    test_final = processor.test
else:
    train_final, test_final = result

# A missing frame is reported as None instead of raising on `.shape`.
train_shape = train_final.shape if train_final is not None else None
test_shape = test_final.shape if test_final is not None else None
print('Train shape:', train_shape)
print('Test shape :', test_shape)

Test data already provided, skipping split.
Train shape: None
Test shape : (5, 4)


In [14]:
# 14. Save processed data and list output files
# The paths of the written files are reported by save_processed_data() in the log.
processor.save_processed_data()
# NOTE(review): `os` is imported here, mid-notebook; step 15 also uses this import.
import os
# List the output directory to confirm which files are present after saving.
print(os.listdir(output_dir))

Saving processed data...
Saved processed train data to C:\Users\SIMON\Desktop\NLP\processed\train_processed.csv
Saved processed test data to C:\Users\SIMON\Desktop\NLP\processed\test_processed.csv
Saved labels to C:\Users\SIMON\Desktop\NLP\processed\train_labels.npy
Saved fold information to C:\Users\SIMON\Desktop\NLP\processed\folds.npy
['folds.npy', 'test_processed.csv', 'train_labels.npy', 'train_processed.csv']


In [15]:
# 15. Load and inspect final processed data
# `os` is imported here as well so the cell does not depend on step 14 having
# been run only for its import.
import os

import pandas as pd

# Read the saved CSV files back from disk to check what was actually persisted.
# NOTE(review): list-valued columns such as `location` presumably come back as
# plain strings after the CSV round trip — confirm before reusing them as lists.
proc_train = pd.read_csv(os.path.join(output_dir, 'train_processed.csv'))
proc_test = pd.read_csv(os.path.join(output_dir, 'test_processed.csv'))

# display() (provided by the IPython kernel) renders each preview as its own
# table; ending the cell with a tuple of frames instead produces one wrapped,
# truncated plain-text repr.
display(proc_train.head(), proc_test.head())

(          id  case_num  pn_num  feature_num  \
 0  00016_000         0      16            0   
 1  00016_001         0      16            1   
 2  00016_002         0      16            2   
 3  00016_003         0      16            3   
 4  00016_004         0      16            4   
 
                                  annotation       location  annotation_length  \
 0          ['dad with recent heart attcak']  [['710 738']]                  1   
 1             ['mom with "thyroid disease']  [['682 707']]                  1   
 2                        ['chest pressure']  [['210 224']]                  1   
 3      ['intermittent episodes', 'episode']    [['77 98']]                  2   
 4  ['felt as if he were going to pass out']  [['229 265']]                  1   
 
                                         feature_text  \
 0  Familynegativehistorynegativeofnegativemyocard...   
 1  Familynegativehistorynegativeofnegativethyroid...   
 2                              Chestnegative

In [16]:
# 16. Brief statistical summary
# Each heading and its table are emitted by one print call; sep='\n' puts the
# table on the line after the heading.
print('Annotation length summary:', proc_train['annotation_length'].describe(), sep='\n')
print('\nFold distribution:', proc_train['fold'].value_counts(), sep='\n')

Annotation length summary:
count    14300.000000
mean         0.855524
std          0.756237
min          0.000000
25%          0.000000
50%          1.000000
75%          1.000000
max          8.000000
Name: annotation_length, dtype: float64

Fold distribution:
4    2860
2    2860
1    2860
0    2860
3    2860
Name: fold, dtype: int64


In [17]:
# 17. Offset Correction Comparison Example
# Row label of the record to inspect (adjust as needed)
sample_idx = 0

# Fetch the standardized note text and both versions of the location field
# for that record before printing anything.
sample_text = train_standardized.loc[sample_idx, 'pn_history']
original_locations = train_merged.loc[sample_idx, 'location']
corrected_locations = train_corrected.loc[sample_idx, 'location']

# Report the record, its text, and the locations before/after correction
print(f"Sample index: {sample_idx}")
print("Text snippet:")
print(sample_text)
print("\nOriginal locations:", original_locations)
print("Corrected locations:", corrected_locations)

Sample index: 0
Text snippet:
HPI: 17yo M presents with palpitations. Patient reports 3negative4 months of intermittent episodes of "heart beating/pounding out of my chest." 2 days ago during a soccer game had an episode, but this time had chest pressure and felt as if he were going to pass out (did not lose conciousness). Of note patient endorses abusing adderall, primarily to study (1negative3 times per week). Before recent soccer game, took adderrall night before and morning of game. Denies shortness of breath, diaphoresis, fevers, chills, headache, fatigue, changes in sleep, changes in vision/hearing, abdominal paun, changes in bowel or urinary habits. 
PMHx: none
Rx: uses friends adderrall
FHx: mom with "thyroid disease," dad with recent heart attcak
All: none
Immunizations: up to date
SHx: Freshmen in college. Endorses 3negative4 drinks 3 nights / week (on weekends), denies tabacco, endorses trying marijuana. Sexually active with girlfriend x 1 year, uses condoms

Original locati

In [18]:
# 18. Display original and corrected span texts for a sample record
# Choose a sample index to inspect (modify as needed)
# This re-assigns the `sample_idx` already set in step 17.
sample_idx = 0

# Retrieve text fields
# The text is read from both frames so that each set of offsets can be sliced
# against the text of the frame it came from.
orig_text = train_merged.loc[sample_idx, 'pn_history']
corr_text = train_corrected.loc[sample_idx, 'pn_history']

# Retrieve location lists
# In the previews above the original field is a flat list of "start end"
# entries, while the corrected field wraps each entry in an inner list.
orig_locs = train_merged.loc[sample_idx, 'location']
corr_locs = train_corrected.loc[sample_idx, 'location']

# Function to extract spans and corresponding text
def extract_spans(text, locs):
    """Resolve location strings into ``(start, end, substring)`` triples.

    Args:
        text: The note text the offsets refer to.
        locs: Location data in any of these shapes: a single location string,
            a flat sequence of location strings, or an arbitrarily nested
            sequence of them (e.g. ``[['710 738']]``). Each location string
            holds one or more ``"start end"`` pairs separated by ``;``.
            ``None`` is treated as "no locations".

    Returns:
        A list of ``(start, end, text[start:end])`` tuples in the order the
        spans are encountered. Empty when there is nothing to extract.

    Raises:
        ValueError: If a non-blank segment is not exactly two integers.
        TypeError: If ``locs`` contains a value that is neither a string nor
            an iterable of location entries.
    """
    if locs is None:
        return []

    def _iter_location_strings(node):
        # A string is a leaf; without this check it would be iterated
        # character by character.
        if isinstance(node, str):
            yield node
        else:
            for child in node:
                yield from _iter_location_strings(child)

    spans = []
    for loc in _iter_location_strings(locs):
        for segment in loc.split(';'):
            # Skip blank pieces such as '' or the remainder after a trailing ';'
            # instead of failing while unpacking an empty split.
            if not segment.strip():
                continue
            start, end = map(int, segment.split())
            spans.append((start, end, text[start:end]))
    return spans

# Extract and print the spans for both versions of the record: first the
# original text with its original offsets, then the corrected text with the
# corrected offsets.
span_reports = (
    ("Original annotation spans and their text:", orig_text, orig_locs),
    ("\nCorrected annotation spans and their text:", corr_text, corr_locs),
)
for report_heading, note_text, note_locs in span_reports:
    print(report_heading)
    for start, end, span_text in extract_spans(note_text, note_locs):
        print(f"  {start}-{end}: '{span_text}'")

Original annotation spans and their text:
  696-724: 'dad with recent heart attcak'

Corrected annotation spans and their text:
  710-738: 'dad with recent heart attcak'
