In [1]:
import numpy as np
import pandas as pd

from pathlib import Path

from src import preprocess

In [2]:
# Enable IPython's autoreload extension so edits to imported project modules
# (e.g. src.preprocess) are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2

In [3]:
# --- Raw inputs -------------------------------------------------------------
DATA_DIR = "./data/"
CLINICAL_NOTES_FILE = f"{DATA_DIR}ClinNotes.csv"
MEDICAL_CONCEPTS_FILE = f"{DATA_DIR}MedicalConcepts.csv"

# --- Derived artifacts ------------------------------------------------------
# NOTE: the "PROCESSDED" spelling is kept as-is because later cells reference
# the constant under this exact name.
PROCESSDED_DATA_DIR = "./processed_data/"
PROCESSED_CLINICAL_NOTES_FILE = f"{PROCESSDED_DATA_DIR}ClinNotes.csv"
NORMALIZED_CLINICAL_NOTES_FILE = f"{PROCESSDED_DATA_DIR}ClinNotes_normalized.csv"
EXTENDED_CLINICAL_NOTES_FILE = f"{PROCESSDED_DATA_DIR}ClinNotes_normalized_extended.csv"
# TF-IDF outputs: one (vector, vocab) pair per input variant.
TFIDF_VECTOR_FILE = f"{PROCESSDED_DATA_DIR}Tfidf_vector.npy"
TFIDF_VOCAB_FILE = f"{PROCESSDED_DATA_DIR}Tfidf_vocab.npy"
TFIDF_EXTENDED_VECTOR_FILE = f"{PROCESSDED_DATA_DIR}Tfidf_extended_vector.npy"
TFIDF_EXTENDED_VOCAB_FILE = f"{PROCESSDED_DATA_DIR}Tfidf_extended_vocab.npy"

# TF-IDF Vectorization

In this notebook, we use TF-IDF to vectorize the clinical notes. TF-IDF is a simple but solid baseline for document vectorization. Because it is insensitive to word order, we can also integrate related medical terms into the document vector. Below, we vectorize both the normalized clinical notes and the notes extended with related medical terms, and save the resulting vectors and vocabularies to files.

In [4]:
# Read both note variants written by the earlier preprocessing steps:
# the normalized notes and the notes extended with related medical terms.
df_clinical_normalized = pd.read_csv(filepath_or_buffer=NORMALIZED_CLINICAL_NOTES_FILE)
df_clinical_extended = pd.read_csv(filepath_or_buffer=EXTENDED_CLINICAL_NOTES_FILE)

In [5]:
# TF-IDF over the normalized note text; the helper returns the document
# vectors together with the vocabulary.
normalized_notes = df_clinical_normalized['notes']
vectors, vocab = preprocess.tfidf_vectorize(normalized_notes)

In [6]:
# Persist the TF-IDF matrix and vocabulary computed from the normalized notes.
Path(PROCESSDED_DATA_DIR).mkdir(parents=True, exist_ok=True)  # no-op if the directory already exists
# `.toarray()` is the documented dense conversion; the `.A` shorthand is
# deprecated/removed on SciPy sparse arrays.
# NOTE(review): assumes `vectors` is a SciPy sparse container, as the original
# `.A` usage suggests — confirm against preprocess.tfidf_vectorize.
np.save(TFIDF_VECTOR_FILE, vectors.toarray())
np.save(TFIDF_VOCAB_FILE, vocab)

In [7]:
# TF-IDF over the notes that were extended with related medical terms.
extended_notes = df_clinical_extended['notes']
vectors_extended, vocab_extended = preprocess.tfidf_vectorize(extended_notes)

In [8]:
# Persist the TF-IDF matrix and vocabulary computed from the extended notes.
# The mkdir is repeated so this cell can run on its own.
Path(PROCESSDED_DATA_DIR).mkdir(parents=True, exist_ok=True)  # no-op if the directory already exists
# `.toarray()` is the documented dense conversion; the `.A` shorthand is
# deprecated/removed on SciPy sparse arrays.
# NOTE(review): assumes `vectors_extended` is a SciPy sparse container, as the
# original `.A` usage suggests — confirm against preprocess.tfidf_vectorize.
np.save(TFIDF_EXTENDED_VECTOR_FILE, vectors_extended.toarray())
np.save(TFIDF_EXTENDED_VOCAB_FILE, vocab_extended)