Trains the Paragraph Vector DM and DBOW models

In [None]:
import os
from os import path
import sys
import pickle
from collections import namedtuple, defaultdict, Counter
from datetime import datetime, timedelta
from time import time
import pandas as pd
import numpy as np
from random import shuffle
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import re

In [None]:
import logging
# Configure the root logger to print timestamped records at INFO level and above
# (presumably so that gensim's vocabulary/training progress messages are shown).
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Allow pandas to display up to 100 rows before truncating the output
pd.options.display.max_rows = 100

In [None]:
# Base directory for all input and output files; placeholder to be edited before running
dir_data = '/path/to/data/dir'
# Per-patient code sequences, later parsed by create_patient_sequences
file_pcs = path.join(dir_data, 'patient_code_sequences.txt')
# Tab-separated persons table, loaded into df_persons below
file_persons = path.join(dir_data, 'persons.csv')
# Tab-separated concept definitions, loaded into df_concepts below
file_concepts = path.join(dir_data, 'concepts.csv')
# Pickle path for parsed patient sequences
# NOTE(review): not referenced in the visible cells - confirm whether it is still needed
file_sequences = path.join(dir_data, 'patient_sequences.pkl')
# Suffix for backup files
# NOTE(review): not referenced in the visible cells - confirm whether it is still needed
file_backup_suffix = '.backup'

### Load data into dataframe

In [None]:
# Load the persons: tab-separated, first row is the header, first column becomes
# the index, and the 'birth_date' column is parsed into datetimes.
# NOTE(review): infer_datetime_format is deprecated in pandas >= 2.0 - confirm
# against the installed pandas version.
df_persons = pd.read_csv(file_persons, sep='\t', header=0, index_col=0, 
                         parse_dates=['birth_date'], infer_datetime_format=True)

# Load the concept definitions: tab-separated, indexed by the 'concept_id' column
df_concepts = pd.read_csv(file_concepts, sep='\t', header=0, index_col='concept_id')

### Load patient sequences into TaggedDocuments

In [None]:
# Helpers for reading in the patient_code_sequences.txt

# One calendar date paired with the list of concept IDs recorded on that date
DateOccurrence = namedtuple('DateOccurrence', 'date concept_ids')

def _process_pcs_line(line):
    """ Split one tab-separated line of patient_code_sequences.txt into the
    integer person ID (first field) and the list of DateOccurrences parsed
    from every remaining field. """
    # First field is the person_id, everything after it is a date-occurrence string
    person_field, *occurrence_fields = line.strip().split('\t')

    person_id = int(person_field)
    parsed_occurrences = [_process_date_occurrence_str(field) for field in occurrence_fields]

    return person_id, parsed_occurrences

def _process_date_occurrence_str(dos):
    """ Parse a single DateOccurrence string into a DateOccurrence tuple.
    Expected format: YYYY-MM-DD:<comma-separated concept IDs> """
    date_part, ids_part = dos.split(':')

    # The date is parsed first, then the concept IDs are converted to ints
    occurrence_date = datetime.strptime(date_part.strip(), '%Y-%m-%d')
    concept_ids = list(map(int, ids_part.split(',')))

    return DateOccurrence(occurrence_date, concept_ids)

def create_patient_sequences(f_pcs_in, f_seq_out=None, min_seq_length=10, randomize_order=True, verbose=False, save_intermediates=False): 
    """ Reads the patient_code_sequences.txt file and parses it into one sequence per patient

    The first line of the file is treated as a header and skipped. Every other
    non-blank line is parsed with _process_pcs_line; the concept IDs of all dates
    are concatenated (in file order) into a single sequence for that patient.

    Args:
        f_pcs_in: Path of the patient code sequences file to read.
        f_seq_out: Optional path; when given, the resulting list is pickled there.
        min_seq_length: Patients with fewer concepts than this are dropped.
        randomize_order: If True, shuffle the concepts that share the same date.
        verbose: If True, print progress every 100000 processed lines.
        save_intermediates: If True (and f_seq_out is given), pickle the partial
            result to f_seq_out + '.tmp' every 100000 lines and remove that file
            once the final pickle has been written.

    Returns:
        List of TaggedDocument(words=[concept IDs as str], tags=[person_id]).

    Note: save_intermediates makes it a lot slower """

    # Number of processed lines between progress reports / intermediate saves
    progress_interval = 100000

    # For keeping track of processing time
    t_start = time()

    # pseqs - list of TaggedDocument(words=[concept_ids], tags=[person_id])
    pseqs = []

    # Number of patient lines processed so far (including dropped short sequences)
    count = 0

    # Always bound, so later checks never depend on a conditionally defined name
    f_intermediate = (f_seq_out + '.tmp') if f_seq_out else None

    # Read patient_code_sequences.txt
    with open(f_pcs_in) as fh:
        # Skip the header line
        fh.readline()

        for line in fh:
            # A blank line (e.g. trailing newline at end of file) has no person_id to parse
            if not line.strip():
                continue

            # Parse the line into person_id and list of date_occurrences
            pid, date_occurrences = _process_pcs_line(line)

            # Combine the concepts from each date into one sequence for the patient
            current_seq = []
            for date_occurrence in date_occurrences:
                concepts = date_occurrence.concept_ids
                if randomize_order:
                    # Randomize the order of concepts occurring on the same date.
                    # Shuffle is applied in place on the freshly parsed list.
                    shuffle(concepts)

                current_seq.extend(concepts)

            if len(current_seq) >= min_seq_length:
                pseqs.append(TaggedDocument(words=[str(x) for x in current_seq], tags=[pid]))

            # Display progress
            count += 1
            if count % progress_interval == 0:
                if verbose:
                    # Number of lines processed and elapsed time in minutes
                    elapsed_min = (time() - t_start) / 60
                    print(f'{count} - {elapsed_min:.01f} min')

                if save_intermediates and f_intermediate:
                    # Save a backup copy of the data; the context manager
                    # guarantees the handle is flushed and closed.
                    with open(f_intermediate, 'wb') as fh_tmp:
                        pickle.dump(pseqs, fh_tmp, protocol=pickle.HIGHEST_PROTOCOL)

    if f_seq_out:
        # Save the patient sequences
        with open(f_seq_out, 'wb') as fh_out:
            pickle.dump(pseqs, fh_out, protocol=pickle.HIGHEST_PROTOCOL)

        # Delete the backup file now that the final result is safely written
        if save_intermediates and path.exists(f_intermediate):
            os.remove(f_intermediate)

    # Display overall processing time
    elapsed_min = (time() - t_start) / 60
    print(f'{count} - {elapsed_min:.01f} min')

    return pseqs

In [None]:
# Parse the patient code sequences into TaggedDocuments, keeping only patients with
# at least 5 concepts and shuffling concepts that share a date. Nothing is written
# to disk (f_seq_out=None, save_intermediates=False).
pseqs = create_patient_sequences(file_pcs, f_seq_out=None, min_seq_length=5, randomize_order=True, 
                                        verbose=True, save_intermediates=False)
# Number of patient sequences that passed the minimum-length filter
n_pseqs = len(pseqs)
print(n_pseqs)

### Train the models

In [None]:
def model_filename(model, epochs=None):
    """ Generate a filename under which to save the model.

    The name is built from the string representation of the model, which already
    includes most of the important model parameters. Every character that is not
    a word character, '-', '_', '.' or a space is replaced by '_'.

    Args:
        model: Object whose str() describes the model.
        epochs: Optional number of training epochs; when truthy, 'e<epochs>' is
            appended to the name.

    Returns:
        '<sanitized model string>[e<epochs>]_<YYYY-MM-DD>.d2v', using today's date.
    """
    # Raw string so that \w, \- and \. reach the regex engine unchanged instead of
    # being treated as (invalid) string escape sequences.
    f_model = re.sub(r'[^\w\-_\. ]', '_', str(model))
    if epochs:
        f_model += f'e{epochs}'
    f_model += datetime.now().strftime("_%Y-%m-%d")
    f_model += '.d2v'
    return f_model

#### Paragraph Vector - Distributed Memory

In [None]:
# Create the Doc2Vec model with dm=1 (the Distributed Memory variant, per the
# section heading): 100-dimensional vectors, context window of 7, concepts seen
# fewer than 5 times are ignored, 20 epochs, 6 worker threads.
# NOTE(review): confirm that report_delay is accepted by the Doc2Vec constructor
# of the installed gensim version; it is also passed to train() below.
model_dm = Doc2Vec(dm=1, vector_size=100, window=7, min_count=5, alpha=0.023, hs = 0, negative=15, 
                   epochs=20, workers=6, report_delay=60)

# Build Vocab from the patient sequences and time it (minutes)
t1 = time()
model_dm.build_vocab(pseqs, progress_per=1000000)
ellapsed_time = (time() - t1) / 60
print(f'Build Vocab Ellapsed Time: {ellapsed_time} min')

# Train on the same sequences, using the corpus size and epoch count already
# stored on the model, and time it (minutes)
t1 = time()
model_dm.train(pseqs, total_examples=model_dm.corpus_count, epochs=model_dm.epochs, report_delay=60)
ellapsed_time = (time() - t1) / 60
print(f'Train Ellapsed Time: {ellapsed_time} min')

# Save the model into the data directory under a name derived from its parameters
f_model = path.join(dir_data, model_filename(model_dm, epochs=model_dm.epochs))
print(f'Saving model to: {f_model}')
model_dm.save(f_model)

#### Paragraph Vector - Distributed Bag of Words

In [None]:
# Create the Doc2Vec model with dm=0 (the Distributed Bag of Words variant, per
# the section heading); all other hyperparameters match the DM model above.
# NOTE(review): confirm that report_delay is accepted by the Doc2Vec constructor
# of the installed gensim version; it is also passed to train() below.
model_dbow = Doc2Vec(dm=0, vector_size=100, window=7, min_count=5, alpha=0.023, hs = 0, negative=15, 
                     epochs=20, workers=6, report_delay=60)

# Build Vocab from the patient sequences and time it (minutes)
# NOTE(review): progress_per is 100000 here but 1000000 in the DM cell - confirm intended
t1 = time()
model_dbow.build_vocab(pseqs, progress_per=100000)
ellapsed_time = (time() - t1) / 60
print(f'Build Vocab Ellapsed Time: {ellapsed_time} min')

# Train on the same sequences, using the corpus size and epoch count already
# stored on the model, and time it (minutes)
t1 = time()
model_dbow.train(pseqs, total_examples=model_dbow.corpus_count, epochs=model_dbow.epochs, report_delay=60)
ellapsed_time = (time() - t1) / 60
print(f'Train Ellapsed Time: {ellapsed_time} min')

# Save the model into the data directory under a name derived from its parameters
f_model = path.join(dir_data, model_filename(model_dbow, epochs=model_dbow.epochs))
print(f'Saving model to: {f_model}')
model_dbow.save(f_model)