In [1]:
import os
import sys

module_path = os.path.abspath("..")
sys.path.append(module_path)

Let's start by creating a dataset of drugs and their known indications from MedRT, which is contained in UMLS. You can request a license and download the data from this location: https://www.ncbi.nlm.nih.gov/books/NBK9685/

In [2]:
from src.indication_finding.medrt_dataset.medrt_dataset import create_medrt_dataset
from src.indication_finding.medrt_dataset.medrt_dataset import UmlsRRFReader

reader = UmlsRRFReader("s3://digitalrnd-drf-projects-ana-ireland/integrative_clinical_data/UMLS_data/META/")

relationship_frame = reader.get_frame(
    "MRREL.RRF",
    col_descriptions=False,
    usecols=["CUI1", "CUI2", "RELA", "SAB"],
    chunksize=100000,
    selection="RELA == 'has_form' or RELA == 'may_treat'",
)
concepts = reader.get_frame(
    "MRCONSO.RRF",
    col_descriptions=False,
    selection="SAB=='MSH' or SAB=='ICD10CM' or SAB=='RXNORM'",
    chunksize=100000,
)

#
# cancer indications are described here on the icd10cm website here: https://www.icd10data.com/ICD10CM/Codes/C00-D49
indications = ["C{:02d}".format(i) for i in range(0, 97)]
indications += ["C7A", "C7B"]
indications += ["D{:02d}".format(i) for i in range(0, 10)]
dataset = create_medrt_dataset(concepts, relationship_frame, indications)

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
dataset

Unnamed: 0,index,Compound_RXNORM_Label,Compound_RXNORM_SCUI,ICD10CM_CODE,ICD10CM_STR,selected_diseases_indication,CLEAN_ICD10CM_CODE,Compound_CUI
0,0,tretinoin microsphere,221175,L70.0,Acne vulgaris,False,l700,C0040845
1,1,tretinoin microsphere,221175,C92.4,Acute promyelocytic leukemia,True,c924,C0040845
2,2,"immunoglobulins, intravenous",42386,B20,acquired immune deficiency syndrome [AIDS],False,b20,C0020852
3,3,"immunoglobulins, intravenous",42386,B25,Cytomegaloviral disease,False,b25,C0020852
4,4,"immunoglobulins, intravenous",42386,M33,Dermatopolymyositis,False,m33,C0020852
...,...,...,...,...,...,...,...,...
242,242,regorafenib monohydrate,1371319,C49.A,Gastrointestinal stromal tumor,True,c49a,C2980094
243,243,regorafenib monohydrate,1371319,C22.0,Hepatocellular carcinoma,True,c220,C2980094
244,244,lutetium Lu 177 vipivotide tetraxetan,2597053,C61,Malignant neoplasm of prostate,True,c61,C5238124
245,245,sorafenib tosylate,597744,C22.0,Hepatocellular carcinoma,True,c220,C1516119


Looking at the above table, we list the RXNORM identifiers and string labels of various drugs. We also list the known indications of these drugs in ICD10CM.

To create an illustrative example of running matrix factorization on a set of patients ehrs, we create a mock dataset of person ids and events.

In [6]:
import numpy as np
import pandas as pd

all_events = list(set(dataset["ICD10CM_STR"].values))
all_persons = np.arange(0, 10000)

n_entries = 20 * max(all_persons)
random_codes = np.random.choice(all_events, size=n_entries)
random_persons = np.random.choice(all_persons, size=n_entries)

medical_events = {"person_id":random_persons, "feature_name":random_codes}
medical_events = pd.DataFrame.from_dict(medical_events)

medical_events

Unnamed: 0,person_id,feature_name
0,3943,Neutropenia
1,295,Hairy cell leukemia
2,1978,Chronic lymphocytic leukemia of B-cell type NOS
3,313,Acute basophilic leukemia
4,3498,Berylliosis
...,...,...
199975,6774,Mesothelioma
199976,170,Keratitis
199977,6955,Chronic neutrophilic leukemia
199978,3701,Hemangioma


We assume that the above table would usually be much larger including millions of patients, so our matrix factorization starts from a spark dataframe.

In [7]:
from src.train_models.mf_factory import main
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
# load some libraries to read from s3
spark.conf.set("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
spark.conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "false")

log4j:WARN No appenders could be found for logger (org.apache.spark.util.ShutdownHookManager).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.


In [9]:
medical_events_spark = spark.createDataFrame(medical_events)
medical_events_spark.show()

View job details at https://sanofi-oneai-emea-prod.cloud.databricks.com/?o=3601573116687247#/setting/clusters/0125-203003-oabm3p5v/sparkUi
+---------+--------------------+
|person_id|        feature_name|
+---------+--------------------+
|     3943|         Neutropenia|
|      295| Hairy cell leukemia|
|     1978|Chronic lymphocyt...|
|      313|Acute basophilic ...|
|     3498|         Berylliosis|
|      981| Erythema multiforme|
|     7405| Crohn's disease NOS|
|     6352|Acute promyelocyt...|
|     2405|Seborrheic dermat...|
|     2649|Chronic neutrophi...|
|     1471|Rheumatism, unspe...|
|     6987|           Psoriasis|
|     9096|Ankylosing spondy...|
|     5132|    Cryoglobulinemia|
|     7267|Microscopic polya...|
|     4830|Microscopic polya...|
|     9214|Diffuse large B-c...|
|      303|Malignant neoplas...|
|     1636| Erythema multiforme|
|     6019|Essential (hemorr...|
+---------+--------------------+
only showing top 20 rows



In [11]:
from src.utils.utils import parse_yaml_args
args = parse_yaml_args("../config/training_mf.yaml")
args["test"] = False
embeddings, embeddings_ids, training_results = main(medical_events_spark, args)

Using config: ../config/training_mf.yaml
{'mf_embedding_size': [50], 'mf_regularization': [0.05], 'mf_max_iterations': [10000], 'mf_train_split': 0.9, 'mf_split_seed': 42, 'mf_distance_measure': 'cosine', 'mf_finetune_measure': 'similarity_variance', 'mf_finetune_workers': 1, 'mf_cooccurrence_embedding_size': [100], 'mf_cooccurrence_regularization': [0.0], 'mf_cooccurrence_max_iterations': [20], 'mf_cooccurrence_train_split': 0.1, 'mf_cooccurrence_split_seed': 42, 'mf_cooccurrence_distance_measure': 'cosine', 'mf_cooccurrence_finetune_measure': 'similarity_variance', 'test': 1}
View job details at https://sanofi-oneai-emea-prod.cloud.databricks.com/?o=3601573116687247#/setting/clusters/0125-203003-oabm3p5v/sparkUi




+----+----+------+
|user|item|rating|
+----+----+------+
|   0|   1|     1|
|   0|   3|     1|
|   0|  10|     1|
|   0|  12|     1|
|   0|  21|     1|
|   0|  24|     1|
|   0|  35|     1|
|   0|  37|     1|
|   0|  38|     1|
|   0|  39|     1|
|   0|  39|     1|
|   0|  41|     1|
|   0|  42|     1|
|   0|  44|     1|
|   0|  55|     1|
|   0|  59|     1|
|   0|  62|     1|
|   0|  64|     1|
|   0|  65|     1|
|   0|  69|     1|
+----+----+------+
only showing top 20 rows

View job details at https://sanofi-oneai-emea-prod.cloud.databricks.com/?o=3601573116687247#/setting/clusters/0125-203003-oabm3p5v/sparkUi


[Stage 8197:>                                                     (0 + 32) / 32]

+----+----+------+
|user|item|rating|
+----+----+------+
|   0|  16|     1|
|   0|  41|     1|
|   0|  42|     1|
|   0|  46|     1|
|   0|  48|     1|
|   0|  52|     1|
|   0|  52|     1|
|   0|  53|     1|
|   0|  53|     1|
|   0|  59|     1|
|   0|  61|     1|
|   0|  67|     1|
|   0|  70|     1|
|   0|  71|     1|
|   0|  75|     1|
|   0|  83|     1|
|   0|  83|     1|
|   0|  88|     1|
|   1|  16|     1|
|   1|  20|     1|
+----+----+------+
only showing top 20 rows

View job details at https://sanofi-oneai-emea-prod.cloud.databricks.com/?o=3601573116687247#/setting/clusters/0125-203003-oabm3p5v/sparkUi




View job details at https://sanofi-oneai-emea-prod.cloud.databricks.com/?o=3601573116687247#/setting/clusters/0125-203003-oabm3p5v/sparkUi
MF run embedding_size: 50
MF run regularization: 0.05
MF run max_iterations: 10000
MF model: <class 'sklearn.decomposition._nmf.NMF'>
MF fine-tune measure: similarity_variance
MF best embedding_size: 50
MF best regularization: 0.05
MF best max_iterations: 10000
MF best score: 0.003398227621992873
MF re-fitting on full dataset
MF run embedding_size: 50
MF run regularization: 0.05
MF run max_iterations: 10000
MF model: <class 'sklearn.decomposition._nmf.NMF'>


The above code creates embeddings for all features. We can see the embedding values in the folllowing table:

In [12]:
embeddings

Unnamed: 0,emb_vec_0,emb_vec_1,emb_vec_2,emb_vec_3,emb_vec_4,emb_vec_5,emb_vec_6,emb_vec_7,emb_vec_8,emb_vec_9,...,emb_vec_40,emb_vec_41,emb_vec_42,emb_vec_43,emb_vec_44,emb_vec_45,emb_vec_46,emb_vec_47,emb_vec_48,emb_vec_49
0,0.069004,0.067197,0.071140,0.050774,0.067386,0.060073,0.058972,0.053346,0.052885,0.048902,...,0.040903,0.036087,0.039878,0.043390,0.034059,0.038775,0.037143,0.035621,0.040177,0.033440
1,0.074345,0.072403,0.076660,0.054716,0.072629,0.064746,0.063560,0.057500,0.057000,0.052706,...,0.044060,0.038873,0.042957,0.046741,0.036689,0.041770,0.040013,0.038371,0.043283,0.036024
2,0.072791,0.070900,0.075079,0.053590,0.071134,0.063414,0.062251,0.056316,0.055825,0.051620,...,0.043110,0.038036,0.042032,0.045738,0.035903,0.040876,0.039159,0.037555,0.042361,0.035261
3,0.070463,0.068625,0.072661,0.051861,0.068838,0.061366,0.060242,0.054497,0.054024,0.049954,...,0.041752,0.036836,0.040707,0.044294,0.034768,0.039584,0.037919,0.036364,0.041019,0.034141
4,0.071539,0.069664,0.073751,0.052637,0.069867,0.062283,0.061141,0.055309,0.054831,0.050702,...,0.042399,0.037408,0.041337,0.044977,0.035305,0.040195,0.038503,0.036926,0.041651,0.034666
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,0.069468,0.067650,0.071622,0.051118,0.067846,0.060479,0.059368,0.053707,0.053240,0.049229,...,0.041170,0.036323,0.040140,0.043677,0.034284,0.039032,0.037390,0.035857,0.040445,0.033663
88,0.071223,0.069360,0.073433,0.052411,0.069565,0.062011,0.060872,0.055068,0.054587,0.050474,...,0.042202,0.037234,0.041147,0.044773,0.035144,0.040013,0.038329,0.036758,0.041462,0.034509
89,0.068855,0.067058,0.070999,0.050676,0.067265,0.059966,0.058868,0.053255,0.052793,0.048818,...,0.040807,0.036002,0.039783,0.043289,0.033979,0.038685,0.037057,0.035539,0.040086,0.033365
90,0.069913,0.068096,0.072110,0.051472,0.068332,0.060923,0.059813,0.054112,0.053647,0.049609,...,0.041437,0.036558,0.040397,0.043956,0.034502,0.039281,0.037628,0.036086,0.040704,0.033879


We can also produce ranked lists of related similar indications given a query of two features: ["Atopic dermatitis", "Kaposi's sarcoma"].

In [13]:
from src.indication_finding.scoring.mf_ranking import get_mf_ranked_list
query = ["Atopic dermatitis", "Kaposi's sarcoma"]
get_mf_ranked_list(embeddings, embeddings_ids, pd.DataFrame(), query, [".*"])

View job details at https://sanofi-oneai-emea-prod.cloud.databricks.com/?o=3601573116687247#/setting/clusters/0125-203003-oabm3p5v/sparkUi


Unnamed: 0,feature_name,Atopic dermatitis_score,Kaposi's sarcoma_score,Atopic dermatitis_rank,Kaposi's sarcoma_rank,mean_score,median_score,mean_rank,median_rank,number_in_top_30
0,Acne vulgaris,1.0,1.0,14.0,50.0,1.0,1.0,32.0,32.0,1
62,Mycosis fungoides,1.0,1.0,15.0,48.0,1.0,1.0,31.5,31.5,1
14,Chronic lymphocytic leukemia of B-cell type NOS,1.0,1.0,18.0,46.0,1.0,1.0,32.0,32.0,1
21,Cytomegaloviral disease,1.0,1.0,23.0,33.0,1.0,1.0,28.0,28.0,1
77,Seborrheic dermatitis,1.0,1.0,24.0,31.0,1.0,1.0,27.5,27.5,1
...,...,...,...,...,...,...,...,...,...,...
90,choriocarcinoma NOS,1.0,1.0,88.0,83.0,1.0,1.0,85.5,85.5,0
33,Gastrointestinal stromal tumor,1.0,1.0,89.0,81.0,1.0,1.0,85.0,85.0,0
47,Juvenile arthritis,1.0,1.0,90.0,84.0,1.0,1.0,87.0,87.0,0
57,Mesothelioma,1.0,1.0,91.0,87.0,1.0,1.0,89.0,89.0,0
