In [1]:
import sys
sys.path.append('/workspaces/de.uke.iam.automapping/src/automapping/')
import pandas as pd


#data preparation
from loader import ExcelLoader
from translator import HuggingFace
from language import Language
from preprocessor import SpacyPreprocessor
from preprocessor import AbbreviationReplacement

#concepts
from concepts import OmopConcepts

#mapping
from mapper import TfIdf
from detection import Predictions


  from .autonotebook import tqdm as notebook_tqdm


## Data Preparation

We work with German medical phrases, so the data-preparation steps are:
1. Loading the data from an Excel file
2. Replacing German medical abbreviations
3. Translating into English with a Hugging Face model
4. NLP preprocessing: lemmatisation, lowercasing, punctuation and stop-word removal

In [2]:
# Load the source phrases from the example Excel workbook.
# 'Kurzname' / 'Langname' are the worksheet columns handed to the loader
# (presumably identifier and phrase text - confirm in loader.ExcelLoader).
data_file = "/workspaces/de.uke.iam.automapping/data_example/example_data.xlsx"
excel_loader = ExcelLoader(data_file, 'Kurzname', 'Langname')
samples = excel_loader.load(Language.GERMAN)

In [3]:
# Inspect the first loaded sample before any preparation step has run.
samples[0]

Sample(unique_id='AMA_934', content='Kongenitale Herzerkrankung', language=<Language.GERMAN: 'de'>, concepts=[])


In this implementation, abbreviations are replaced using a table of abbreviations and their descriptions taken from [Medizinische_Abkürzungen](https://www.bionity.com/de/lexikon/Medizinische_Abk%C3%BCrzungen.html).


In [4]:
# Prepare the abbreviation replacement: load the abbreviation table and its
# descriptions from the Excel file, using the two named columns.
abbreviation_file = "/workspaces/de.uke.iam.automapping/data_example/german_abbreviation.xlsx"
abbreviation = AbbreviationReplacement.load_abbreviations(
    abbreviation_file,
    'Abbreviation',
    'Description',
)

In [5]:
# Set up the Hugging Face translation model (source: German, target: English).
model_translator = HuggingFace(
    Language.GERMAN,
    Language.ENGLISH,
)

In [6]:
# Configure the spaCy-based text preprocessing.
# Available options: "lowercase", "stopwords", "punctuation", "lemmatization", "stemming"
preprocessing_steps = ["lowercase", "stopwords", "punctuation", "lemmatization"]
model_spacy = SpacyPreprocessor(preprocessing_steps)

In [7]:
# Run every sample through the preparation pipeline and keep the objects the
# steps return. Previously the results were only bound to the loop variable,
# so `samples` was updated solely through in-place mutation inside the steps;
# storing the returned samples no longer relies on that side effect.
# NOTE: the cell rebinds `samples`, so re-running it without reloading the
# data feeds already-prepared samples through the pipeline again.
prepared_samples = []
for sample in samples:
    sample = abbreviation.transform(sample)      # Step 2: replace German medical abbreviations
    sample = model_translator.translate(sample)  # Step 3: translate German -> English
    sample = model_spacy.transform(sample)       # Step 4: NLP preprocessing
    prepared_samples.append(sample)
samples = prepared_samples

In [8]:
# Inspect the first sample again after the preparation loop has run.
samples[0]

Sample(unique_id='AMA_934', content='congenital heart disease', language=<Language.ENGLISH: 'en'>, concepts=[])

## Concept Preparation

The vocabulary files used below (`CONCEPT.csv`, `CONCEPT_SYNONYM.csv`, `VOCABULARY.csv`) were downloaded from the [Athena](https://athena.ohdsi.org/search-terms/start) website.

In [9]:
# Read the tab-separated concept table; malformed rows are skipped rather
# than raising, so the frame may hold fewer rows than the file.
concept_file = "/workspaces/de.uke.iam.automapping/src/automapping/CONCEPT.csv"
concepts = pd.read_csv(
    concept_file,
    sep="\t",
    on_bad_lines="skip",
    low_memory=False,
)
concepts.head()

Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
0,1146945,concept.concept_id,Metadata,CDM,Field,S,CDM1,20141111,20991231,
1,1146954,concept.invalid_reason,Metadata,CDM,Field,S,CDM10,20141111,20991231,
2,1147044,observation_period.observation_period_id,Metadata,CDM,Field,S,CDM100,20141111,20991231,
3,756315,metadata.metadata_type_concept_id,Metadata,CDM,Field,S,CDM1000,20210925,20991231,
4,756316,metadata.name,Metadata,CDM,Field,S,CDM1001,20210925,20991231,


In [10]:
# Read the tab-separated concept-synonym table; malformed rows are skipped
# rather than raising.
synonym_file = "/workspaces/de.uke.iam.automapping/src/automapping/CONCEPT_SYNONYM.csv"
synonyms = pd.read_csv(
    synonym_file,
    sep="\t",
    on_bad_lines="skip",
    low_memory=False,
)
synonyms.head()

Unnamed: 0,concept_id,concept_synonym_name,language_concept_id
0,36674183,Goal Attainment Scaling-Light score,4180186
1,36674183,Goal Attainment Scaling-Light score (observabl...,4180186
2,36674184,Assessment using Goal Attainment Scaling-Light,4180186
3,36674184,Assessment using Goal Attainment Scaling-Light...,4180186
4,36674185,Burn of eye proper (disorder),4180186


In [11]:
# Read the tab-separated vocabulary table; malformed rows are skipped rather
# than raising.
vocabulary_file = "/workspaces/de.uke.iam.automapping/src/automapping/VOCABULARY.csv"
vocabulary = pd.read_csv(
    vocabulary_file,
    sep="\t",
    on_bad_lines="skip",
    low_memory=False,
)
vocabulary.head()

Unnamed: 0,vocabulary_id,vocabulary_name,vocabulary_reference,vocabulary_version,vocabulary_concept_id
0,,OMOP Standardized Vocabularies,OMOP generated,v5.0 04-FEB-22,44819096
1,Visit Type,OMOP Visit Type,OMOP generated,,44819150
2,OSM,OpenStreetMap,"https://www.openstreetmap.org/copyright/en, ht...",OSM Release 2019-02-21,32541
3,Type Concept,OMOP Type Concept,OMOP generated,Type Concept 20210212,32808
4,Note Type,OMOP Note Type,OMOP generated,,44819146


In [13]:
# Combine the concept table with its synonyms for the 'SNOMED' vocabulary.
# NOTE: `concepts` is both an input and the result here, so re-running this
# cell without re-running the CONCEPT.csv load passes the already combined
# frame back in.
concepts = OmopConcepts.concatenate_concept_with_their_synonyms(
    concepts,
    synonyms,
    vocabulary,
    'SNOMED',
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  concepts["concept_name"] = (


## Mapping

In [14]:
# Create the TF-IDF mapper from the prepared concept table
# (presumably the vectoriser is fitted here - confirm in mapper.TfIdf).
model_map = TfIdf(concepts)

In [15]:
# Number of candidate rows requested per sample from Predictions.to_df.
TOP_K = 5

# Collect one result frame per sample and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies every
# previously collected row on each iteration and starts from an empty frame.
result_frames = []
for sample in samples:
    sample = model_map(sample)  # apply the TF-IDF mapper to the prepared sample
    predictions = Predictions(sample)
    result_frames.append(Predictions.to_df(predictions, TOP_K))

# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame when there are no samples (same result as the original loop).
df = pd.concat(result_frames, ignore_index=True) if result_frames else pd.DataFrame()

In [16]:
# Display the collected mapping candidates for all samples.
df

Unnamed: 0,SourceID,SourceName,targetConceptName,targetConceptID,targetConceptCode,targetDomainID,targetVocabularyVersion,MatchScore
0,AMA_934,congenital heart disease,congenital heart disease,312723,13213009,Condition,2020-07-31 SNOMED CT International Edition; 20...,1.0
1,AMA_934,congenital heart disease,congenital heart disease service,44811433,893341000000106,Observation,2020-07-31 SNOMED CT International Edition; 20...,0.851862
2,AMA_934,congenital heart disease,heart disease,321588,56265001,Condition,2020-07-31 SNOMED CT International Edition; 20...,0.831279
3,AMA_934,congenital heart disease,congenital heart disease in pregnancy,4129018,237227006,Condition,2020-07-31 SNOMED CT International Edition; 20...,0.779252
4,AMA_934,congenital heart disease,congenital disease,440508,66091009,Condition,2020-07-31 SNOMED CT International Edition; 20...,0.774614
5,VAE_495,contrast agent cardio magnetic resonance imaging,magnetic resonance imaging contrast medium agent,4169767,419909004,Device,2020-07-31 SNOMED CT International Edition; 20...,0.776471
6,VAE_495,contrast agent cardio magnetic resonance imaging,magnetic resonance imaging with contrast,4198856,51619007,Synonym,2020-07-31 SNOMED CT International Edition; 20...,0.694557
7,VAE_495,contrast agent cardio magnetic resonance imaging,magnetic resonance imaging of thigh with contrast,37397436,718071002,Synonym,2020-07-31 SNOMED CT International Edition; 20...,0.666171
8,VAE_495,contrast agent cardio magnetic resonance imaging,magnetic resonance imaging of knee with contrast,4332928,432719005,Synonym,2020-07-31 SNOMED CT International Edition; 20...,0.661451
9,VAE_495,contrast agent cardio magnetic resonance imaging,magnetic resonance imaging without contrast,4231864,90084008,Synonym,2020-07-31 SNOMED CT International Edition; 20...,0.651381


In [17]:
# Write the mapping candidates to CSV; the DataFrame index is not exported.
mapping_file = '/workspaces/de.uke.iam.automapping/data_example/mapping.csv'
df.to_csv(mapping_file, index=False)