In [1]:
import sys
sys.path.append('/workspaces/de.uke.iam.automapping/src/automapping/')
import pandas as pd


#data preparation
from loader import ExcelLoader
from translator import HuggingFace
from language import Language
from preprocessor import EntityExtractor
from preprocessor import Abbreviations

#concepts
from concept import Concept

#mapping
from mapper import TfIdf
from detections import Predictions


  from .autonotebook import tqdm as notebook_tqdm


## Data Preporation

We will work with German medical-related phrases and therefore data-preporation steps will include:
1. Loading the data from Excel file
2. Replace German Medical abbreviations 
3. Translation with Hugging Face model 
4. NLP preprocessing: lemmatisation, lowercasing, punctuation and stop words removal

#### Loading the data from Excel file

In [2]:
data_file = "/workspaces/de.uke.iam.automapping/data_example/example_data.xlsx"
identifiers, variables =ExcelLoader(data_file, 'Kurzname', 'Langname', Language.GERMAN)()

In [3]:
identifiers

0    AMA_934
1    VAE_495
2    NDJ_956
3    DJD_333
Name: Kurzname, dtype: object

In [4]:
variables

0                           Kongenitale Herzerkrankung
1    Wurde Kontrastmittel für das Kardio MRT verwen...
2                     Wird ein mobiles EKG ausgegeben?
3    Ist bei Ihnen eine vom Arzt diagnostizierte KH...
Name: Langname, dtype: object

#### Replace German Medical abbreviations 
In this implementation, we used a table with abbreviations and their descriptions from [Medizinische_Abkürzungen](https://www.bionity.com/de/lexikon/Medizinische_Abk%C3%BCrzungen.html) to replace them. 


In [5]:
#Load abbreviations with desciption from Excel file
abbreviation_file = "/workspaces/de.uke.iam.automapping/data_example/german_abbreviation.xlsx"
abbreviation = Abbreviations.load_abbreviations(abbreviation_file, 'Abbreviation', 'Description')

#### Translation with Hugging Face model 

In [6]:
#Preparing translation model
model_translator=HuggingFace(Language.GERMAN, Language.ENGLISH)

In [7]:
translated_phrases=model_translator.translate(variables, abbreviation)

In [8]:
#first five translated phrases
translated_phrases[:5]

['Congenital heart disease',
 'Was contrast agent used for cardio magnetic resonance imaging?',
 'Is a mobile electrocardiogram issued?',
 'Are you aware of a coronary heart disease diagnosed by your doctor?']

#### NLP preprocessing: lemmatisation, lowercasing, punctuation and stop words removal

In [9]:
#Prepatring entity extraction model (lemmatisation, stop words removal, etc)
model_entity=EntityExtractor()

In [10]:
prep_data=model_entity(translated_phrases)

In [11]:
list_of_prep_data = list(prep_data)

In [12]:
list_of_prep_data[:5]

['congenital heart disease',
 'contrast agent cardio magnetic resonance imaging',
 'mobile electrocardiogram issue',
 'aware coronary heart disease diagnose']

## Concepts preporation 

Files were downloaded from [Athena](https://athena.ohdsi.org/search-terms/start) website 

In [14]:
concept_file = "/workspaces/de.uke.iam.automapping/src/automapping/CONCEPT.csv"
concepts = pd.read_csv(concept_file, on_bad_lines="skip", delimiter="\t", low_memory=False)
concepts.head()

Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
0,1146945,concept.concept_id,Metadata,CDM,Field,S,CDM1,20141111,20991231,
1,1146954,concept.invalid_reason,Metadata,CDM,Field,S,CDM10,20141111,20991231,
2,1147044,observation_period.observation_period_id,Metadata,CDM,Field,S,CDM100,20141111,20991231,
3,756315,metadata.metadata_type_concept_id,Metadata,CDM,Field,S,CDM1000,20210925,20991231,
4,756316,metadata.name,Metadata,CDM,Field,S,CDM1001,20210925,20991231,


In [15]:
synonym_file = "/workspaces/de.uke.iam.automapping/src/automapping/CONCEPT_SYNONYM.csv"
synonyms = pd.read_csv(synonym_file, on_bad_lines="skip", delimiter="\t", low_memory=False)
synonyms.head()

Unnamed: 0,concept_id,concept_synonym_name,language_concept_id
0,36674183,Goal Attainment Scaling-Light score,4180186
1,36674183,Goal Attainment Scaling-Light score (observabl...,4180186
2,36674184,Assessment using Goal Attainment Scaling-Light,4180186
3,36674184,Assessment using Goal Attainment Scaling-Light...,4180186
4,36674185,Burn of eye proper (disorder),4180186


In [16]:
vocabulary_file = "/workspaces/de.uke.iam.automapping/src/automapping/VOCABULARY.csv"
vocabulary = pd.read_csv(vocabulary_file, on_bad_lines="skip", delimiter="\t", low_memory=False)
vocabulary.head()

Unnamed: 0,vocabulary_id,vocabulary_name,vocabulary_reference,vocabulary_version,vocabulary_concept_id
0,,OMOP Standardized Vocabularies,OMOP generated,v5.0 04-FEB-22,44819096
1,Visit Type,OMOP Visit Type,OMOP generated,,44819150
2,OSM,OpenStreetMap,"https://www.openstreetmap.org/copyright/en, ht...",OSM Release 2019-02-21,32541
3,Type Concept,OMOP Type Concept,OMOP generated,Type Concept 20210212,32808
4,Note Type,OMOP Note Type,OMOP generated,,44819146


In [17]:
concepts=Concept.concatenate_concept_with_their_synonyms(concepts, synonyms, vocabulary, 'SNOMED')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  concepts["concept_name"] = (


### Mapping

In [18]:
model_map=TfIdf(concepts)

In [19]:
mapping=model_map(list_of_prep_data, list(identifiers))

In [20]:
df=Predictions.to_df(mapping, 5)

In [21]:
df.head()

Unnamed: 0,SourceID,SourceName,targetConceptName,targetConceptID,targetConceptCode,targetDomainID,targetVocabularyVersion,MatchScore
0,AMA_934,congenital heart disease,congenital heart disease,312723,13213009,Condition,2020-07-31 SNOMED CT International Edition; 20...,1.0
1,AMA_934,congenital heart disease,congenital heart disease service,44811433,893341000000106,Observation,2020-07-31 SNOMED CT International Edition; 20...,0.851862
2,AMA_934,congenital heart disease,heart disease,321588,56265001,Condition,2020-07-31 SNOMED CT International Edition; 20...,0.831279
3,AMA_934,congenital heart disease,congenital heart disease in pregnancy,4129018,237227006,Condition,2020-07-31 SNOMED CT International Edition; 20...,0.779252
4,AMA_934,congenital heart disease,congenital disease,440508,66091009,Condition,2020-07-31 SNOMED CT International Edition; 20...,0.774614


In [22]:
df.to_csv('/workspaces/de.uke.iam.automapping/data_example/mapping.csv', index=False)