In [2]:
import spacy
import pandas as pd
import numpy as np

from nltk.stem import PorterStemmer

In [3]:
# Shared spaCy pipeline; the preprocessing functions in this notebook read it
# as the global `nlp`, so it is loaded once here.
nlp = spacy.load(name="en_core_web_sm")

### Version with stemming 

In [3]:
def preprocessing(sample):
    """Normalise a text: lower-case it, filter tokens, and Porter-stem the rest.

    The text is lower-cased and tokenised with the notebook-level spaCy
    object ``nlp``. Stop words, punctuation and whitespace-only tokens are
    dropped; every remaining token is stemmed with NLTK's ``PorterStemmer``
    and the stems are joined with single spaces.

    Parameters
    ----------
    sample : str
        Raw text to normalise.

    Returns
    -------
    str or None
        The normalised text (an empty string if every token was filtered
        out), or ``None`` when ``sample`` is not a ``str`` -- for example a
        NaN coming from an empty CSV cell. In that case a message is printed.
    """
    if not isinstance(sample, str):
        print(f"Invalid input {sample}: expected a string.")
        return None

    stemmer = PorterStemmer()

    # Tokenise only: is_stop / is_punct / is_space are lexical attributes, so
    # the statistical pipeline components that `nlp(...)` would also run are
    # not needed and would only slow down a row-by-row apply.
    doc = nlp.make_doc(sample.lower())

    # Runs of extra whitespace come back from the tokenizer as their own
    # tokens; without the is_space check they would be joined into the result
    # as stray blanks.
    stems = [
        stemmer.stem(token.text)
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return " ".join(stems)

In [4]:
# Load the concept/synonym table. Lines pandas cannot parse are skipped
# rather than raising (on_bad_lines='skip').
# NOTE(review): absolute, machine-specific path -- consider a configurable data dir.
data = pd.read_csv(
    '/workspaces/master_thesis/mapping/standard_concepts_with_synonyms.csv',
    sep=',',
    on_bad_lines='skip',
)

In [None]:
# Element-wise normalisation of every concept name into a new column.
data['preprocessed'] = data['concept_name'].map(preprocessing)

In [7]:
# Same normalisation for the synonym column. Non-string cells (e.g. NaN for a
# missing synonym) make `preprocessing` print a message and yield None.
data['preprocessed_synonyms'] = data['concept_synonym_name'].map(preprocessing)

Invalid input nan: expected a string.


In [8]:
# Show the frame (pandas truncates the view) to spot-check the two new columns.
data

Unnamed: 0,concept_id,concept_name,concept_synonym_name,preprocessed,preprocessed_synonyms
0,4001098,Radiating chest pain,Radiating chest pain (finding),radiat chest pain,radiat chest pain find
1,37392117,Urine tryptophan:creatinine ratio,Urine tryptophan:creatinine ratio (observable ...,urin tryptophan creatinin ratio,urin tryptophan creatinin ratio observ entiti
2,37398455,Urine threonine:creatinine ratio,Urine threonine:creatinine ratio (observable e...,urin threonin creatinin ratio,urin threonin creatinin ratio observ entiti
3,37392118,Urine taurine:creatinine ratio,Urine taurine:creatinine ratio (observable ent...,urin taurin creatinin ratio,urin taurin creatinin ratio observ entiti
4,37392119,Urine phenylalanine:creatinine ratio,Urine phenylalanine:creatinine ratio (observab...,urin phenylalanin creatinin ratio,urin phenylalanin creatinin ratio observ entiti
...,...,...,...,...,...
491640,37398450,Urine homocysteine:creatinine ratio,Urine homocysteine:creatinine ratio (observabl...,urin homocystein creatinin ratio,urin homocystein creatinin ratio observ entiti
491641,37398451,Urine aspartate:creatinine ratio,Urine aspartate:creatinine ratio (observable e...,urin aspart creatinin ratio,urin aspart creatinin ratio observ entiti
491642,37398452,Urine alanine:creatinine ratio,Urine alanine:creatinine ratio (observable ent...,urin alanin creatinin ratio,urin alanin creatinin ratio observ entiti
491643,37398453,Urine valine:creatinine ratio,Urine valine:creatinine ratio (observable entity),urin valin creatinin ratio,urin valin creatinin ratio observ entiti


In [9]:
# Write the stemmed table to a path relative to the current working
# directory; the row index is not written.
data.to_csv(
    'standard_concepts_with_synonyms_preprocessed.csv',
    index=False,
)

### Version without stemming 

In [4]:
def preprocessing_without_stemming(sample):
    """Normalise a text: lower-case it and filter tokens, without stemming.

    The text is lower-cased and tokenised with the notebook-level spaCy
    object ``nlp``. Stop words, punctuation and whitespace-only tokens are
    dropped; the surviving tokens are kept verbatim and joined with single
    spaces.

    Parameters
    ----------
    sample : str
        Raw text to normalise.

    Returns
    -------
    str or None
        The normalised text (an empty string if every token was filtered
        out), or ``None`` when ``sample`` is not a ``str`` -- for example a
        NaN coming from an empty CSV cell. In that case a message is printed.
    """
    if not isinstance(sample, str):
        print(f"Invalid input {sample}: expected a string.")
        return None

    # Tokenise only: is_stop / is_punct / is_space are lexical attributes, so
    # the statistical pipeline components that `nlp(...)` would also run are
    # not needed and would only slow down a row-by-row apply.
    doc = nlp.make_doc(sample.lower())

    # Runs of extra whitespace come back from the tokenizer as their own
    # tokens; without the is_space check they would be joined into the result
    # as stray blanks.
    kept_tokens = [
        token.text
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return " ".join(kept_tokens)

In [5]:
# Re-read the raw table so the non-stemmed variant starts from the original
# columns. Lines pandas cannot parse are skipped (on_bad_lines='skip').
# NOTE(review): absolute, machine-specific path -- consider a configurable data dir.
data = pd.read_csv(
    '/workspaces/master_thesis/mapping/standard_concepts_with_synonyms.csv',
    sep=',',
    on_bad_lines='skip',
)

In [6]:
# Element-wise normalisation (no stemming) of every concept name.
data['preprocessed'] = data['concept_name'].map(preprocessing_without_stemming)

In [7]:
# Same normalisation for the synonym column. Non-string cells (e.g. NaN for a
# missing synonym) make the function print a message and yield None.
data['preprocessed_synonyms'] = data['concept_synonym_name'].map(preprocessing_without_stemming)

Invalid input nan: expected a string.


In [8]:
# Show the frame (pandas truncates the view) to spot-check the two new columns.
data

Unnamed: 0,concept_id,concept_name,concept_synonym_name,preprocessed,preprocessed_synonyms
0,4001098,Radiating chest pain,Radiating chest pain (finding),radiating chest pain,radiating chest pain finding
1,37392117,Urine tryptophan:creatinine ratio,Urine tryptophan:creatinine ratio (observable ...,urine tryptophan creatinine ratio,urine tryptophan creatinine ratio observable e...
2,37398455,Urine threonine:creatinine ratio,Urine threonine:creatinine ratio (observable e...,urine threonine creatinine ratio,urine threonine creatinine ratio observable en...
3,37392118,Urine taurine:creatinine ratio,Urine taurine:creatinine ratio (observable ent...,urine taurine creatinine ratio,urine taurine creatinine ratio observable entity
4,37392119,Urine phenylalanine:creatinine ratio,Urine phenylalanine:creatinine ratio (observab...,urine phenylalanine creatinine ratio,urine phenylalanine creatinine ratio observabl...
...,...,...,...,...,...
491640,37398450,Urine homocysteine:creatinine ratio,Urine homocysteine:creatinine ratio (observabl...,urine homocysteine creatinine ratio,urine homocysteine creatinine ratio observable...
491641,37398451,Urine aspartate:creatinine ratio,Urine aspartate:creatinine ratio (observable e...,urine aspartate creatinine ratio,urine aspartate creatinine ratio observable en...
491642,37398452,Urine alanine:creatinine ratio,Urine alanine:creatinine ratio (observable ent...,urine alanine creatinine ratio,urine alanine creatinine ratio observable entity
491643,37398453,Urine valine:creatinine ratio,Urine valine:creatinine ratio (observable entity),urine valine creatinine ratio,urine valine creatinine ratio observable entity


In [9]:
# Write the non-stemmed table to a path relative to the current working
# directory; the row index is not written.
data.to_csv(
    'standard_concepts_with_synonyms_preprocessed_without_stemming.csv',
    index=False,
)