In [11]:
import pandas as pd
from concept import Concept

## Prepare corpus 

In [12]:
concepts = pd.read_csv('/workspaces/master_thesis/CONCEPT.csv', on_bad_lines="skip", delimiter="\t", low_memory=False)

In [13]:
synonyms = pd.read_csv('/workspaces/master_thesis/CONCEPT_SYNONYM.csv', on_bad_lines="skip", delimiter="\t", low_memory=False)

In [14]:
concepts=Concept.concatenate_concept_with_their_synonyms(concepts, synonyms, ['SNOMED'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  concepts["concept_name"] = (


In [15]:
concepts.names

['radiating chest pain',
 'urine tryptophan:creatinine ratio',
 'nu-hope belt left small 6460c 70cm-78cm length, 225mm width, 80mm opening (nu-hope laboratories inc) 1 device',
 'urine threonine:creatinine ratio',
 'nu-hope belt right large 6457a 90cm-100cm length, 200mm width, 68mm opening (nu-hope laboratories inc)',
 'urine taurine:creatinine ratio',
 'nu-hope belt left small 6460dc 70cm-78cm length, 225mm width, 72mm opening (nu-hope laboratories inc)',
 'urine phenylalanine:creatinine ratio',
 'nu-hope belt left small 6460dc 70cm-78cm length, 225mm width, 72mm opening (nu-hope laboratories inc) 1 device',
 'urine ornithine:creatinine ratio',
 'nu-hope belt right large 6457a 90cm-100cm length, 200mm width, 68mm opening (nu-hope laboratories inc) 1 device',
 'urine methionine:creatinine ratio',
 'nu-hope belt left small 6460e 70cm-78cm length, 225mm width, 50mm opening (nu-hope laboratories inc)',
 'urine lysine:creatinine ratio',
 'nu-hope belt right large 6457b 90cm-100cm length, 

In [16]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [17]:
#python -m spacy download en_core_web_sm #put it inside docker

In [18]:
import spacy

In [19]:
nlp = spacy.load("en_core_web_sm")

In [20]:
import re

def clean_text(
    string: str, 
    punctuations=r'''!()-[]{};:'"\,<>./?@#$%^&*_~''',
    stop_words=['the', 'a', 'and', 'is', 'be', 'will', 'cm', 'mm', 'ml']) -> str:
    """
    A method to clean text 
    """
    # Cleaning the urls
    string = re.sub(r'https?://\S+|www\.\S+', '', string)

    # Cleaning the html elements
    string = re.sub(r'<.*?>', '', string)

    # Removing the punctuations
    for x in string.lower(): 
        if x in punctuations: 
            string = string.replace(x, "") 

    # Converting the text to lower
    string = string.lower()

    # Remove numbers
    string = re.sub('[0-9]+', '', string)

    # Remove some expressions
    string = re.sub('[!@#$+=]', '', string)

    # Removing stop words
    string = ' '.join([word for word in string.split() if word not in stop_words])

    # Cleaning the whitespaces
    string = re.sub(r'\s+', ' ', string).strip()

    doc=nlp(string)

    ## Lemmatizer
    string = " ".join([token.lemma_ for token in doc if not token.is_stop])
    

    return string 

In [21]:
all_text = []

for text in concepts.names:

    # Cleaning the text
    text = clean_text(text)

    # Appending to the all text list
    all_text.append(text.split())

In [22]:
all_text

[['radiate', 'chest', 'pain'],
 ['urine', 'tryptophancreatinine', 'ratio'],
 ['nuhope',
  'belt',
  'leave',
  'small',
  'c',
  'cmcm',
  'length',
  'width',
  'opening',
  'nuhope',
  'laboratories',
  'inc',
  'device'],
 ['urine', 'threoninecreatinine', 'ratio'],
 ['nuhope',
  'belt',
  'right',
  'large',
  'cmcm',
  'length',
  'width',
  'opening',
  'nuhope',
  'laboratories',
  'inc'],
 ['urine', 'taurinecreatinine', 'ratio'],
 ['nuhope',
  'belt',
  'leave',
  'small',
  'dc',
  'cmcm',
  'length',
  'width',
  'opening',
  'nuhope',
  'laboratories',
  'inc'],
 ['urine', 'phenylalaninecreatinine', 'ratio'],
 ['nuhope',
  'belt',
  'leave',
  'small',
  'dc',
  'cmcm',
  'length',
  'width',
  'opening',
  'nuhope',
  'laboratories',
  'inc',
  'device'],
 ['urine', 'ornithinecreatinine', 'ratio'],
 ['nuhope',
  'belt',
  'right',
  'large',
  'cmcm',
  'length',
  'width',
  'opening',
  'nuhope',
  'laboratories',
  'inc',
  'device'],
 ['urine', 'methioninecreatinine', 'r

In [23]:
import pickle
with open("test", "wb") as fp:   #Pickling
  pickle.dump(all_text, fp)

In [24]:
with open("test", "rb") as fp:   # Unpickling
  b = pickle.load(fp)

Check some evaluation metrics for example anology task and how word2vec works with sentences 

## Small experiments with only SNOMED corpus 

In [26]:
from gensim.models import Word2Vec
model = Word2Vec(all_text, min_count=5, vector_size= 100, workers=3, window=3, sg = 0)

In [27]:
model.train(all_text, total_examples=model.corpus_count, epochs=50)

(348443097, 489164800)

In [28]:
model.save('word2vec_1.model')

In [29]:
model.wv.similarity('chest', 'pain')

0.12593678

In [30]:
model.wv.similarity('chest', 'level')

-0.02217982

In [32]:
model.wv.similarity('chest', 'pain')

0.12593678

In [33]:
model.wv.most_similar('chest')

[('abdominal', 0.7387703061103821),
 ('thorax', 0.6483594179153442),
 ('abdoman', 0.6319321393966675),
 ('abdomen', 0.5777202844619751),
 ('inferoposterior', 0.5271478295326233),
 ('oropharynx', 0.5188896059989929),
 ('pellitory', 0.51680988073349),
 ('trachea', 0.5071685910224915),
 ('tragus', 0.48487192392349243),
 ('nasopharynx', 0.47951093316078186)]

## Evaluation 

In [34]:
!wget https://raw.githubusercontent.com/tmikolov/word2vec/master/questions-words.txt

--2023-01-24 18:17:08--  https://raw.githubusercontent.com/tmikolov/word2vec/master/questions-words.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 603955 (590K) [text/plain]
Saving to: ‘questions-words.txt’


2023-01-24 18:17:09 (5.72 MB/s) - ‘questions-words.txt’ saved [603955/603955]



In [35]:
model.wv.evaluate_word_analogies('questions-words.txt')

(0.03454231433506045,
 [{'section': 'capital-common-countries',
   'correct': [('BERLIN', 'GERMANY', 'PARIS', 'FRANCE'),
    ('OTTAWA', 'CANADA', 'PARIS', 'FRANCE')],
   'incorrect': [('BANGKOK', 'THAILAND', 'BEIJING', 'CHINA'),
    ('BANGKOK', 'THAILAND', 'BERLIN', 'GERMANY'),
    ('BANGKOK', 'THAILAND', 'BERN', 'SWITZERLAND'),
    ('BANGKOK', 'THAILAND', 'HAVANA', 'CUBA'),
    ('BANGKOK', 'THAILAND', 'LONDON', 'ENGLAND'),
    ('BANGKOK', 'THAILAND', 'MADRID', 'SPAIN'),
    ('BANGKOK', 'THAILAND', 'OTTAWA', 'CANADA'),
    ('BANGKOK', 'THAILAND', 'PARIS', 'FRANCE'),
    ('BANGKOK', 'THAILAND', 'STOCKHOLM', 'SWEDEN'),
    ('BEIJING', 'CHINA', 'BERLIN', 'GERMANY'),
    ('BEIJING', 'CHINA', 'BERN', 'SWITZERLAND'),
    ('BEIJING', 'CHINA', 'HAVANA', 'CUBA'),
    ('BEIJING', 'CHINA', 'LONDON', 'ENGLAND'),
    ('BEIJING', 'CHINA', 'MADRID', 'SPAIN'),
    ('BEIJING', 'CHINA', 'OTTAWA', 'CANADA'),
    ('BEIJING', 'CHINA', 'PARIS', 'FRANCE'),
    ('BEIJING', 'CHINA', 'STOCKHOLM', 'SWEDEN'),
   

In [41]:
model.wv.evaluate_word_analogies('/workspaces/master_thesis/analogy_questions_umls.txt')

(0.006269449020684606,
 [{'section': 'UMLS_causative',
   'correct': [('SUNBURN', 'SUNLIGHT', 'SCHISTOSOMIASIS', 'SCHISTOSOMA'),
    ('TYPHUS', 'RICKETTSIA', 'SCHISTOSOMIASIS', 'SCHISTOSOMA'),
    ('TYPHUS', 'RICKETTSIA', 'EHRLICHIOSIS', 'EHRLICHIA'),
    ('TOXOCARIASIS', 'TOXOCARA', 'EHRLICHIOSIS', 'EHRLICHIA'),
    ('TRYPANOSOMIASIS', 'TRYPANOSOMA', 'SCHISTOSOMIASIS', 'SCHISTOSOMA'),
    ('VIREMIA', 'VIRUS', 'SCHISTOSOMIASIS', 'SCHISTOSOMA'),
    ('TOXOCARIASIS', 'TOXOCARA', 'SCHISTOSOMIASIS', 'SCHISTOSOMA')],
   'incorrect': [('PSITTACOSIS', 'CHLAMYDIA', 'OSTEOMYELITIS', 'ORGANISM'),
    ('EHRLICHIOSIS', 'EHRLICHIA', 'COCCIDIOSIS', 'COCCIDIA'),
    ('SCHISTOSOMIASIS', 'TREMATODA', 'ASPERGILLOSIS', 'ASPERGILLUS'),
    ('POLIOMYELITIS', 'ENTEROVIRUS', 'OSTEOMYELITIS', 'ORGANISM'),
    ('MYIASIS', 'INSECTA', 'EHRLICHIOSIS', 'EHRLICHIA'),
    ('PYELONEPHRITIS', 'ORGANISM', 'LEISHMANIASIS', 'PROTOZOA'),
    ('PSITTACOSIS', 'CHLAMYDIA', 'PNEUMOCONIOSIS', 'DUST'),
    ('OSTEOMYELITIS', 'OR

## Word2Vec embedding: pre-trained version + SNOMED intersection

Load Google model first 

https://phdstatsphys.wordpress.com/2018/12/27/word2vec-how-to-train-and-update-it/

In [None]:
from gensim.models import Word2Vec
 
sentences = [["bad","robots"],["good","human"],['yes', 'this', 'is', 'the', 'word2vec', 'model']]
 
# size option needs to be set to 300 to be the same as Google's pre-trained model
 
word2vec_model = Word2Vec(vector_size = 300, window=5,
min_count = 1, workers = 2)
 
word2vec_model.build_vocab(sentences)
 

# assign the vectors to the vocabs that are in Google's pre-trained model and your sentences defined above.
 
# lockf needs to be set to 1.0 to allow continued training.
 
word2vec_model.wv.intersect_word2vec_format('/workspaces/master_thesis/GoogleNews-vectors-negative300.bin', lockf=1.0, binary=True)
 
# continue training with you own data
 
word2vec_model.train(sentences, total_examples=3, epochs = 50)

IndexError: index 3 is out of bounds for axis 0 with size 1

TODO: 

- do lemmatisation with deleting all stop words
- compare results of the emmbeding 
- concatenate it with pre-trained embedding
- compare results 
- visualize

Questions: 

- how to make the transfer learning properly for the word2vec embedding 
- just update weights?
- or make itersection?
- is it possible to check how good is your embedding?
- or we can just create bigger corpus (question to think)


- overfitting in discussaion about word2vec: So one was about overfitting the word embeddings. I think in just creating the vectors, overfitting is only the case when your training datatset is way too small. I think this might be an interesting work:
