In [2]:
import pandas as pd
from concept import Concept

In [3]:
concepts = pd.read_csv('/workspaces/master_thesis/CONCEPT.csv', on_bad_lines="skip", delimiter="\t", low_memory=False)

In [4]:
synonyms = pd.read_csv('/workspaces/master_thesis/CONCEPT_SYNONYM.csv', on_bad_lines="skip", delimiter="\t", low_memory=False)

In [5]:
concepts=Concept.concatenate_concept_with_their_synonyms(concepts, synonyms, ['SNOMED'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  concepts["concept_name"] = (


In [6]:
concepts.names

['radiating chest pain',
 'urine tryptophan:creatinine ratio',
 'nu-hope belt left small 6460c 70cm-78cm length, 225mm width, 80mm opening (nu-hope laboratories inc) 1 device',
 'urine threonine:creatinine ratio',
 'nu-hope belt right large 6457a 90cm-100cm length, 200mm width, 68mm opening (nu-hope laboratories inc)',
 'urine taurine:creatinine ratio',
 'nu-hope belt left small 6460dc 70cm-78cm length, 225mm width, 72mm opening (nu-hope laboratories inc)',
 'urine phenylalanine:creatinine ratio',
 'nu-hope belt left small 6460dc 70cm-78cm length, 225mm width, 72mm opening (nu-hope laboratories inc) 1 device',
 'urine ornithine:creatinine ratio',
 'nu-hope belt right large 6457a 90cm-100cm length, 200mm width, 68mm opening (nu-hope laboratories inc) 1 device',
 'urine methionine:creatinine ratio',
 'nu-hope belt left small 6460e 70cm-78cm length, 225mm width, 50mm opening (nu-hope laboratories inc)',
 'urine lysine:creatinine ratio',
 'nu-hope belt right large 6457b 90cm-100cm length, 

In [7]:
import spacy

In [8]:
nlp = spacy.load("en_core_web_sm")

In [9]:
import re

def clean_text(
    string: str, 
    punctuations=r'''!()-[]{};:'"\,<>./?@#$%^&*_~''',
    stop_words=['the', 'a', 'and', 'is', 'be', 'will', 'cm', 'mm', 'ml']) -> str:
    """
    A method to clean text 
    """
    # Cleaning the urls
    string = re.sub(r'https?://\S+|www\.\S+', '', string)

    # Cleaning the html elements
    string = re.sub(r'<.*?>', '', string)

    # Removing the punctuations
    for x in string.lower(): 
        if x in punctuations: 
            string = string.replace(x, "") 

    # Converting the text to lower
    string = string.lower()

    # Remove numbers
    string = re.sub('[0-9]+', '', string)

    # Remove some expressions
    string = re.sub('[!@#$+=]', '', string)

    # Removing stop words
    string = ' '.join([word for word in string.split() if word not in stop_words])

    # Cleaning the whitespaces
    string = re.sub(r'\s+', ' ', string).strip()

    doc=nlp(string)

    ## Lemmatizer
    #string = " ".join([token.lemma_ for token in doc if not token.is_stop])
    

    return string 

In [13]:
all_text = []

for text in concepts.names:

    # Cleaning the text
    text = clean_text(text)

    # Appending to the all text list
    all_text.append(text.split())

In [18]:
from gensim.models import Word2Vec
model = Word2Vec(all_text, min_count=5, vector_size= 100, workers=3, window=3, sg = 0)

In [24]:
model.train(all_text, total_examples=model.corpus_count, epochs=50)

(385031034, 537933700)

In [27]:
model.save('word2vec.model')

In [30]:
model.wv.similarity('chest', 'pain')

0.2991555

In [33]:
model.wv.similarity('chest', 'level')

-0.0073986575

In [43]:
model.wv.similarity('chest', 'pain')

0.2991555

In [37]:
model.wv.most_similar('chest')

[('abdomen', 0.8219559788703918),
 ('abdominal', 0.8088624477386475),
 ('thorax', 0.7821133732795715),
 ('nasopharynx', 0.6966677308082581),
 ('oropharynx', 0.6940116882324219),
 ('pelvis', 0.6612889170646667),
 ('back', 0.6485032439231873),
 ('orbit', 0.6337751746177673),
 ('face', 0.6258650422096252),
 ('hypopharynx', 0.6215260028839111)]

In [42]:
for list in all_text:
    if 'chest' in list:
        print(list)

['radiating', 'chest', 'pain']
['contusion', 'of', 'chest']
['acute', 'chest', 'pain']
['chest', 'wall', 'pain']
['atypical', 'chest', 'pain']
['chest', 'wall', 'tenderness']
['ultrasonography', 'of', 'chest', 'abdomen']
['ultrasonography', 'of', 'chest', 'abdomen', 'pelvis']
['primary', 'less', 'than', 'ml', 'liposuction', 'of', 'chest']
['primary', 'ml', 'liposuction', 'of', 'chest']
['primary', 'ml', 'liposuction', 'of', 'chest']
['primary', 'more', 'than', 'ml', 'liposuction', 'of', 'chest']
['primary', 'pneumatically', 'assisted', 'less', 'than', 'ml', 'liposuction', 'of', 'chest']
['primary', 'pneumatically', 'assisted', 'ml', 'liposuction', 'of', 'chest']
['primary', 'pneumatically', 'assisted', 'ml', 'liposuction', 'of', 'chest']
['primary', 'pneumatically', 'assisted', 'more', 'than', 'ml', 'liposuction', 'of', 'chest']
['primary', 'external', 'ultrasound', 'assisted', 'less', 'than', 'ml', 'liposuction', 'of', 'chest']
['primary', 'external', 'ultrasound', 'assisted', 'ml', '

### Word2Vec embedding: pre-trained version + SNOMED intersection

Load Google model first 

In [47]:
from gensim.models import Word2Vec
 
#sentences = [["bad","robots"],["good","human"],['yes', 'this', 'is', 'the', 'word2vec', 'model']]
 
# size option needs to be set to 300 to be the same as Google's pre-trained model
 
word2vec_model = Word2Vec(all_text, vector_size = 300, window=5,
min_count = 1, workers = 2)
 
word2vec_model.build_vocab(all_text)
 
# assign the vectors to the vocabs that are in Google's pre-trained model and your sentences defined above.
 
# lockf needs to be set to 1.0 to allow continued training.
 
word2vec_model.wv.intersect_word2vec_format('./word2vec/GoogleNews-vectors-negative300.bin', lockf=1.0, binary=True)
 
# continue training with you own data
 
word2vec_model.train(all_text, total_examples=word2vec_model.corpus_count, epochs = 50)

FileNotFoundError: [Errno 2] No such file or directory: './word2vec/GoogleNews-vectors-negative300.bin'

TODO: 

- do lemmatisation with deleting all stop words
- compare results of the emmbeding 
- concatenate it with pre-trained embedding
- compare results 
- visualize

Questions: 

- how to make the transfer learning properly for the word2vec embedding 
- just update weights?
- or make itersection?
- is it possible to check how good is your embedding?
- or we can just create bigger corpus (question to think)