In [139]:
# Author: Dr. Steven C. Lindo
# Date: Fall 2023
# Desc: JumpStart Code for Students
# Revision History
# -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -
# Name        Date          Description
# scl         10/28/2024    create NLP Review for Students
# scl         10/30/2024    refactor to read files from a google drive
# cc          11/9/2024     added code to intake data from snomed
#
# -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -

In [140]:
import nltk
nltk.download('punkt')
nltk.download('brown')

# NER tool for python
import spacy
import re

#import requests
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from string import digits, punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# load the tokenizer, tagger, ...from spacy
spacy_nlp = spacy.load("en_core_web_sm")
pd.set_option("display.max_rows", 2000)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [141]:
import os

from google.colab import drive
drive.mount('/content/drive')

location ='drive/MyDrive/SnomedData/'
list_of_files = os.listdir(location)
list_of_files

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


['snomed0.txt',
 'snomed1.txt',
 'snomed2.txt',
 'snomed4.txt',
 'snomed8.txt',
 'snomed7.txt',
 'snomed9.txt',
 'snomed6.txt',
 'snomed5.txt',
 'snomed10.txt',
 'snomed14.txt',
 'snomed11.txt',
 'snomed13.txt',
 'snomed12.txt',
 'snomed3.txt',
 'snomed15.txt']

In [142]:
# -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -
def f_cleanCorpus(c):
  cc = []

  for doc in c:
    doc = str(doc.lower())
    remove_digits = str.maketrans('', '', digits)
    remove_pipes = str.maketrans('','','|')
    clean_doc = doc.translate(remove_digits)
    clean_doc = clean_doc.translate(remove_pipes)
    clean_doc = clean_doc.replace(' ','')
    clean_doc = clean_doc.replace('\n','')


    cc.append(clean_doc)

  return cc
# -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -

# -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -
def loadCorpus(files):
  # empty corpus of abstracts
  corpus = []

  # create the content corpus
  for fn in files:
    f = open(location+fn, 'r')
    _text = f.read()
    _text.lower()
    corpus.append(_text)

  # clean the corpus
  clean_corpus = f_cleanCorpus(corpus)
  #print(clean_corpus)
  return clean_corpus
# -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -

# -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -
def checkIdx(idx,text):
  if idx == -1:
    idx = len(text)
  return idx

def f_createEntities(allCorpus):
  entities = []

  for corp in allCorpus:
    newDict = {}

    #get name
    nextIdx = ((corp.find("===")) if (corp.find("===") != -1) else (corp.find("<<<")))
    nextIdx = checkIdx(nextIdx,corp)
    newDict["name"] = corp[:nextIdx]

    #get isA relationships
    corp = corp[nextIdx+3:]
    nextIdx = corp.find(":")
    nextIdx = checkIdx(nextIdx,corp)
    currText = corp[:nextIdx]
    newDict["isA"] = currText.split("+")

    #get attributes
    corp = corp[nextIdx+2:]
    nextIdx = corp.find("}")
    nextIdx = checkIdx(nextIdx,corp)
    currText = corp[:nextIdx]
    attributes = currText.split(",")
    if len(attributes) == 0:
      entities.append(newDict)
      continue
    attDict = {}
    for att in attributes:
      splitAtt = att.split("=")
      if len(splitAtt) != 2:
        continue
      attDict[splitAtt[0]] = splitAtt[1]
    newDict["attributes"] = attDict

    entities.append(newDict)

  return entities
# -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -

In [143]:
def main():
  list_entities = []

  content_corpus = loadCorpus(list_of_files)
  list_entities = f_createEntities(content_corpus)
  for ent in list_entities:
    print(ent)
    print("\n")

  return 0

In [144]:
r  = main()
r

{'name': 'cataract(disorder)', 'isA': ['cataractfinding(finding)', 'degenerativedisorderofeye(disorder)', 'disorderoflens(disorder)', 'lesionofeye(disorder)'], 'attributes': {'findingsite(attribute)': 'structureoflensofeye(bodystructure)', 'associatedmorphology(attribute)': 'abnormallyopaquestructure(morphologicabnormality)'}}


{'name': 'glaucoma(disorder)', 'isA': ['disorderofeyeproper(disorder)'], 'attributes': {'findingsite(attribute)': 'structureofeyeproper(bodystructure)'}}


{'name': 'age-relatedmaculardegeneration(disorder)', 'isA': ['degenerativedisorderofmacula(disorder)'], 'attributes': {'findingsite(attribute)': 'maculaluteastructure(bodystructure)', 'associatedmorphology(attribute)': 'degenerativeabnormality(morphologicabnormality)'}}


{'name': 'blurringofvisualimage(finding)', 'isA': ['findingofclarityofvisualimage(finding)', 'findingofheadregion(finding)'], 'attributes': {'findingsite(attribute)': 'eyeregionstructure(bodystructure)'}}


{'name': 'conventionalreleaseeyed

0