In [1]:
# Author: Dr. Steven C. Lindo
# Date: Fall 2023
# Desc: JumpStart Code for Students
# Revision History
# -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -
# Name        Date          Description
# scl         10/28/2024    create NLP Review for Students
# scl         10/30/2024    refactor to read files from a google drive
# rar         11/7/2024     Fetched data from API
#                           Cleaned data - removed stopwords
# rar         11/8/2024     Added lemmatization and thesaurus
#                           Implement KWIC
# -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -

In [13]:
import nltk
nltk.download('punkt')
nltk.download('brown')

# NER tool for python
import spacy
import re

#import requests
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from string import digits, punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# load the tokenizer, tagger, ...from spacy
spacy_nlp = spacy.load("en_core_web_sm")
pd.set_option("display.max_rows", 2000)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [15]:
import requests  # To make HTTP requests
import json      # To handle the JSON data

# Define the API endpoint and search parameters
url = "https://api.fda.gov/drug/label.json"
query = {
    "search": "indications_and_usage:(glaucoma OR conjunctivitis OR uveitis OR dry eye)",
    "limit": 100
}

# Make the GET request to fetch data
response = requests.get(url, params=query)

# Check if the request was successful
if response.status_code == 200:
    # Parse the json data
    data = response.json()

    print(json.dumps(data, indent=2))
else:
    print(f"Error: {response.status_code}")
    data = {}


{
  "meta": {
    "disclaimer": "Do not rely on openFDA to make decisions regarding medical care. While we make every effort to ensure that data is accurate, you should assume all results are unvalidated. We may limit or otherwise restrict your access to the API in line with our Terms of Service.",
    "terms": "https://open.fda.gov/terms/",
    "license": "https://open.fda.gov/license/",
    "last_updated": "2024-11-08",
    "results": {
      "skip": 0,
      "limit": 100,
      "total": 11106
    }
  },
  "results": [
    {
      "effective_time": "20160809",
      "drug_interactions": [
        "Drug Interactions In vitro studies were conducted to investigate the potential of gabapentin to inhibit the major cytochrome P450 enzymes (CYP1A2, CYP2A6, CYP2C9, CYP2C19, CYP2D6, CYP2E1, and CYP3A4) that mediate drug and xenobiotic metabolism using isoform selective marker substrates and human liver microsomal preparations. Only at the highest concentration tested (171 mcg/mL; 1 mM) was a 

In [32]:
# -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -
# Clean
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('punkt')

def f_cleanCorpus(c):
  cc = []

  for doc in c:
    # Make lowercase, get rid of commas
    doc = str(doc.lower())
    doc = doc.translate(str.maketrans('', '',punctuation))
    remove_digits = str.maketrans('', '', digits)
    clean_doc = doc.translate(remove_digits)

    # -    -    Other things ToDo  -    -    -    -    -
    # 1. remove stop-words
    # Download the stop words, go through every word and if stop word,
    # don't add it to the filtered list
    doc = spacy_nlp(clean_doc)
    filtered_words = [token.text for token in doc if not token.is_stop]

    # 2. compress with lemm
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
    clean_doc = ' '.join(lemmatized_words)
    # 3. compress with thesaurus
    # -    -    -    -    -    -    -    -    -    -    -
    compressed_tokens = []

    for token in lemmatized_words:
      synonyms = wordnet.synsets(token)
      if synonyms:
          shortest_synonym = min(synonyms, key=lambda syn: len(syn.lemmas()[0].name()))
          compressed_tokens.append(shortest_synonym.lemmas()[0].name())
      else:
          compressed_tokens.append(token)
    clean_doc = ' '.join(compressed_tokens)
    cc.append(clean_doc)

  return cc
# -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -

# -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -
def loadCorpus(data):
  # empty corpus of abstracts
  corpus = []

# Check if results is a key in json data
  if "results" in data:
    for entry in data["results"]:
      if "indications_and_usage" in entry:
        text = " ".join(entry["indications_and_usage"])
        corpus.append(text)

  # clean the corpus
  clean_corpus = f_cleanCorpus(corpus)
  # Print cleaned corpus
  for idx, doc in enumerate(clean_corpus):
    print(f"Document {idx + 1}:\n{doc}\n{'-' * 40}")
  return clean_corpus
# -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -

# -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -
def isRelevantDocument(doc):
    # List of keywords related to eye conditions
    eye_condition_keywords = [
        "glaucoma", "cataract", "macular degeneration", "amd", "diabetic retinopathy",
        "retinopathy", "dry eye", "uveitis", "keratitis", "conjunctivitis", "ocular hypertension"
    ]
    eye_medication_keywords = [
        "latanoprost", "bimatoprost", "travoprost", "timolol", "brimonidine", "dorzolamide",
        "acetazolamide", "triamcinolone", "bevacizumab", "ranibizumab", "pegaptanib",
        "aflibercept", "cyclopentolate", "tropicamide", "atropine", "prednisolone",
        "cyclosporine", "antiglaucoma", "mydriatic", "ophthalmic solution", "eye drop"
    ]
    relevant_keywords = eye_condition_keywords + eye_medication_keywords
    return any(keyword in doc for keyword in eye_condition_keywords)

# Key word in context
def kwic_ngram(text, phrase, window_size=5):
  # Generate the KWIC lines
  words = text.split()
  kwic_= []
  phrase_words = phrase.split()

  # Find the words we want
  for i in range(len(words)):
    if phrase in words[i]:
      start = max(i - window_size, 0)
      end = min(i + window_size + 1, len(words))

      context = ' '.join(words[start:end])
      kwic_.append(context)

  # // todo add your code here
  # //

  return kwic_

if data:
    clean_corpus = loadCorpus(data)

    # Filter the cleaned corpus for relevant documents
    filtered_corpus = [doc for doc in clean_corpus if isRelevantDocument(doc)]
    print("Filtered Corpus:")
    for doc in filtered_corpus:
        print(doc)

# -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Document 1:
reading   use gabapentin pad usp read medicine lead start aim gabapentin time refill new data data put talk healthcare supplier checkup condition treatment crucial data know gabapentin end aim gabapentin talk healthcare supplier end gabapentin suddenly cause problem gabapentin cause effect admit   like anticonvulsant drug gabapentin cause self-destructive idea action small act people     healthcare supplier good off symptom specially new bad worry idea suicide die try give suicide new bad low new bad anxiety feel stir restless panic fire fuss sleep insomnia new bad temper act aggressive angry fierce act dangerous urge extreme addition action talk mania unusual change behavior mood watch early symptom self-destructive idea action pay care change specially sudden change mood behavior idea feel follow-up visit healthcare supplier schedule healthcare supplier visit want specially worry symptom end aim gabapentin talk healthcare supplier end gabapentin suddenly cause problem end

In [33]:
print(data)



In [34]:
loadCorpus(data)

Document 1:
reading   use gabapentin pad usp read medicine lead start aim gabapentin time refill new data data put talk healthcare supplier checkup condition treatment crucial data know gabapentin end aim gabapentin talk healthcare supplier end gabapentin suddenly cause problem gabapentin cause effect admit   like anticonvulsant drug gabapentin cause self-destructive idea action small act people     healthcare supplier good off symptom specially new bad worry idea suicide die try give suicide new bad low new bad anxiety feel stir restless panic fire fuss sleep insomnia new bad temper act aggressive angry fierce act dangerous urge extreme addition action talk mania unusual change behavior mood watch early symptom self-destructive idea action pay care change specially sudden change mood behavior idea feel follow-up visit healthcare supplier schedule healthcare supplier visit want specially worry symptom end aim gabapentin talk healthcare supplier end gabapentin suddenly cause problem end

['reading   use gabapentin pad usp read medicine lead start aim gabapentin time refill new data data put talk healthcare supplier checkup condition treatment crucial data know gabapentin end aim gabapentin talk healthcare supplier end gabapentin suddenly cause problem gabapentin cause effect admit   like anticonvulsant drug gabapentin cause self-destructive idea action small act people     healthcare supplier good off symptom specially new bad worry idea suicide die try give suicide new bad low new bad anxiety feel stir restless panic fire fuss sleep insomnia new bad temper act aggressive angry fierce act dangerous urge extreme addition action talk mania unusual change behavior mood watch early symptom self-destructive idea action pay care change specially sudden change mood behavior idea feel follow-up visit healthcare supplier schedule healthcare supplier visit want specially worry symptom end aim gabapentin talk healthcare supplier end gabapentin suddenly cause problem end seizure m

In [35]:
text = "The treatment for glaucoma can include medication or surgery, depending on the severity of the disease."
phrase = "glaucoma"
window_size = 3

kwic_results = kwic_ngram(text, phrase, window_size)
for result in kwic_results:
    print(result)


The treatment for glaucoma can include medication


In [36]:
def main():
  list_entities = []
  list_kwic = []

  content_corpus = loadCorpus(data)
  for document in content_corpus:
    nlp_obj = spacy_nlp(document)

    # - scl -  use these to help debug issues
    spacy.displacy.render(nlp_obj, style="ent")
    # print(ent.text, ent.start_char, ent.end_char, ent.label_)
    # print(ent.label_, '\t', spacy.explain(ent.label_))

    for ent in nlp_obj.ents:
      list_entities.append([ent.text, ent.start_char, ent.end_char, ent.label_])

    df_entities = pd.DataFrame(list_entities, columns=['entity','start','end','label'] )
    #df_entities = df_entities[df_entities['label'] == 'PERSON']

  return df_entities

In [37]:
r  = main()
r

Document 1:
reading   use gabapentin pad usp read medicine lead start aim gabapentin time refill new data data put talk healthcare supplier checkup condition treatment crucial data know gabapentin end aim gabapentin talk healthcare supplier end gabapentin suddenly cause problem gabapentin cause effect admit   like anticonvulsant drug gabapentin cause self-destructive idea action small act people     healthcare supplier good off symptom specially new bad worry idea suicide die try give suicide new bad low new bad anxiety feel stir restless panic fire fuss sleep insomnia new bad temper act aggressive angry fierce act dangerous urge extreme addition action talk mania unusual change behavior mood watch early symptom self-destructive idea action pay care change specially sudden change mood behavior idea feel follow-up visit healthcare supplier schedule healthcare supplier visit want specially worry symptom end aim gabapentin talk healthcare supplier end gabapentin suddenly cause problem end



Unnamed: 0,entity,start,end,label
0,gabapentin,14,24,NORP
1,gabapentin time,62,77,LOC
2,gabapentin,174,184,NORP
3,gabapentin talk healthcare supplier end,193,232,PRODUCT
4,gabapentin,233,243,NORP
5,gabapentin,267,277,NORP
6,gabapentin,324,334,NORP
7,gabapentin talk healthcare supplier end,911,950,PRODUCT
8,gabapentin,951,961,NORP
9,gabapentin,1198,1208,NORP
