### Verify Sentences Extracted from Articles

In [None]:
!pip install python-docx

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting python-docx
  Downloading python-docx-0.8.11.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m62.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: python-docx
  Building wheel for python-docx (setup.py) ... [?25l[?25hdone
  Created wheel for python-docx: filename=python_docx-0.8.11-py3-none-any.whl size=184491 sha256=2d7b9435394c5eb4bdfbf1112ac0152b2270a4487caf88b1fdb7195dcae8ce4b
  Stored in directory: /root/.cache/pip/wheels/80/27/06/837436d4c3bd989b957a91679966f207bfd71d358d63a8194d
Successfully built python-docx
Installing collected packages: python-docx
Successfully installed python-docx-0.8.11


In [None]:
from docx import Document
import nltk
nltk.download('punkt')
import re
from nltk import sent_tokenize
import pandas as pd
import pickle
import numpy as np
import glob

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Mount Google Drive at /content/drive; every data path used below lives under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Verify that get_docx, the function that was used to generate the
# all_papers.json array, works correctly
def get_docx(file_path):
    """Return the lower-cased sentences of a .docx file as a flat list.

    Each non-empty paragraph of the document is lower-cased and split into
    sentences with ``sent_tokenize``; the sentences of all paragraphs are
    concatenated in document order.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        list: One lower-cased string per sentence.
    """
    sentences = []
    paragraphs = Document(file_path).paragraphs
    for paragraph in paragraphs:
        # Paragraphs whose text is exactly empty contribute nothing
        if paragraph.text != "":
            sentences.extend(sent_tokenize(paragraph.text.lower()))
    return sentences

# Single article used as a manual spot check of get_docx
test_file = "/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/2015/zoe_hesp._chronic_oligodendrogenesis_and_remyelination_after_spinal_cord_injury_in_mice_andrats.docx"
# Sentence list extracted from that article
d = get_docx(test_file) 
print(d) # printed so it can be compared by eye with the document (author's note: it matches)

['neurobiology of disease', 'chronic oligodendrogenesis and remyelination after spinal cord injury in mice and rats', 'zoe c. hesp,1 evan z. goldstein,1 carlos j. miranda,4 brian k. kaspar,3,4 and dana m. mctigue2,3', '1neuroscience graduate studies program, 2department of neuroscience, and 3center for brain and spinal cord repair, the ohio state university, columbus, ohio 43210, and 4nationwide children’s hospital, columbus, ohio 43205', 'adult progenitor cells proliferate in the acutely injured spinal cord and their progeny differentiate into new oligodendrocytes (ols) that remyelinate spared axons.', 'whether this endogenous repair continues beyond the first week postinjury (wpi), however, is unknown.', 'identifying the duration of this response is essential for guiding therapies targeting improved recovery from spinal cord injury (sci) by enhancing ol survival and/or remyelination.', 'here, we used two pdgfra-reporter mouse lines and rats injected with a gfp-retrovirus to assess pr

In [None]:
# Load the extracted sentences from a JSON file that holds 700 arrays, one per article,
# with each array containing that article's sentences
# NOTE: There is also a proc_all_year.json in the same folder that was used for GloVe training 
# which consists of lemmatization, filtering out small words, etc... 
import json
# Folder on Drive that holds the extracted-sentence JSON files (note the trailing
# slash: file names are appended to it by plain string concatenation)
extracted_sentences_json = "/content/drive/MyDrive/Colab Notebooks/NLP - Lab/Optic Nerve Regeneration/Code/Word Embeddings/Generated Models/Extracted Sentences/"
# Decode explicitly as UTF-8 so loading does not depend on the platform's default
# text encoding (the article text contains non-ASCII characters)
with open(extracted_sentences_json + "all_years_array.json", "r", encoding="utf-8") as f:
    all_docs = json.load(f)
# Number of top-level arrays loaded (one per article)
len(all_docs)

700

In [None]:
# Ensure there are 700 research articles and manually check one 
import os 

# Root folder that is searched recursively for .docx articles
global_path = "/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/"

def count_docx_files(folder_path):
    """Count the files ending in '.docx' anywhere below ``folder_path``.

    The directory tree is walked recursively with ``os.walk``; the suffix
    comparison is case-sensitive.

    Args:
        folder_path: Directory to search.

    Returns:
        int: Number of matching files (0 when nothing is found).
    """
    return sum(
        1
        for _root, _dirs, filenames in os.walk(folder_path)
        for filename in filenames
        if filename.endswith('.docx')
    )

# Count every .docx file below global_path so the total can be compared with
# the number of articles loaded into all_docs
file_count = count_docx_files(global_path)
print("Total .docx files found:", file_count)

# Print the third-from-last article; per the author's note this is the entry for
# test_file, so it can be compared by eye with the get_docx output above
print(all_docs[-3]) # index for test_file above 

Total .docx files found: 700
['neurobiology of disease', 'chronic oligodendrogenesis and remyelination after spinal cord injury in mice and rats', 'zoe c. hesp,1 evan z. goldstein,1 carlos j. miranda,4 brian k. kaspar,3,4 and dana m. mctigue2,3', '1neuroscience graduate studies program, 2department of neuroscience, and 3center for brain and spinal cord repair, the ohio state university, columbus, ohio 43210, and 4nationwide children’s hospital, columbus, ohio 43205', 'adult progenitor cells proliferate in the acutely injured spinal cord and their progeny differentiate into new oligodendrocytes (ols) that remyelinate spared axons.', 'whether this endogenous repair continues beyond the first week postinjury (wpi), however, is unknown.', 'identifying the duration of this response is essential for guiding therapies targeting improved recovery from spinal cord injury (sci) by enhancing ol survival and/or remyelination.', 'here, we used two pdgfra-reporter mouse lines and rats injected with 

In [None]:
# Flatten the per-article sentence lists into one list of sentences
all_docs_1D = [
    sentence
    for article_sentences in all_docs
    for sentence in article_sentences
]
len(all_docs_1D)  # total number of sentences across all articles

755496

### Generate Supervised Labels from Known Molecules

In [None]:
known_molecules_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/NLP - Lab/Optic Nerve Regeneration/Data/KnownPromotersInhibitors.csv")
print(known_molecules_df[['PrimaryName', 'Other Names', 'Class']].head())

# Now store all the molecule names: one list per CSV row, holding the
# lower-cased primary name followed by any comma-separated aliases
molecule_names = []
for primary_name, other_names in zip(known_molecules_df['PrimaryName'],
                                     known_molecules_df['Other Names']):
    names_for_row = [primary_name.strip().lower()]

    # 'Other Names' is NaN (not a str) when the row has no aliases; skip it
    if isinstance(other_names, str):
        names_for_row += [alias.strip().lower() for alias in other_names.split(',')]

    molecule_names.append(names_for_row)
print(molecule_names)

# Binary labels aligned with molecule_names: 1 when Class is exactly "P",
# 0 for every other value
molecule_labels = [int(m_class == "P") for m_class in known_molecules_df["Class"].values]
print(molecule_labels)

  PrimaryName Other Names Class
0    Netrin-1         NaN     P
1        EPHA    ephrin-A     I
2        EPHB    ephrin-B     I
3          L1         NaN     P
4     Laminin         NaN     P
[['netrin-1'], ['epha', 'ephrin-a'], ['ephb', 'ephrin-b'], ['l1'], ['laminin'], ['tenascin'], ['cspg', 'chondroitin sulphate'], ['zymosan'], ['(cpt)-camp', 'cpt-camp'], ['pten'], ['cntf', 'ciliary neurotrophic factor'], ['lif', 'leukemia inhibitory factor'], ['ocm', 'oncomodulin'], ['stat3', 'signal transducer and activator of transcription 3'], ['socs3', 'suppressor of cytokine signaling 3'], ['rhoa', 'ras homolog family member a'], ['rock', 'rho-associated protein kinase'], ['y27632'], ['c3'], ['nogoa', 'neurite outgrowth inhibitor'], ['klf-4', 'klf4', 'krüppel-like factor 4'], ['klf-9', 'klf9', 'krüppel-like factor 9'], ['klf-6', 'klf6', 'krüppel-like factor 6'], ['klf-7', 'klf7', 'krüppel-like factor 7'], ['c-myc', 'ceullar myelocytomatosis'], ['ngr', 'nogo receptor'], ['ptp-γ', 'ptp gamma', '

In [None]:
# Custom class that makes it easy to store molecule representations
class Molecule:
    """A known molecule: its lower-cased names (primary name first) and a label.

    Attributes are set once in ``__init__`` and exposed through the getters:
    ``molecule_names`` (all names, lower-cased), ``label``,
    ``molecules_double_spaced`` (each name wrapped in single spaces),
    ``primaryMoleculeName`` (first name) and ``nonPrimaryMoleculeNames``
    (remaining names, or ``None`` when there is only one name).
    """

    def __init__(self, molecule_names, label):
        lowered = [name.lower() for name in molecule_names]
        self.molecule_names = lowered
        self.label = label

        # Same names padded with one space on each side
        self.molecules_double_spaced = [f" {name} " for name in lowered]

        # The first name is the primary one; indexing raises IndexError on an empty list
        self.primaryMoleculeName = lowered[0]
        # An empty remainder is stored as None rather than []
        self.nonPrimaryMoleculeNames = lowered[1:] or None

    def getLabel(self):
        """Return the label passed at construction."""
        return self.label

    def getMoleculeNames(self):
        """Return the list of lower-cased names, primary name first."""
        return self.molecule_names

    def getMoleculeNamesDoubleSpaced(self):
        """Return the names, each surrounded by single spaces."""
        return self.molecules_double_spaced

    def isMoleculeName(self, word):
        """Return the stored name equal to ``word``, or ``None`` if there is none.

        The comparison is exact; ``word`` is not lower-cased here.
        """
        return next((name for name in self.molecule_names if word == name), None)

    def getPrimaryMoleculeName(self):
        """Return the first (primary) name."""
        return self.primaryMoleculeName

    def getNonPrimaryMoleculeNames(self):
        """Return the names after the first one, or ``None`` when there are none."""
        return self.nonPrimaryMoleculeNames

    def __repr__(self):
        return f"Molecule Names: {self.getMoleculeNames()}\nMolecule Label: {self.getLabel()}"

# Pair each list of names with its label and wrap the pair in a Molecule
known_molecules = [
    Molecule(names, molecule_label)
    for names, molecule_label in zip(molecule_names, molecule_labels)
]

# Print the first few molecules as a sanity check
to_print = 5
for position in range(to_print):
    print(known_molecules[position])

Molecule Names: ['netrin-1']
Molecule Label: 1
Molecule Names: ['epha', 'ephrin-a']
Molecule Label: 0
Molecule Names: ['ephb', 'ephrin-b']
Molecule Label: 0
Molecule Names: ['l1']
Molecule Label: 1
Molecule Names: ['laminin']
Molecule Label: 1


### Mask Known Molecules 

In [None]:
# 1. Go through each sentence and check whether it contains a known molecule name.
# 2. If it does, replace that molecule's names in the sentence with [MASK1].
# 3. Append the masked sentence to the dictionary of labelled sentences, with the key
# of the dictionary being the molecule's primary name.
# NOTE: Set mask_other_molecules to True if you would like to replace the other molecule names found in the sentence with [MASK2].


from collections import defaultdict

def remove_punctuation(sentence):
    """Delete every character that is not a word character, whitespace or '-'.

    Hyphens are kept, so hyphenated tokens stay intact; underscores count as
    word characters and are kept too.

    Args:
        sentence: Text to clean.

    Returns:
        str: ``sentence`` with all other characters removed (nothing is
        inserted in their place).
    """
    return re.sub(r"[^\w\s-]", "", sentence)

def mask_sentence(sentence, molecule_name, mask_token):
    """Replace every standalone occurrence of ``molecule_name`` with ``mask_token``.

    An occurrence is standalone when it is neither preceded nor followed by a
    word character, so "mag" is masked in "mag and myelin" but not inside
    "magnet". Occurrences at the beginning, middle and end of the sentence are
    all handled by the same single pass.

    Args:
        sentence: Text to search.
        molecule_name: Literal name to mask (regex metacharacters are escaped).
        mask_token: Replacement text, inserted literally.

    Returns:
        str: The sentence with all standalone occurrences replaced. The
        sentence is returned unchanged when ``molecule_name`` is empty.
    """
    # An empty name would match at every gap between non-word characters
    if not molecule_name:
        return sentence

    # Lookarounds instead of \b: for names that start and end with a word
    # character they are equivalent, but \b can never match next to names that
    # begin or end with punctuation such as "(cpt)-camp".
    pattern = r'(?<!\w)' + re.escape(molecule_name) + r'(?!\w)'
    # A function replacement keeps mask_token literal (no backslash/group expansion)
    return re.sub(pattern, lambda _match: mask_token, sentence)

import string
run_code = False
mask_other_molecules = False


def build_labeled_sentences(sentences, molecules, mask_other=False,
                            primary_token="[MASK1]", secondary_token="[MASK2]",
                            verbose=True):
    """Collect masked sentences per molecule for sentences with one molecule class.

    Each sentence is stripped of punctuation and split on whitespace. A
    molecule counts as found when any of its names equals one of the words.
    Sentences in which all found molecules share the same label are kept; for
    every found molecule one copy of the sentence is stored in which that
    molecule's names are replaced by ``primary_token``.

    Args:
        sentences: Iterable of sentence strings.
        molecules: Sequence of objects exposing ``getMoleculeNames()``,
            ``getLabel()`` and ``getPrimaryMoleculeName()``.
        mask_other: When True, the names of the other molecules found in the
            same sentence are replaced by ``secondary_token`` in that copy.
        primary_token: Replacement for the molecule the copy is stored under.
        secondary_token: Replacement for the other found molecules.
        verbose: When True, print every kept sentence and every masked copy.

    Returns:
        defaultdict(str): Maps a molecule's primary name to the concatenation
        of its masked sentences, each followed by ". ".
    """
    labeled = defaultdict(str)

    for sentence in sentences:
        sentence = remove_punctuation(sentence)
        words = set(sentence.split())

        # One entry per molecule (in `molecules` order) even when it occurs
        # several times in the sentence, so each copy is stored only once.
        # NOTE(review): matching is per whitespace-separated word, so names that
        # contain spaces cannot trigger a match here; they are still masked
        # below once the molecule was found through another of its names.
        found_molecules = [
            molecule for molecule in molecules
            if any(name in words for name in molecule.getMoleculeNames())
        ]
        found_labels = {molecule.getLabel() for molecule in found_molecules}

        # Keep the sentence only if exactly one class of molecule is present
        if len(found_labels) != 1:
            continue

        if verbose:
            print(f"Labeled Sentence: '{sentence.strip()}'   Molecules: {[m.getMoleculeNames() for m in found_molecules]}")

        for molecule in found_molecules:
            # Always start from the unmasked sentence so copies stay independent
            masked = sentence
            for m_name in molecule.getMoleculeNames():
                masked = mask_sentence(masked, m_name, primary_token)

            if mask_other:
                for other_molecule in found_molecules:
                    if other_molecule is molecule:
                        continue
                    for other_name in other_molecule.getMoleculeNames():
                        masked = mask_sentence(masked, other_name, secondary_token)

            if verbose:
                print(f"Labeled Sentence: '{masked.strip()}'   Molecule: {molecule.getMoleculeNames()}")
            labeled[molecule.getPrimaryMoleculeName()] += masked.strip() + ". "

    return labeled


# Guarded because the pass over the whole corpus is slow and prints a lot
if run_code:
    primary_molecule_replace_word = "[MASK1]"
    secondary_molecule_replace_word = "[MASK2]"
    labeled_sentences = build_labeled_sentences(
        all_docs_1D,
        known_molecules,
        mask_other=mask_other_molecules,
        primary_token=primary_molecule_replace_word,
        secondary_token=secondary_molecule_replace_word,
    )

  # Print the labeled sentences
  # for sentence, label in labeled_sentences.items():
  #     print(f"Labeled Sentence: '{sentence}'   Label: {label}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Labeled Sentence: 'if this is the case and if neurotrophins are able  to alter the neuronal response to [MASK1] and myelin then the question is whether this switch is dependent on intracellular camp levels'   Molecule: ['mag', 'myelin-associated glycoprotein']
Labeled Sentence: 'neurotrophins increased neuronal camp levels and this increase was prevented if mag was also present'   Molecules: [['mag', 'myelin-associated glycoprotein']]
Labeled Sentence: 'neurotrophins increased neuronal camp levels and this increase was prevented if [MASK1] was also present'   Molecule: ['mag', 'myelin-associated glycoprotein']
Labeled Sentence: 'further if neurons were primed with neurotrophins in the presence of a pka inhibitor the block of mag and myelin inhibition was completely abro gated'   Molecules: [['mag', 'myelin-associated glycoprotein']]
Labeled Sentence: 'further if neurons were primed with neurotrophins in the presence of a 

In [None]:
# print(labeled_sentences.keys())
# labeled_sentences['sdf-1']

dict_keys(['l1', 'ngr', 'mag', 'rock', 'lif', 'c3', 'sulfatides', 'laminin', 'bdnf', 'cntf', 'dine', 'igf-1', 'cspg', 'gdnf', 'tenascin', 'netrin-1', 'rhoa', 'epha', 'ephb', 'bmp4', 'omgp', 'pten', 'socs3', 'klf-4', 'klf-9', 'stat3', 'klf-6', 'rapamycin', 'taxol', 'tlr2', 'kspg', 'sdf-1', 'c-myc', 'klf-7', 'nogoa', 'zymosan', 'ocm', 'opn', 'b-raf', 'y27632', '(cpt)-camp'])


'plasma samples  for  measuring  [MASK1]  hgf   vegf   ang2. table 2 plasma concentrations of [MASK1] hgf and igf-1 in patients with sci at different time points 24 hours day 3 and day 7 and in healthy subjects. we sought to determine the plasma concentrations of [MASK1] and hgf which represent the main chemoat- tractive factors contributing to trafficking migration and homing of bm-derived spcs including  epcs  and subsequent vascular repair or angiogenesis at the vascular injury sitetable hgf plasma level was found to be markedly higher in patients with sci at each of time points 24 hours day 3 and day 7 compared to healthy subjects with peak  concentrat- ion at day 3 post-injury median 1635 vs 593 pgml  p  00001. surprisingly we found a significantly lower [MASK1] concentration in patients with sci during the first 3 days after acute  sci compared to healthy subjects median 1519 and 1539 vs 2228 pgml p  00001 for 24 hours and day 3 respectively. the [MASK1] levels returned to those 

In [None]:
# import json

# save_folder = "/content/drive/MyDrive/Colab Notebooks/NLP - Lab/Optic Nerve Regeneration/Code/Sentence Classification/Output/Combined_Sentences_Per_Molecule/"
# # Saving the dictionary to a file
# with open(save_folder + "masked_known_molecules.json", "w") as file:
#     json.dump(labeled_sentences, file)