# Getting graph from corpus
---



# Loading in all Sentences

In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab filesystem; every data/model path used
# below lives under /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Drive folder holding the extracted-sentence JSON that is loaded below.
# The trailing slash matters: file names are appended by plain string concatenation.
MODEL_PATH = "/content/drive/MyDrive/Colab Notebooks/NLP - Lab/WordEmbeddings/Models/Extracted Sentences/"

In [None]:
import json

In [None]:
# Read the whole corpus from a single JSON file.
# NOTE(review): judging by how it is flattened afterwards, this looks like a
# list of lists of sentence strings — confirm against the file that produced it.
with open(MODEL_PATH + "all_years_array.json", "r") as f:
    all_docs = json.load(f)

In [None]:
# Sanity check: number of top-level entries in the loaded corpus.
len(all_docs)

700

In [None]:
# Flatten the nested corpus by one level: every item of every sub-list, in the
# original order, ends up in a single flat list.
all_docs_1D = [
    item
    for group in all_docs
    for item in group
]

In [None]:
# Sanity check: total number of items after flattening.
len(all_docs_1D)

755496

In [None]:
# Collect every sentence that contains the keyword as a raw substring
# (no word-boundary handling), preserving corpus order.
keyword = "pten"
found_sentences = [candidate for candidate in all_docs_1D if keyword in candidate]

In [None]:
# Number of sentences that contain the keyword substring.
len(found_sentences)

660

# Defining Molecules and Labels

In [None]:
# Names of the molecules that become graph nodes. The order is significant:
# other lists in this notebook are aligned with it by position.
molecules = [
    "netrin", "ephrin", "laminin", "tenascin", "cspg", "zymosan", "camp",
    "pten", "cntf", "lif", "oncomodulin", "stat3", "socs3", "rhoa",
    "rock", "y27632", "nogo", "klf", "ngr", "lar", "tlr2",
    "bdnf", "igf1", "opn", "mag", "omgp", "kspg", "taxol",
]

# Same names with one space on each side, so a substring search only hits the
# name when it is delimited by spaces in the text.
molecules_double_spaced = [f" {name} " for name in molecules]
print(molecules_double_spaced)

[' netrin ', ' ephrin ', ' laminin ', ' tenascin ', ' cspg ', ' zymosan ', ' camp ', ' pten ', ' cntf ', ' lif ', ' oncomodulin ', ' stat3 ', ' socs3 ', ' rhoa ', ' rock ', ' y27632 ', ' nogo ', ' klf ', ' ngr ', ' lar ', ' tlr2 ', ' bdnf ', ' igf1 ', ' opn ', ' mag ', ' omgp ', ' kspg ', ' taxol ']


In [None]:
# One binary label per molecule, aligned by position with `molecules`.
# NOTE(review): the meaning of 1 vs 0 is not stated in this notebook; the only
# visible use is the colour mapping below (1 -> green, 0 -> red) — confirm.
molecule_labels = [1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1]

# This used to be a bare comparison whose result was silently discarded.
# Assert instead, so a label list that drifts out of sync fails loudly here.
assert len(molecules) == len(molecule_labels), "molecules and molecule_labels must have the same length"

# Node colour per molecule, in the same order as `molecules`.
node_colors = ["green" if x==1 else "red" for x in molecule_labels]

In [None]:
import gensim 

# Drive folder holding the saved gensim word vectors; the trailing slash is
# required because the file name is appended by string concatenation.
MODEL_PATH_ALL = "/content/drive/MyDrive/Colab Notebooks/NLP - Lab/WordEmbeddings/Models/Gensim_Lemmatized_All_Docs/"
# Loads "All Papers.wordvectors"; mmap='r' is passed straight to KeyedVectors.load.
model_all_years = gensim.models.KeyedVectors.load(MODEL_PATH_ALL + "{}.wordvectors".format("All Papers"), mmap='r')
# Shorter alias for the same object.
wordvec = model_all_years

# One lookup per molecule, in the same order as `molecules`.
# NOTE(review): presumably fails if a molecule name is missing from the model's
# vocabulary — confirm every name is present.
molecules_embedded = [wordvec[m] for m in molecules]

In [None]:
def make_edges_from_dict(graph_dict):
  """Flatten an adjacency mapping into a list of edge tuples.

  Args:
    graph_dict: mapping from a node name (str) to an iterable of neighbour
      names (str).

  Returns:
    A list of (source, target) tuples, one per (key, neighbour) pair, with
    surrounding whitespace stripped from both names. Pairs are emitted in the
    iteration order of the mapping and of each neighbour collection; nothing
    is de-duplicated.
  """
  return [
      (source.strip(), target.strip())
      for source, neighbours in graph_dict.items()
      for target in neighbours
  ]

In [None]:
!pip install pyvis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import networkx as nx 
import matplotlib.pyplot as plt 
from matplotlib.pyplot import figure

def create_networkx_graph(edges, molecules=molecules, molecules_embedded=molecules_embedded):
  """Build an undirected networkx graph of the molecules and draw it.

  Args:
    edges: iterable of (source, target) node-name pairs.
    molecules: node names; every name becomes a node even if it has no edge.
    molecules_embedded: one embedding per molecule, paired with `molecules`
      by position.

  Side effects:
    Opens a new 18x14 inch matplotlib figure at 80 dpi and draws the graph
    into it. Nothing is returned.
  """
  figure(figsize=(18, 14), dpi=80)

  G = nx.Graph()
  # Keyword attributes given to add_nodes_from are applied to *every* node, so
  # `v=molecules_embedded` stored the whole embedding list on each node.
  # (node, attr_dict) tuples give each node its own vector instead.
  G.add_nodes_from(
      (name, {"v": vector}) for name, vector in zip(molecules, molecules_embedded)
  )
  G.add_edges_from(edges)
  nx.draw(G)

In [None]:
from pyvis.network import Network

# Can't yet use embeddings for value attribute of pyvis graph
def create_pyvis_graph(edges, filename, molecules_arr=molecules, molecules_embedded=molecules_embedded, colors_arr=node_colors):
  """Render the molecule graph as an interactive pyvis HTML page.

  Args:
    edges: (source, target) node-name pairs to draw.
    filename: output name without extension; ".html" is appended.
    molecules_arr: node names to add to the network.
    molecules_embedded: currently unused (see the commented-out call below);
      kept so existing callers keep working.
    colors_arr: one colour per entry of `molecules_arr`, aligned by position.

  Side effects:
    Writes and shows "<filename>.html" and prints a reminder to open it.
  """
  nt = Network('1000px', '1000px')
  # populates the nodes and edges data structures
  # Use the colours passed by the caller. The body previously read the global
  # `node_colors`, so the `colors_arr` argument was silently ignored.
  nt.add_nodes(molecules_arr, color=colors_arr)
  # nt.add_nodes(molecules_arr, value=molecules_embedded, color=colors_arr)

  nt.add_edges(edges)
  nt.show("{}.html".format(filename))
  print("Now open {}.html".format(filename))

In [None]:
# TODO for nodes: 
## Use paragraphs, Use documents 
## Try padding other than a single space on each side of the molecule name
## Add all molecules 
## Use Word2Vec Similarity - DONE 
## Use word embeddings - DONE 

## TODO for edges:
# strength of link using regeneration scores from combined y hats of BERT model for each sentence
# strength of link using regeneration scores from combined sum of causal verbs in each sentence - DONE
# strength of link from amount of vector similarity - DONE 

# Extracting Abbreviations

In [None]:
# Let's do by same sentence first
# Peek at ten consecutive items of the flattened corpus to see what the raw
# sentence text looks like before running extraction on it.
all_docs_1D[100:110]

['april 19th, i divided the spinal marrow of a dog, between the last vertebra of the neck and first of the back.',
 'the muscles of the trunk of the body, but particularly those of the hind legs, appeared instantly relaxed; the legs continued supple, like those of an animal killed by electricity.',
 'the heart, on performing the operation, ceased for a stroke or two, then went on slow and full, and in about a q\\.tarter of an hour after, the pulse was 160 in a minute.',
 'respiration was performed by means of the diaphragm only, which acted very strongly for some hours.',
 'the operation was performed about a quarter of an hour before twelve at noon; about four in the afternoon the pulse was ninety only in a minute, and the heat of the body exceedingly abated, the diaphragm acting strongly, but irre\xad gularly.',
 'about seven in the evening, the pulse was not above twenty in a minute, the diaphragm acting strongly, but in re\xad peated jerks.',
 'between twelve at night and one in th

In [None]:
!pip install scispacy 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_md-0.5.1.tar.gz

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_md-0.5.1.tar.gz
  Using cached https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_md-0.5.1.tar.gz (120.2 MB)


In [None]:
import spacy
import scispacy 
import en_core_sci_md 
from scispacy.abbreviation import AbbreviationDetector

# Load the "en_core_sci_md" pipeline that was pip-installed above.
nlp = spacy.load("en_core_sci_md")

# Add the abbreviation pipe to the spacy pipeline.
# The abbreviation-extraction loop below reads its results from `doc._.abbreviations`.
# NOTE(review): the "abbreviation_detector" name presumably becomes available
# through the `scispacy.abbreviation` import above — confirm before reordering imports.
nlp.add_pipe("abbreviation_detector")

<scispacy.abbreviation.AbbreviationDetector at 0x7f1fb13baf50>

In [None]:
# Join every sentence into one space-separated string; the abbreviation
# extraction below slices this string into fixed-size character chunks.
all_docs_1_sentence = ' '.join(all_docs_1D)

In [None]:
import math
import pickle

# Maps abbreviation text -> long-form text, accumulated over all processed chunks.
molecules_NER = {}

# The joined corpus is processed in fixed-size character chunks.
# NOTE(review): chunk boundaries fall at arbitrary character offsets, so a word
# or sentence can be split across two chunks — confirm this is acceptable.
chunk_length = 1_000_000
num_iterations = math.ceil(len(all_docs_1_sentence) / chunk_length)

# Resume point: processing starts at this chunk index rather than at 0.
# The character offset is derived from it so the two cannot drift apart.
counter = 39
start_idx = counter * chunk_length

# Loop-invariant output path, hoisted out of the loop.
NER_ABBRV = "/content/drive/MyDrive/Colab Notebooks/NLP - Lab/GraphSage/ner_abbreviations_39_49.pkl"

while counter < num_iterations:
    print(f'Chunk {counter}/{num_iterations}...')
    # Slicing clamps at the end of the string, so the final (shorter) chunk
    # needs no special case. The previous special case sliced up to
    # len(...) - 1 and therefore dropped the last character of the corpus.
    chunk = all_docs_1_sentence[start_idx:start_idx + chunk_length]

    doc = nlp(chunk)
    # NOTE(review): `done` holds the long-form objects themselves, so this
    # de-duplication depends on how those objects compare for equality. If the
    # intent is to skip repeated long-form *text*, compare `.text` — confirm.
    done = []
    for abrv in doc._.abbreviations:
        if abrv._.long_form not in done:
            print(f"{abrv} \t {abrv._.long_form}")
            done.append(abrv._.long_form)
            molecules_NER[abrv.text] = abrv._.long_form.text

    start_idx = start_idx + chunk_length
    counter += 1

    # save after each one because you don't know when RAM will run out
    with open(NER_ABBRV, 'wb') as f:
        pickle.dump(molecules_NER, f)

    # free up some space
    del doc

Chunk 39/49...
cnnr 	 cellular neuroscience, neurodegeneration, and repair
cns 	 central system
cspgs 	 chondroitin sulfate proteoglycans
s1pr2 	 sphingosine-1–phosphate receptor 2
als 	 amyotrophic lateral sclerosis
ltp 	 long-term potentiation
rock 	 rho-associated kinase
sci 	 spinal cord injury
mif 	 migration inhibitory factor
tbi 	 traumatic brain injury
fgf 	 fibroblast growth factor
lpa 	 lysophosphatidic acid
tcr 	 t cell receptor
ros 	 reactive oxygen species
trks 	 tropomyosin-related kinase receptors
berl 	 by a massive macrophage/microglial response. anat embryol
red 	 retrograde tracing with dtmr
nscs 	 neural 
stem cells
nts 	 neurotrophins
ngf 	 nerve growth factor
nt-3 	 neurotrophin-3
sf 	 silk fibroin
pcla 	 poly(e-caprolactone)-block-poly(l- lactic acid-co-e-caprolactone)
scaffolds 	 scaffolds (conduits
upw 	 ultra- purified water
sd 	 sprague-dawley
gfp 	 green fluorescent protein
hbss 	 hanks’ balanced salt solution
sigma- aldrich 	 sigma-al- drich, st louis, mo, 

In [None]:
# NOTE(review): `stop_code` is not defined anywhere in the visible notebook, so
# this cell presumably exists to raise NameError and halt "Run all" before the
# cells below — confirm, and consider an explicit `raise` with a message instead.
stop_code

In [None]:
# Adjacency mapping: molecule name -> set of other molecules that appear in at
# least one sentence together with it.
graph_dict_sentence = {}
for molecule in molecules_double_spaced:
  graph_dict_sentence[molecule.strip()] = set()

counter = 0
for sentence in all_docs_1D:
  # Molecules whose space-padded name occurs in this sentence.
  molecules_in_sentence = []
  for molecule in molecules_double_spaced:
    if molecule in sentence:
      molecules_in_sentence.append(molecule.strip())

  # Only sentences with at least two molecules contribute links.
  if len(molecules_in_sentence) > 1:
    print("---")
    print(sentence)
    print(molecules_in_sentence)

  for found_molecule in molecules_in_sentence:
    # Build a fresh neighbour list for each molecule. The previous code aliased
    # `molecules_in_sentence` and called .remove() on it while iterating over
    # it, which skipped molecules and lost pairs in sentences with 4+ molecules.
    to_add = [other for other in molecules_in_sentence if other != found_molecule]
    graph_dict_sentence[found_molecule].update(to_add)

  # counter += 1
  # if counter == 10000:
  #   break

print(graph_dict_sentence)

In [None]:
# Show the molecule -> co-occurring molecules mapping again on its own.
print(graph_dict_sentence)

In [None]:
# Flatten the co-occurrence mapping into (source, target) pairs for plotting.
edges_sentence = make_edges_from_dict(graph_dict_sentence)

In [None]:
# Display the edge list for inspection.
edges_sentence

In [None]:
# Render the same-sentence co-occurrence graph to "same_sentence.html".
create_pyvis_graph(edges_sentence, "same_sentence")