This BERT model was trained on an unbalanced dataset. Please see `BERT Training Balanced Split.ipynb` for the updated training.

# Data Import

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
MODEL_PATH = "/content/drive/MyDrive/Colab Notebooks/NLP - Lab/WordEmbeddings/Models/Extracted Sentences/"

In [None]:
import json

In [None]:
# Read the corpus JSON stored under MODEL_PATH; a later cell flattens it by one level.
with open(MODEL_PATH + "all_years_array.json", "r") as f:
    all_docs = json.load(f)

In [None]:
len(all_docs)

700

In [None]:
# Flatten the nested corpus by one level, keeping the original order.
all_docs_1D = []
for sub in all_docs:
    all_docs_1D.extend(sub)

In [None]:
len(all_docs_1D)

755496

In [None]:
orig_molecules = ["netrin", "ephrin", "laminin", "tenascin", "cspg", "zymosan", "camp", "pten", "cntf", "lif", "oncomodulin", "stat3", "socs3", "rhoa", "rock", "y27632", "nogo", "klf", "ngr", "lar", "tlr2", "bdnf", "igf1", "opn", "mag", "omgp", "kspg", "taxol"]

In [None]:
# One label per entry of orig_molecules, matched by position.
# The sentence-labelling cells treat 0 as "inhibitor" and 1 as "promoter".
molecule_labels = [1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1]
label_dict = dict(zip(orig_molecules, molecule_labels))

# Generating Supervised Sentences only from Orig Molecules


In [None]:
# Drop "lar" together with its label so the two parallel lists stay aligned.
# The membership guard makes this cell safe to re-run: without it the second
# run fails with ValueError because index("lar") no longer finds the entry.
if "lar" in orig_molecules:
    to_remove = orig_molecules.index("lar")
    orig_molecules.pop(to_remove)
    molecule_labels.pop(to_remove)

0

In [None]:
orig_molecules

['netrin',
 'ephrin',
 'laminin',
 'tenascin',
 'cspg',
 'zymosan',
 'camp',
 'pten',
 'cntf',
 'lif',
 'oncomodulin',
 'stat3',
 'socs3',
 'rhoa',
 'rock',
 'y27632',
 'nogo',
 'klf',
 'ngr',
 'tlr2',
 'bdnf',
 'igf1',
 'opn',
 'mag',
 'omgp',
 'kspg',
 'taxol']

In [None]:
# Rebuild the name -> label lookup from the current parallel lists and display it.
label_dict = dict(zip(orig_molecules, molecule_labels))
label_dict

{'netrin': 1,
 'ephrin': 0,
 'laminin': 1,
 'tenascin': 0,
 'cspg': 0,
 'zymosan': 1,
 'camp': 1,
 'pten': 0,
 'cntf': 1,
 'lif': 1,
 'oncomodulin': 1,
 'stat3': 1,
 'socs3': 0,
 'rhoa': 0,
 'rock': 0,
 'y27632': 1,
 'nogo': 0,
 'klf': 1,
 'ngr': 0,
 'tlr2': 1,
 'bdnf': 0,
 'igf1': 1,
 'opn': 1,
 'mag': 0,
 'omgp': 0,
 'kspg': 0,
 'taxol': 1}

In [None]:
# make sure to initialize this AFTER removing molecules with no embeddings
# Each name is wrapped in single spaces so that the substring checks in the
# following cells only hit space-delimited occurrences of the name.
orig_molecules_double_spaced = [f" {m} " for m in orig_molecules]
print(orig_molecules_double_spaced)

[' netrin ', ' ephrin ', ' laminin ', ' tenascin ', ' cspg ', ' zymosan ', ' camp ', ' pten ', ' cntf ', ' lif ', ' oncomodulin ', ' stat3 ', ' socs3 ', ' rhoa ', ' rock ', ' y27632 ', ' nogo ', ' klf ', ' ngr ', ' tlr2 ', ' bdnf ', ' igf1 ', ' opn ', ' mag ', ' omgp ', ' kspg ', ' taxol ']


In [None]:
from collections import defaultdict
def _label_sentences(sentences, spaced_molecules, labels_by_molecule):
  """Map each stripped sentence to 1 (promoters only) or 0 (inhibitors only).

  Args:
    sentences: iterable of sentence strings to scan.
    spaced_molecules: molecule names wrapped in single spaces; a molecule
      counts as mentioned when that padded form is a substring of the sentence.
    labels_by_molecule: maps the unpadded molecule name to its label; 0 is
      treated as inhibitor, anything else as promoter.

  Returns:
    dict of stripped sentence -> label. Sentences that mention no listed
    molecule, or that mention both kinds, are left out. When the same stripped
    sentence occurs more than once, the last occurrence decides its label.
  """
  labelled = {}
  for sentence in sentences:
    has_promoter = False
    has_inhibitor = False
    # NOTE(review): matching on the unstripped sentence assumes sentences are
    # space-padded so that first/last tokens can match -- confirm against the corpus.
    for spaced in spaced_molecules:
      if spaced in sentence:
        if labels_by_molecule[spaced.strip()] == 0:
          has_inhibitor = True
        else:
          has_promoter = True

    # Equal flags means "neither kind" or "both kinds": no usable label.
    if has_promoter == has_inhibitor:
      continue
    labelled[sentence.strip()] = 1 if has_promoter else 0
  return labelled


trainable_sentences = _label_sentences(all_docs_1D, orig_molecules_double_spaced, label_dict)
# Print a one-line summary instead of three lines per sentence plus a full dict
# dump, which overflowed the notebook's output buffer.
print(f"Labelled {len(trainable_sentences)} of {len(all_docs_1D)} sentences")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
--> Labeling sentence as 2
364994/755496
--> P:False and I:False found in the
--> Labeling sentence as 2
364995/755496
--> P:False and I:False found in tibial
--> Labeling sentence as 2
364996/755496
--> P:False and I:False found in epiphysis
--> Labeling sentence as 2
364997/755496
--> P:False and I:False found in there
--> Labeling sentence as 2
364998/755496
--> P:False and I:False found in trace
--> Labeling sentence as 2
364999/755496
--> P:False and I:False found in clear	line which
--> Labeling sentence as 2
365000/755496
--> P:False and I:False found in on the late
--> Labeling sentence as 2
365001/755496
--> P:False and I:False found in brit journ
--> Labeling sentence as 2
365002/755496
--> P:False and I:False found in surg
--> Labeling sentence as 2
365003/755496
--> P:False and I:False found in vol
--> Labeling sentence as 2
365004/755496
--> P:False and I:False found in transplantation
--> Labeling sentence a

# Training BioBert

In [None]:
!pip install transformers

In [None]:
## Model.py ## 

import transformers, torch
import torch.nn as nn
import torch.nn.functional as F

tokenizer = transformers.RobertaTokenizerFast.from_pretrained("roberta-base")

def _tokenize(st):
    """Tokenize `st` with the module-level tokenizer, truncated/padded to 64 tokens."""
    return tokenizer(st, max_length=64, padding="max_length", truncation=True)


DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device", DEVICE)


class TrainingModel(nn.Module):
    """`roberta-base` encoder followed by a linear classification head.

    Args:
        num_classes: number of outputs of the linear head.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.encoder = transformers.RobertaModel.from_pretrained("roberta-base")
        self.linear = nn.Linear(self.encoder.config.hidden_size, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, batch):
        """Classify a batch and, when labels are present, compute the loss.

        Args:
            batch: mapping with "input_ids" and "attention_mask", plus an
                optional "labels" entry holding the target class indices.

        Returns:
            dict with "output" (softmax over the head's outputs) and, only
            when "labels" is in `batch`, "loss" (cross-entropy).
        """
        encoder_out = self.encoder(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
        )
        logits = self.linear(encoder_out["pooler_output"])
        result = {"output": self.softmax(logits)}
        if "labels" in batch:
            # F.cross_entropy applies log-softmax internally, so it must get
            # the raw logits. Feeding it the softmax output (as before)
            # normalises twice, which compresses the loss range and weakens
            # the gradients.
            result["loss"] = F.cross_entropy(logits, batch["labels"])
        return result


class InferenceModel(nn.Module):
    """Two-class `roberta-base` classifier that scores one tokenized sample at a time."""

    def __init__(self):
        super().__init__()
        self.encoder = transformers.RobertaModel.from_pretrained("roberta-base")
        self.linear = nn.Linear(self.encoder.config.hidden_size, 2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, tokenized_comment):
        """Return the softmax over the two head outputs for a single sample.

        `tokenized_comment` must provide "input_ids" and "attention_mask";
        each gets a leading batch dimension and is moved to DEVICE.
        """
        def _as_batch(field):
            return torch.as_tensor(tokenized_comment[field]).unsqueeze(0).to(DEVICE)

        ids = _as_batch("input_ids")
        mask = _as_batch("attention_mask")
        encoded = self.encoder(input_ids=ids, attention_mask=mask)
        return self.softmax(self.linear(encoded["pooler_output"]))


class Predict:
    """Wraps an InferenceModel restored from saved weights for single-string scoring."""

    def __init__(self, model_name):
        """Build the model, load the state dict stored at `model_name`, and enter eval mode."""
        net = InferenceModel()
        state = torch.load(model_name, map_location=DEVICE)
        net.load_state_dict(state)
        self.model = net.to(DEVICE)
        self.model.eval()

    def process_eval(self, s):
        """Tokenize `s` the same way the training data was tokenized."""
        return _tokenize(s)

    def predict(self, comment):
        """Score `comment`.

        Returns:
            (model output, float of output[0][1]); the model's output is a
            softmax, so the second element is the class-1 probability.
        """
        tokens = self.process_eval(comment)
        with torch.no_grad():
            probabilities = self.model(tokens)
        return probabilities, float(probabilities[0][1])

In [None]:
from torch.utils.data import DataLoader, Dataset

class Data(Dataset):
    """Dataset pairing raw text samples with labels; tokenization happens on access."""

    def __init__(self, X, y):
        assert len(X) == len(y)
        self.X, self.y = X, y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        """Tokenize sample `idx` and attach its target under the "label" key."""
        encoded = _tokenize(self.X[idx])
        encoded["label"] = self.y[idx]
        return encoded

In [None]:
trainable_sentences.keys()

In [None]:
from sklearn import tree
from sklearn.model_selection import train_test_split

SPLIT_SEED = 42  # fixed so the train/validation split is identical on every run

features_arr = list(trainable_sentences.keys())
targets_arr = list(trainable_sentences.values())

# Stratify on the label so both splits keep the same class ratio; with an
# unbalanced corpus a plain random split can leave the validation set skewed.
# (train_test_split raises ValueError if a class has fewer than two samples.)
X_train, X_test, y_train, y_test = train_test_split(
    features_arr,
    targets_arr,
    test_size=0.20,
    stratify=targets_arr,
    random_state=SPLIT_SEED,
)

# Force plain Python ints so the collator receives uniform label types.
y_train = [int(x) for x in y_train]
y_test = [int(x) for x in y_test]

train_dataset = Data(X_train, y_train)
val_dataset = Data(X_test, y_test)

In [None]:
## train.py ## 

import numpy as np
import torch
from transformers import AdamW
from tqdm import tqdm
# from Model import TrainingModel
import transformers
from sklearn.metrics import roc_auc_score
# from plot_results import plot


# Number of passes over the training set; also used to size the LR schedule in this cell.
num_epochs = 10
# NOTE(review): DOWNSAMPLE, NARROW and IGNORE_WHALE are only referenced by the
# commented-out get_data_main(...) call in this cell, and model_tag is not
# referenced at all in the visible cells -- they look like leftovers; confirm
# before removing.
DOWNSAMPLE = True
NARROW = None #"UCOMrUmOTPD_AnSivjxptxpA" #Louis
IGNORE_WHALE = False
model_tag = 'thoughtful_channels'


# Prefer the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device", DEVICE)


def to_device(batch):
    """Return a new dict with every value of `batch` moved to DEVICE."""
    moved = {}
    for name, tensor in batch.items():
        moved[name] = tensor.to(DEVICE)
    return moved


# train_dataloader, val_dataloader = get_data_main(NARROW, DOWNSAMPLE, IGNORE_WHALE)
# Both loaders use batches of 32 and pad through a tokenizer-aware collator;
# only the training loader shuffles.
train_dataloader, val_dataloader = (
    DataLoader(
        dataset,
        batch_size=32,
        shuffle=should_shuffle,
        collate_fn=transformers.data.DataCollatorWithPadding(tokenizer),
    )
    for dataset, should_shuffle in ((train_dataset, True), (val_dataset, False))
)

model = TrainingModel(num_classes=2).to(DEVICE)

# Use PyTorch's own AdamW: the transformers.AdamW re-implementation is
# deprecated in favour of torch.optim.AdamW and is not available in recent
# transformers releases. Once this is in place, the
# `from transformers import AdamW` import in this cell is unused and can go.
# NOTE(review): eps=1e-6 is meant to mirror the old optimizer's default so the
# update rule stays as close as possible -- verify before relying on it.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.01, eps=1e-6)

# Linear warm-up for 100 steps, then linear decay over the remaining batches.
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,
    num_training_steps=len(train_dataloader) * num_epochs,
)


# Fine-tune for num_epochs, recording one train loss, validation loss and
# validation ROC-AUC value per epoch in the three lists below.
all_auc, all_val_loss, all_train_loss = [], [], []
for epoch in range(num_epochs):
    train_loss_ls = []
    model.train()
    for batch in tqdm(train_dataloader, desc=f"train e_{epoch}"):
        optimizer.zero_grad()
        out = model(to_device(batch))
        out["loss"].backward()

        optimizer.step()
        # The schedule advances once per batch, matching
        # num_training_steps = len(train_dataloader) * num_epochs.
        scheduler.step()
        train_loss_ls.append(out["loss"].item())

    optimizer.zero_grad()
    train_loss = np.mean(train_loss_ls)

    # validation
    model.eval()
    val_loss_ls, y_hat, y_true = [], [], []
    for batch in tqdm(val_dataloader, desc="val"):
        batch = to_device(batch)
        # No gradients are tracked while scoring the validation batches.
        with torch.inference_mode():
            out = model(batch)
        
        output = out['output'].cpu().numpy()
        y_hat.append(output[:, 1]) ## confidence for a positive sample
        
        # preds = out["output"].argmax(-1).detach()
        # y_pred.append(preds.cpu().float().numpy())
        y_true.append(batch["labels"].cpu().float().numpy())
        val_loss_ls.append(out["loss"].item())

    y_true, y_hat = np.concatenate(y_true), np.concatenate(y_hat)
    # -1 marks an epoch whose validation labels contain only one class, in
    # which case roc_auc_score is not called.
    auc = roc_auc_score(y_true, y_hat) if len(np.unique(y_true)) > 1 else -1
    val_loss = np.mean(val_loss_ls)

    all_auc.append(auc)
    all_val_loss.append(val_loss)
    all_train_loss.append(train_loss)

    print(f"Epoch {epoch}: AUC={auc:.2f}, Train Loss={train_loss:.2f}, Val Loss={val_loss:.2f}\n")

In [None]:
all_auc

In [None]:
torch.save(model.state_dict(), "/content/drive/MyDrive/Linguistic Causation/Roberta Model/Saved Models/Supervised_10_epochs.pth")

In [None]:
import matplotlib.pyplot as plt
plt.plot(list(range(num_epochs)), all_auc) 

In [None]:
plt.plot(list(range(num_epochs)), all_val_loss) 

In [None]:
plt.plot(list(range(num_epochs)), all_train_loss)

In [None]:
len(features_arr)

In [None]:
print(list(trainable_sentences.values()).count(1))
print(list(trainable_sentences.values()).count(0))

In [None]:
stop_code

In [None]:
p = Predict("/content/drive/MyDrive/Linguistic Causation/Roberta Model/Saved Models/Supervised_10_epochs.pth") 
# model.load_state_dict(torch.load(bert_model_path))
# model.eval()

In [None]:
p.predict("with camp we can accelerate axon growth") 

In [None]:
p.predict("we show that mag is a strong inhibitor") 

# Validating Sentences can be Separated

In [None]:
orig_molecules = ["netrin", "ephrin", "laminin", "tenascin", "cspg", "zymosan", "camp", "pten", "cntf", "lif", "oncomodulin", "stat3", "socs3", "rhoa", "rock", "y27632", "nogo", "klf", "ngr", "lar", "tlr2", "bdnf", "igf1", "opn", "mag", "omgp", "kspg", "taxol"]

In [None]:
molecule_labels = [1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1] 
label_dict = {k:v for (k,v) in zip(orig_molecules, molecule_labels)}

In [None]:
import pandas as pd
new_molecules_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/NLP - Lab/GraphSage/Named_Entity_Recognition/Abrv_Curated.csv", header=None)
new_molecules_df

In [None]:
molecules = orig_molecules + list(set(new_molecules_df.loc[:,0]))

In [None]:
molecules = list(set(molecules))

In [None]:
new_molecules_df[new_molecules_df[0] == 'if']

In [None]:
# Entries of the curated abbreviation list that are dropped from `molecules`
# (presumably ordinary words / fragments rather than molecule names).
NON_MOLECULE_TOKENS = {"if", "of", "am", "fat", "th", "th-", "m.", "set", "protein"}

# Filtering keeps this cell re-runnable: the chained molecules.remove(...) calls
# raised ValueError as soon as one token was already gone (second run) or was
# never in the list.
molecules = [m for m in molecules if m not in NON_MOLECULE_TOKENS]

In [None]:
import numpy as np
# ensure no duplicates 
val = np.array(molecules) 
ii = np.where(val == "hrp")[0]
ii

In [None]:
molecule_labels = [1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1] 
len(molecules) == len(molecule_labels)
node_colors = ["green" if x==1 else "red" for x in molecule_labels]

In [None]:
import gensim 

MODEL_PATH_ALL = "/content/drive/MyDrive/Colab Notebooks/NLP - Lab/WordEmbeddings/Models/Gensim_Lemmatized_All_Docs/"
model_all_years = gensim.models.KeyedVectors.load(MODEL_PATH_ALL + "{}.wordvectors".format("All Papers"), mmap='r')
wordvec = model_all_years

# Keep only molecules that have a vector; `molecules` and `molecules_embedded`
# end up aligned by position.
molecules_embedded = []
embeddable = []
for name in molecules:
  try:
    vector = wordvec[name]
  except KeyError:
    continue  # no embedding for this name: drop it
  embeddable.append(name)
  molecules_embedded.append(vector)
molecules[:] = embeddable  # update the existing list in place

In [None]:
print(len(molecules))
print(len(molecules_embedded))

In [None]:
# make sure to initialize this AFTER removing molecules with no embeddings
orig_molecules_double_spaced = [" " + m + " " for m in orig_molecules]
print(orig_molecules_double_spaced)

In [None]:
from collections import defaultdict
# found_sentences maps stripped sentence -> [count, count]: index 0 counts
# mentions of label-0 molecules (inhibitors), index 1 counts mentions of
# label-1 molecules (promoters).
# The per-match progress prints were removed: they emitted two lines for every
# hit and flooded the cell output.
found_sentences = defaultdict(lambda: [0, 0])
for sentence in all_docs_1D:
  for molecule in orig_molecules_double_spaced:
    if molecule in sentence:
      found_sentences[sentence.strip()][label_dict[molecule.strip()]] += 1

In [None]:
sentence_df = pd.DataFrame(found_sentences).T
sentence_df.iloc[1000:1010]

In [None]:
import matplotlib.pyplot as plt

In [None]:
sentence_df.columns = ['Num Inhibitors', 'Num Promoters']
sentence_df

In [None]:
sentence_df['Num Promoters'].values

In [None]:
num_inhibs = sentence_df['Num Inhibitors'].values
num_promote = sentence_df['Num Promoters'].values

In [None]:
size_dict = defaultdict(int)
for x,y in zip(num_inhibs, num_promote):
  size_dict[(x,y)] += 1

In [None]:
sum(size_dict.values())

In [None]:
len(num_promote)

In [None]:
size_dict

In [None]:
# A sentence is "non-overlapping" when at least one of its two counts is zero,
# i.e. it does not mix inhibitors and promoters.
total = sum(
    count
    for (n_inhib, n_promote), count in size_dict.items()
    if n_inhib == 0 or n_promote == 0
)

print(f"Total non-overlap sentences: {total}/{sum(size_dict.values())} = {total/sum(size_dict.values()) * 100}%")

In [None]:
alpha = 0.7  # marker transparency so overlapping bubbles stay visible

fig, ax = plt.subplots(figsize=(7, 7))

# One bubble per (inhibitor count, promoter count) pair, sized by the number
# of sentences that share that pair.
x, y = zip(*size_dict.keys())
ax.scatter(x, y, s=list(size_dict.values()), alpha=alpha)

# ax.set(aspect="equal", xlabel="$X_1$", ylabel="$X_2$")
ax.set_title("Visualization of Promoters and Inhibitors Per Sentence")
ax.set_xlabel("Num Inhibitors")
ax.set_ylabel("Num Promoters")
plt.show()

In [None]:
stop_code

# Creating Supervised Sentences from Total Corpus by Extracting Promoters and Inhibitors


In [None]:
x = [[],[]]
x[0].append(1)
x

In [None]:
from collections import defaultdict
# found_sentences maps stripped sentence -> [inhibitor tags, promoter tags],
# where each tag is the molecule name followed by its role in parentheses.
found_sentences = defaultdict(list)
for sentence in all_docs_1D:
  key = sentence.strip()
  for spaced in orig_molecules_double_spaced:
    if spaced not in sentence:
      continue
    name = spaced.strip()
    # First hit for this sentence: start from two empty buckets.
    buckets = found_sentences[key] or [[], []]
    label = label_dict[name]
    if label == 0:
      buckets[0].append(name + " (inhibitor) ")
    elif label == 1:
      buckets[1].append(name + " (promoter) ")
    found_sentences[key] = buckets

In [None]:
found_sentences

In [None]:
# Collect every row first and build the frame in one call. Growing the frame
# with supervised_df.loc[i] = ... copies it on each insert, which is quadratic
# in the number of sentences.
records = [
    (sentence, " ".join(inhibitors), " ".join(promoters))
    for sentence, (inhibitors, promoters) in found_sentences.items()
]
supervised_df = pd.DataFrame(records, columns=["Sentence", "Inhibitors", "Promoters"])

supervised_df

In [None]:
supervised_df.to_csv("/content/drive/MyDrive/Colab Notebooks/NLP - Lab/GraphSage/SupervisedLearning/FoundSentences.csv")

In [None]:
# import csv

# with open("/content/drive/MyDrive/Colab Notebooks/NLP - Lab/GraphSage/SupervisedLearning/FoundSentences.csv", 'w') as f:  
#     for k,v in found_sentences.items():
#       f.write(k.replace(",", "") + "," + str(v).replace("[", "").replace("]", "").replace(",", "").replace("'","") + "\n")