In [0]:
# Purpose: generate text embeddings with pre-trained BERT models.
# The autoencoders used for merging embeddings (and for dimensionality reduction) are defined and trained in this notebook.
# Three BERT models are used here: BioBERT, SciBERT and Clinical BERT.
# Towards the end of the notebook, the non-linear graph representation of users is merged with the text-based representation.
# Note: no embedding evaluation happens in this notebook; each matrix is generated and saved so that the evaluation can be carried out on a local machine.
# Reason: Colab only supports 8 hours of usage per day, variables lose their values on disconnect, and retraining the encoders is time-consuming.

In [0]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('popular')
from scipy import spatial
from nltk.corpus import stopwords
# English stop-word list from NLTK, stored as a set; removeStopWordsandLemmatize tests tokens against it.
stop_words = set(stopwords.words('english'))
# Characters deleted by removePunc. Written as a raw string because the backslash is a
# literal member of the set: without the r-prefix, "\," is an invalid escape sequence
# (a warning on current Python versions). The resulting string value is unchanged.
punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
# Shared WordNet lemmatizer instance used by removeStopWordsandLemmatize.
lemmatizer = WordNetLemmatizer()

In [0]:
# Load the raw spreadsheet into a DataFrame. The path points into a Google Drive folder
# under /content/drive; no mount call is visible in this notebook, so the Drive is
# presumably mounted beforehand -- TODO confirm.
df = pd.read_excel('/content/drive/My Drive/Colab Notebooks/20200507_Cardiology_HCP_Data.xlsx')

In [0]:
# Columns used to build the per-user representation; the first entry is the grouping key.
Embedding_Column_Name = ["AUTHOR_ID", "TRANS_AUTHOR_BIO","Account Type","Snippet"]
Main_DF = df[Embedding_Column_Name]
# Collapse to one row per AUTHOR_ID, gathering each remaining column's values into a list.
# The value columns are selected with a list: selecting several GroupBy columns with a bare
# tuple of names (groupby(...)["a", "b"]) is deprecated and raises on recent pandas versions.
Main_DF = Main_DF.groupby('AUTHOR_ID', as_index=False)[Embedding_Column_Name[1:]].agg(lambda values: list(values))

In [0]:
#### Making a cleaner dataframe ####
# Remove duplicate entries from every per-author list.
# The column is rebuilt and assigned back explicitly: values written to the rows yielded by
# DataFrame.iterrows() may land on a copy and never reach the frame.
# dict.fromkeys de-duplicates like set() but keeps first-seen order, so the result is
# reproducible between runs.
for columName in Embedding_Column_Name[1:]:
    Main_DF[columName] = Main_DF[columName].apply(lambda values: list(dict.fromkeys(values)))

In [0]:
# Work on a copy so that columns added later do not modify Main_DF.
Main_UsrDoc_df = Main_DF.copy()

In [0]:
################ Cleaning the tweets #################
# lowercase
# remove http
# remove punctuations
# remove @
# remove #
# remove stopwords or lemmatize

def RemoveHTTP(tweet):
  """Remove http:// and https:// links from a tweet.

  The input is converted with str() for matching. When no link is present the
  original object is returned untouched; otherwise a string is returned with
  every link removed and runs of whitespace collapsed to single spaces.

  Unlike a single anchored match that keeps only the text before the first link,
  this keeps the words that follow a link, handles plain http links, and works
  on multi-line text.
  """
  text = str(tweet)
  # Raw string so that \S reaches the regex engine instead of being parsed as a string escape.
  without_links, removed = re.subn(r'https?://\S+', ' ', text)
  if removed == 0:
    return tweet
  # split()/join collapses the gaps left behind by the removed links.
  return ' '.join(without_links.split())

def removePunc(inputstr):
  """Return `inputstr` with every character listed in the module-level `punctuations` deleted.

  Letter case is left untouched; only the listed characters are dropped.
  """
  # A translation table with an empty mapping and a deletion set removes the characters in one pass.
  deletion_table = str.maketrans('', '', punctuations)
  return inputstr.translate(deletion_table)

def Clean(tweet):
  """Strip Twitter markup from a tweet and remove punctuation.

  The input is converted with str(); '@' and '#' characters are removed, the
  retweet marker "RT" is removed, and the result is passed through removePunc.

  "RT" is matched only as a standalone word. An unanchored replacement would
  also delete the letters from inside ordinary words (e.g. "HEART" -> "HEA").
  """
  text = re.sub(r'[@#]', '', str(tweet))
  # \b word boundaries keep "RT" inside longer words intact.
  text = re.sub(r'\bRT\b', '', text)
  return removePunc(text)

def removeStopWordsandLemmatize(inputstr):
  """Tokenize `inputstr`, drop tokens found in `stop_words`, lemmatize the rest, and
  return the surviving lemmas joined by single spaces.

  The stop-word test is an exact (case-sensitive) membership check.
  """
  lemmas = []
  for word in word_tokenize(inputstr):
    if word in stop_words:
      continue
    lemmas.append(lemmatizer.lemmatize(word))
  return ' '.join(lemmas)

In [0]:
# Install the pinned biobert-embedding package into the runtime, then create the
# BiobertEmbedding object used below through its sentence_vector() method.
# NOTE(review): installs and imports sit mid-notebook; a top install/import cell would
# make a fresh "Run All" easier to follow.
!pip install biobert-embedding==0.1.2
from biobert_embedding.embedding import BiobertEmbedding
biobert_model = BiobertEmbedding()

In [0]:
################### Creating Training data and training AutoEncoder 1536 to 768 ##################

In [0]:
def CompleteCleaning(tweet):
  """Run the full text-cleaning pipeline on one tweet.

  Order of steps: RemoveHTTP -> Clean -> removeStopWordsandLemmatize.
  Returns the cleaned string.
  """
  return removeStopWordsandLemmatize(Clean(RemoveHTTP(tweet)))

In [0]:
# Autoencoder training users: the last 2500 rows, since we will be evaluating with the top user data.
# .copy() makes train_df an independent frame, so the columns assigned to it in later cells
# are not written onto a slice of Main_UsrDoc_df (which triggers SettingWithCopyWarning).
train_df = Main_UsrDoc_df.tail(2500).copy()

In [0]:
# One entry per training user: that user's 'Snippet' value (a list of tweets), in row order.
BuildCorpus = list(train_df['Snippet'])

In [0]:
# Flatten the per-user tweet lists into one flat list of tweets (user order, then tweet order).
# The per-tweet debug print was dropped: it flooded the cell output with every raw tweet
# and left user text stored in the saved notebook.
BuildCorpus_New = [tweet for tweetList in BuildCorpus for tweet in tweetList]

In [0]:
# Clean every tweet, replacing the list contents in place (same list object).
# Not idempotent: re-running this cell cleans the already-cleaned text again.
BuildCorpus_New[:] = [CompleteCleaning(raw_tweet) for raw_tweet in BuildCorpus_New]

In [0]:
# Training matrix for the 1536 -> 768 autoencoder: each row is the concatenation of the
# BioBERT sentence vectors of two consecutive tweets (presumably 768 values each, given the
# 1536-wide rows -- TODO confirm against the biobert-embedding package).
#
# Fixes relative to stacking onto a zero vector inside the loop:
#  * no all-zero placeholder row ends up in the training data;
#  * an odd number of tweets no longer raises IndexError -- the unpaired last tweet is skipped;
#  * rows are collected in a list and converted once instead of re-copying the matrix per pair.
pair_rows = []
paired_len = len(BuildCorpus_New) - (len(BuildCorpus_New) % 2)
for i in range(0, paired_len, 2):
  emb_i = biobert_model.sentence_vector(BuildCorpus_New[i])
  emb_j = biobert_model.sentence_vector(BuildCorpus_New[i + 1])
  # axis=None flattens both vectors into one 1-D row; the leading [] keeps the float64 dtype.
  pair_rows.append(np.concatenate(([], emb_i, emb_j), axis=None))
  print("Epoch " + str(i) + " / " + str(len(BuildCorpus_New)))
# reshape keeps the matrix 2-D even when no pair was produced.
TrainMat1536 = np.asarray(pair_rows, dtype=np.float64).reshape(-1, 1536)

In [0]:
# Display the dimensions of the training matrix.
TrainMat1536.shape

In [0]:
############### creating Auto Encoder for 1536 to 768 ################

In [0]:
# Shuffle the rows in place (unseeded, so the split differs between runs), then cut at
# row 1900: the first part trains the autoencoder and the remainder validates it.
np.random.shuffle(TrainMat1536)
training_data, test_data = np.split(TrainMat1536, [1900], axis=0)

In [0]:
import tensorflow as tf
import keras 
from keras.models import Sequential,Model
from keras.layers import Dense,Dropout,Flatten,Input
from keras.layers import BatchNormalization,Activation
from keras.optimizers import Adam
from keras.losses import mean_squared_error

In [0]:
# Autoencoder 1536 -> 768 -> 1536 with a single hidden (bottleneck) layer.
encoding_dims = 768
input_dim = Input(shape=(1536,))
encoded = Dense(units=encoding_dims, activation='relu')(input_dim)
decoded = Dense(units=1536, activation='tanh')(encoded)
autoencoder_1536to768 = Model(inputs=input_dim, outputs=decoded)

# Encoder half: shares the input and bottleneck layer with the autoencoder.
encoder_1536to768 = Model(inputs=input_dim, outputs=encoded)

# Decoder half: re-applies the autoencoder's last layer to a standalone 768-wide input.
encoded_input = Input(shape=(encoding_dims,))
decoder_layer = autoencoder_1536to768.layers[-1]
decoder_1536to768 = Model(inputs=encoded_input, outputs=decoder_layer(encoded_input))

# Reconstruction objective: mean squared error, optimised with Adadelta.
mse = tf.keras.losses.MeanSquaredError()
autoencoder_1536to768.compile(loss=mse, optimizer='adadelta')

In [0]:
# Train the autoencoder to reproduce its own input; the held-out rows are used for validation.
autoencoder_1536to768.fit(
    x=training_data,
    y=training_data,
    validation_data=(test_data, test_data),
    batch_size=250,
    epochs=500,
    shuffle=True,
)

In [0]:
# Placeholder column (empty strings) intended to hold the generated vector.
train_df['CombinedEmbedding'] = ""  # Adding additional column to store the generated vector

In [0]:
# Accepts a list of tweets and returns one embedding using the user-doc approach --
# the autoencoders are mainly used in this approach.
# BioBERT generates the sentence embeddings.
def generateSnippetEmbeding(tweetList):
  """Fold a user's tweets into a single embedding.

  Every tweet is cleaned and embedded with biobert_model.sentence_vector. The
  first embedding is the running vector; each further embedding is concatenated
  to it and the pair is compressed with encoder_1536to768, whose output becomes
  the new running vector.

  Returns the running vector as a numpy array. With a single tweet this is that
  tweet's embedding as returned by BioBERT. An empty `tweetList` raises
  IndexError.
  """
  tweetEmbed = [biobert_model.sentence_vector(CompleteCleaning(tweet)) for tweet in tweetList]
  mainvec = tweetEmbed[0]
  for nextEmb in tweetEmbed[1:]:
    # predict() needs a 2-D batch; a one-row batch replaces the previous trick of stacking
    # the same row twice, which doubled the encoder work for every tweet.
    pair = np.concatenate((mainvec, nextEmb), axis=None).reshape(1, -1)
    mainvec = encoder_1536to768.predict(pair)[0]
  return np.asarray(mainvec)


In [0]:
# Placeholder column (empty strings) that will receive the tweet embedding of each user.
train_df['SnippetEmbedding'] = "" # adding column to accept tweet embedding

In [0]:
# Compute the folded tweet embedding of every training user, in row order.
train_df['SnippetEmbedding'] = [generateSnippetEmbeding(tweets) for tweets in train_df['Snippet']]

In [0]:
# Placeholder column (empty strings) that will receive the embedding of each user's bio text.
train_df['TranAuthorEmb'] = "" #adding column to accept user about info embedding

In [0]:
# Drop rows containing missing values. The name is rebound, so the pre-dropna frame
# is no longer reachable after this cell runs.
train_df = train_df.dropna()

In [0]:
# Compute the tweet embedding of every training user and store it in 'SnippetEmbedding'.
# The results are collected and assigned as a whole column: writing into the row objects
# yielded by iterrows() may modify a copy, leaving the DataFrame unchanged.
# The progress total is taken from the frame instead of a hard-coded figure.
count = 1
snippet_embeddings = []
for _,x in train_df.iterrows():
  snippet_embeddings.append(generateSnippetEmbeding(x['Snippet']))
  print("Epoch " + str(count) + " / " + str(len(train_df)))
  count = count + 1
# dtype=object keeps one array per cell; the index aligns the values with the rows.
train_df['SnippetEmbedding'] = pd.Series(snippet_embeddings, index=train_df.index, dtype=object)

In [0]:
# Embed every training user's 'TRANS_AUTHOR_BIO' value with BioBERT and store it in 'TranAuthorEmb'.
# The results are collected and assigned as a whole column: writing into the row objects
# yielded by iterrows() may modify a copy, leaving the DataFrame unchanged.
# The progress total is taken from the frame instead of a hard-coded figure.
count = 1
bio_embeddings = []
for _,x in train_df.iterrows():
  bio_embeddings.append(np.asarray(biobert_model.sentence_vector(CompleteCleaning(x['TRANS_AUTHOR_BIO']))))
  print("Epoch "+str(count) + " / " + str(len(train_df)))
  count = count + 1
# dtype=object keeps one array per cell; the index aligns the values with the rows.
train_df['TranAuthorEmb'] = pd.Series(bio_embeddings, index=train_df.index, dtype=object)

In [0]:
# Sanity check: shape of one stored bio embedding (the last row). Positional access is used
# because a fixed row label such as 4000 need not exist in train_df after tail()/dropna().
train_df['TranAuthorEmb'].iloc[-1].shape

In [0]:
############ Train Matrix for Combining the Embedding that is 1536 to 300 dimension ###########

In [0]:
# Training matrix for the 1536 -> 300 autoencoder: one row per training user, made of the
# tweet embedding followed by the bio embedding. Row 0 is an all-zero placeholder that a
# later cell strips off.
combined_rows = [np.zeros(1536)]
for _, x in train_df.iterrows():
  # The leading [] makes the flattened result float64, as when concatenating onto an empty list.
  combined_rows.append(np.concatenate(([], x['SnippetEmbedding'], x['TranAuthorEmb']), axis=None))
# With no users only the 1-D placeholder remains.
Mat1536to300 = np.vstack(combined_rows) if len(combined_rows) > 1 else combined_rows[0]

In [0]:
# Drop row 0, the all-zero placeholder that seeded the stack.
# Not idempotent: running this cell a second time removes a real data row.
Mat1536to300 = Mat1536to300[1:,:]

In [0]:
# Shuffle the rows in place (unseeded), then cut at row 1900: training part first,
# validation part second. Note that training_data/test_data from the earlier split are rebound.
np.random.shuffle(Mat1536to300)
training_data, test_data = np.split(Mat1536to300, [1900], axis=0)

In [0]:
# Autoencoder 1536 -> 300 -> 1536 with a single hidden (bottleneck) layer.
# The helper names (encoding_dims, input_dim, encoded, ...) are rebound by this cell.
encoding_dims = 300
input_dim = Input(shape=(1536,))
encoded = Dense(units=encoding_dims, activation='relu')(input_dim)
decoded = Dense(units=1536, activation='tanh')(encoded)
autoencoder_1536to300 = Model(inputs=input_dim, outputs=decoded)

# Encoder half: shares the input and bottleneck layer with the autoencoder.
encoder_1536to300 = Model(inputs=input_dim, outputs=encoded)

# Decoder half: re-applies the autoencoder's last layer to a standalone 300-wide input.
encoded_input = Input(shape=(encoding_dims,))
decoder_layer = autoencoder_1536to300.layers[-1]
decoder_1536to300 = Model(inputs=encoded_input, outputs=decoder_layer(encoded_input))

# Reconstruction objective: mean squared error, optimised with Adadelta.
mse = tf.keras.losses.MeanSquaredError()
autoencoder_1536to300.compile(loss=mse, optimizer='adadelta')

In [0]:
# Train the autoencoder to reproduce its own input; the held-out rows are used for validation.
autoencoder_1536to300.fit(
    x=training_data,
    y=training_data,
    validation_data=(test_data, test_data),
    batch_size=250,
    epochs=500,
    shuffle=True,
)

In [0]:
# Save the trained autoencoders (full model, decoder and encoder of each size) to Drive for later use.
from google.colab import files
_models_and_targets = (
  (autoencoder_1536to300, '/content/drive/My Drive/Colab Notebooks/tweets/autoencoder_1536to300.h5'),
  (decoder_1536to300, '/content/drive/My Drive/Colab Notebooks/tweets/decoder_1536to300.h5'),
  (encoder_1536to300, '/content/drive/My Drive/Colab Notebooks/tweets/encoder_1536to300.h5'),
  (autoencoder_1536to768, '/content/drive/My Drive/Colab Notebooks/tweets/autoencoder_1536to768.h5'),
  (encoder_1536to768, '/content/drive/My Drive/Colab Notebooks/tweets/encoder_1536to768.h5'),
  (decoder_1536to768, '/content/drive/My Drive/Colab Notebooks/tweets/decoder_1536to768.h5'),
)
for _trained_model, _target_path in _models_and_targets:
  _trained_model.save(_target_path)

In [0]:
# Accepts one user's row and generates its embedding with the BERT model and the trained
# autoencoders; the resulting 1536-wide vector is reduced to the final user embedding.
def Generate_CombinedEmbedding(rowd):
  """Return the reduced embedding of one user row.

  `rowd` must provide 'Snippet' (list of tweets) and 'TRANS_AUTHOR_BIO'.
  The folded tweet embedding and the BioBERT embedding of the cleaned bio are
  concatenated and passed through encoder_1536to300; that encoder's output row
  is returned as a numpy array.
  """
  snipetEmb = generateSnippetEmbeding(rowd['Snippet'])
  transAuthEmb = np.asarray(biobert_model.sentence_vector(CompleteCleaning(rowd['TRANS_AUTHOR_BIO'])))
  # predict() needs a 2-D batch; a one-row batch replaces the previous dummy all-zero row
  # that was pushed through the encoder and then discarded.
  combined = np.concatenate((snipetEmb, transAuthEmb), axis=None).reshape(1, -1)
  return np.asarray(encoder_1536to300.predict(combined)[0])

In [0]:
# Placeholder column (empty strings) that will receive each user's reduced embedding.
Main_UsrDoc_df['CombinedEmbedding'] = ""

In [0]:
# Compute the reduced embedding of every user and store it in 'CombinedEmbedding'.
# The results are collected and assigned as a whole column: writing into the row objects
# yielded by iterrows() may modify a copy, leaving the DataFrame unchanged.
count = 1
combined_embeddings = []
for _,x in Main_UsrDoc_df.iterrows():
  combined_embeddings.append(Generate_CombinedEmbedding(x))
  print("Epoch "+str(count) + " / " + str(Main_UsrDoc_df.shape[0]))
  count = count + 1
# dtype=object keeps one array per cell; the index aligns the values with the rows.
Main_UsrDoc_df['CombinedEmbedding'] = pd.Series(combined_embeddings, index=Main_UsrDoc_df.index, dtype=object)

In [0]:
# Matrix of reduced (300-wide) embeddings for the first 900 users, to be evaluated locally.
# Row 0 is an all-zero placeholder that a later cell strips off.
count = 1
reduced_rows = [np.zeros(300)]
for _, x in Main_UsrDoc_df.head(900).iterrows():
  reduced_rows.append(np.asarray(x['CombinedEmbedding']))
  print("Epoch " + str(count) + " / 900")
  count += 1
# With no users only the 1-D placeholder remains.
UserDocTop900Emb_BioBert = np.vstack(reduced_rows) if len(reduced_rows) > 1 else reduced_rows[0]

In [0]:
# Display the matrix dimensions (still includes the placeholder row at this point).
UserDocTop900Emb_BioBert.shape

In [0]:
# Drop row 0, the all-zero placeholder that seeded the stack.
# Not idempotent: running this cell a second time removes a real data row.
UserDocTop900Emb_BioBert = UserDocTop900Emb_BioBert[1:,:]

In [0]:
# Write the reduced BioBERT matrix to Drive as text, 8 decimals per value.
fileName = '/content/drive/My Drive/Colab Notebooks/BioBERT_UserDoc_Combined_Matrix.txt'
np.savetxt(fname=fileName, X=UserDocTop900Emb_BioBert, fmt='%.8f')

In [0]:
def Generate_1536Embedding(rowd):
  """Return the unreduced embedding of one user row.

  The folded tweet embedding of rowd['Snippet'] and the BioBERT embedding of the
  cleaned rowd['TRANS_AUTHOR_BIO'] are flattened and concatenated, in that order,
  into a single 1-D numpy array.
  """
  parts = (
    generateSnippetEmbeding(rowd['Snippet']),
    np.asarray(biobert_model.sentence_vector(CompleteCleaning(rowd['TRANS_AUTHOR_BIO']))),
  )
  return np.asarray(np.concatenate(parts, axis=None))

In [0]:
# Matrix of full (1536-wide) embeddings for the first 1000 users, to be evaluated locally.
# Row 0 is an all-zero placeholder that a later cell strips off.
count = 1
full_rows = [np.zeros(1536)]
for _, x in Main_UsrDoc_df.head(1000).iterrows():
  full_rows.append(Generate_1536Embedding(x))
  print("Epoch "+ str(count) + " / 1000")
  count += 1
# With no users only the 1-D placeholder remains.
TransMat1536_full = np.vstack(full_rows) if len(full_rows) > 1 else full_rows[0]

In [0]:
# Display the matrix dimensions (still includes the placeholder row at this point).
TransMat1536_full.shape

In [0]:
# Drop row 0, the all-zero placeholder that seeded the stack.
# Not idempotent: running this cell a second time removes a real data row.
TransMat1536_full = TransMat1536_full[1:,:]

In [0]:
# Write the full-width BioBERT matrix to Drive as text, 8 decimals per value.
fileName = '/content/drive/My Drive/Colab Notebooks/BioBERT_UserDoc_Full_Matrix.txt'
np.savetxt(fname=fileName, X=TransMat1536_full, fmt='%.8f')

In [0]:
############################# Trying Sci BERT ########################## 

In [0]:
# Install Hugging Face transformers and load the SciBERT (scivocab, uncased) tokenizer and model.
# NOTE(review): the install is unpinned, so the library version can differ between runs.
# The generic names `tokenizer` and `model` are read as globals by GenerateEmbSCiBERT.
!pip install transformers
import torch
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")

In [0]:
def GenerateEmbSCiBERT(text):
  """Embed `text` with the SciBERT model loaded in `model` / `tokenizer`.

  The text is tokenized, passed through the model, and the first model output
  is averaged over the token axis. Returns a numpy array with a leading batch
  dimension of 1.
  """
  # Truncate to 512 tokens so that long texts do not exceed the model's position limit.
  token_ids = tokenizer.encode(text, truncation=True, max_length=512)
  input_ids = torch.tensor(token_ids).unsqueeze(0)
  # Inference only: no_grad avoids building an autograd graph on every call.
  with torch.no_grad():
    output = model(input_ids)
  d1 = output[0].mean(1).detach().numpy()
  return d1

In [0]:
def generateSnippetEmbeding_Sci(tweetList):
  """Fold a user's tweets into a single embedding using SciBERT.

  Every tweet is cleaned and embedded with GenerateEmbSCiBERT. The first
  embedding is the running vector; each further embedding is concatenated to it
  and the pair is compressed with encoder_1536to768, whose output becomes the
  new running vector.

  Returns the running vector as a numpy array. With a single tweet this is that
  tweet's embedding exactly as GenerateEmbSCiBERT returned it. An empty
  `tweetList` raises IndexError.

  NOTE(review): encoder_1536to768 is fitted on BioBERT vectors elsewhere in this
  notebook -- confirm that applying it to SciBERT vectors is intended.
  """
  tweetEmbed = [GenerateEmbSCiBERT(CompleteCleaning(tweet)) for tweet in tweetList]
  mainvec = tweetEmbed[0]
  for nextEmb in tweetEmbed[1:]:
    # predict() needs a 2-D batch; a one-row batch replaces the previous trick of stacking
    # the same row twice, which doubled the encoder work for every tweet.
    pair = np.concatenate((mainvec, nextEmb), axis=None).reshape(1, -1)
    mainvec = encoder_1536to768.predict(pair)[0]
  return np.asarray(mainvec)


In [0]:
def Generate_CombinedEmbedding_Sci(rowd):
  """Return the reduced SciBERT embedding of one user row.

  `rowd` must provide 'Snippet' (list of tweets) and 'TRANS_AUTHOR_BIO'.
  The folded tweet embedding and the SciBERT embedding of the cleaned bio are
  flattened, concatenated and passed through encoder_1536to300; that encoder's
  output row is returned as a numpy array.
  """
  snipetEmb = generateSnippetEmbeding_Sci(rowd['Snippet'])
  transAuthEmb = GenerateEmbSCiBERT(CompleteCleaning(rowd['TRANS_AUTHOR_BIO']))
  # predict() needs a 2-D batch; a one-row batch replaces the previous dummy all-zero row
  # that was pushed through the encoder and then discarded.
  combined = np.concatenate((snipetEmb, transAuthEmb), axis=None).reshape(1, -1)
  return np.asarray(encoder_1536to300.predict(combined)[0])

In [0]:
def Generate_1536Embedding_Sci(rowd):
  """Return the unreduced SciBERT embedding of one user row.

  The folded tweet embedding of rowd['Snippet'] and the SciBERT embedding of the
  cleaned rowd['TRANS_AUTHOR_BIO'] are flattened and concatenated, in that order,
  into a single 1-D numpy array.
  """
  parts = (
    generateSnippetEmbeding_Sci(rowd['Snippet']),
    GenerateEmbSCiBERT(CompleteCleaning(rowd['TRANS_AUTHOR_BIO'])),
  )
  return np.asarray(np.concatenate(parts, axis=None))

In [0]:
# Matrix of reduced (300-wide) SciBERT embeddings for the first 1000 users.
# Row 0 is an all-zero placeholder that a later cell strips off.
count = 1
sci_reduced_rows = [np.zeros(300)]
for _, x in Main_UsrDoc_df.head(1000).iterrows():
    sci_reduced_rows.append(Generate_CombinedEmbedding_Sci(x))
    print("Epoch "+ str(count) + " / 1000")
    count += 1
# With no users only the 1-D placeholder remains.
SciBertMat_300 = np.vstack(sci_reduced_rows) if len(sci_reduced_rows) > 1 else sci_reduced_rows[0]

In [0]:
# Drop row 0, the all-zero placeholder that seeded the stack.
# Not idempotent: running this cell a second time removes a real data row.
SciBertMat_300 = SciBertMat_300[1:,:]

In [0]:
# Write the reduced SciBERT matrix to Drive as text, 8 decimals per value.
fileName = '/content/drive/My Drive/Colab Notebooks/SciBERT_UserDoc_Combined_Matrix.txt'
np.savetxt(fname=fileName, X=SciBertMat_300, fmt='%.8f')

In [0]:
# Matrix of full (1536-wide) SciBERT embeddings for the first 1000 users.
# Row 0 is an all-zero placeholder that a later cell strips off.
count = 1
sci_full_rows = [np.zeros(1536)]
for _, x in Main_UsrDoc_df.head(1000).iterrows():
    sci_full_rows.append(Generate_1536Embedding_Sci(x))
    print("Epoch "+ str(count) + " / 1000")
    count += 1
# With no users only the 1-D placeholder remains.
SciBertMat_1536 = np.vstack(sci_full_rows) if len(sci_full_rows) > 1 else sci_full_rows[0]

In [0]:
# Drop row 0, the all-zero placeholder that seeded the stack.
# Not idempotent: running this cell a second time removes a real data row.
SciBertMat_1536 = SciBertMat_1536[1:,:]

In [0]:
# Write the full-width SciBERT matrix to Drive as text, 8 decimals per value.
fileName = '/content/drive/My Drive/Colab Notebooks/SciBERT_UserDoc_1536_Matrix.txt'
np.savetxt(fname=fileName, X=SciBertMat_1536, fmt='%.8f')

In [0]:
################# Clinical BERT #####################

In [0]:
# Load the Bio_ClinicalBERT tokenizer and model; both are read as globals by GenerateEmbClinicalBERT.
# Relies on AutoTokenizer/AutoModel having been imported in an earlier cell.
tokenizer_clinical = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model_clinical = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

In [0]:
def GenerateEmbClinicalBERT(text):
  """Embed `text` with the Clinical BERT model loaded in `model_clinical` / `tokenizer_clinical`.

  The text is tokenized, passed through the model, and the first model output
  is averaged over the token axis. Returns a numpy array with a leading batch
  dimension of 1.
  """
  # Truncate to 512 tokens so that long texts do not exceed the model's position limit.
  token_ids = tokenizer_clinical.encode(text, truncation=True, max_length=512)
  input_ids = torch.tensor(token_ids).unsqueeze(0)
  # Inference only: no_grad avoids building an autograd graph on every call.
  with torch.no_grad():
    output = model_clinical(input_ids)
  d1 = output[0].mean(1).detach().numpy()
  return d1

In [0]:
def generateSnippetEmbeding_Clinical(tweetList):
  """Fold a user's tweets into a single embedding using Clinical BERT.

  Every tweet is cleaned and embedded with GenerateEmbClinicalBERT. The first
  embedding is the running vector; each further embedding is concatenated to it
  and the pair is compressed with encoder_1536to768, whose output becomes the
  new running vector.

  Returns the running vector as a numpy array. With a single tweet this is that
  tweet's embedding exactly as GenerateEmbClinicalBERT returned it. An empty
  `tweetList` raises IndexError.

  NOTE(review): encoder_1536to768 is fitted on BioBERT vectors elsewhere in this
  notebook -- confirm that applying it to Clinical BERT vectors is intended.
  """
  tweetEmbed = [GenerateEmbClinicalBERT(CompleteCleaning(tweet)) for tweet in tweetList]
  mainvec = tweetEmbed[0]
  for nextEmb in tweetEmbed[1:]:
    # predict() needs a 2-D batch; a one-row batch replaces the previous trick of stacking
    # the same row twice, which doubled the encoder work for every tweet.
    pair = np.concatenate((mainvec, nextEmb), axis=None).reshape(1, -1)
    mainvec = encoder_1536to768.predict(pair)[0]
  return np.asarray(mainvec)


In [0]:
def Generate_CombinedEmbedding_Clinical(rowd):
  """Return the reduced Clinical BERT embedding of one user row.

  `rowd` must provide 'Snippet' (list of tweets) and 'TRANS_AUTHOR_BIO'.
  The folded tweet embedding and the Clinical BERT embedding of the cleaned bio
  are flattened, concatenated and passed through encoder_1536to300; that
  encoder's output row is returned as a numpy array.
  """
  snipetEmb = generateSnippetEmbeding_Clinical(rowd['Snippet'])
  transAuthEmb = GenerateEmbClinicalBERT(CompleteCleaning(rowd['TRANS_AUTHOR_BIO']))
  # predict() needs a 2-D batch; a one-row batch replaces the previous dummy all-zero row
  # that was pushed through the encoder and then discarded.
  combined = np.concatenate((snipetEmb, transAuthEmb), axis=None).reshape(1, -1)
  return np.asarray(encoder_1536to300.predict(combined)[0])

In [0]:
def Generate_1536Embedding_Clinical(rowd):
  """Return the unreduced Clinical BERT embedding of one user row.

  The folded tweet embedding of rowd['Snippet'] and the Clinical BERT embedding of
  the cleaned rowd['TRANS_AUTHOR_BIO'] are flattened and concatenated, in that
  order, into a single 1-D numpy array.
  """
  parts = (
    generateSnippetEmbeding_Clinical(rowd['Snippet']),
    GenerateEmbClinicalBERT(CompleteCleaning(rowd['TRANS_AUTHOR_BIO'])),
  )
  return np.asarray(np.concatenate(parts, axis=None))

In [0]:
# Matrix of reduced (300-wide) Clinical BERT embeddings for the first 1000 users.
# Row 0 is an all-zero placeholder that a later cell strips off.
count = 1
clinical_reduced_rows = [np.zeros(300)]
for _, x in Main_UsrDoc_df.head(1000).iterrows():
    clinical_reduced_rows.append(Generate_CombinedEmbedding_Clinical(x))
    print("Epoch "+ str(count) + " / 1000")
    count += 1
# With no users only the 1-D placeholder remains.
ClinicalBertMat_300 = np.vstack(clinical_reduced_rows) if len(clinical_reduced_rows) > 1 else clinical_reduced_rows[0]

In [0]:
# Drop row 0, the all-zero placeholder that seeded the stack.
# Not idempotent: running this cell a second time removes a real data row.
ClinicalBertMat_300 = ClinicalBertMat_300[1:,:]

In [0]:
# Write the reduced Clinical BERT matrix to Drive as text, 8 decimals per value.
fileName = '/content/drive/My Drive/Colab Notebooks/ClinicalBERT_UserDoc_Combined_Matrix.txt'
np.savetxt(fname=fileName, X=ClinicalBertMat_300, fmt='%.8f')

In [0]:
# Matrix of full (1536-wide) Clinical BERT embeddings for the first 1000 users.
# Row 0 is an all-zero placeholder that a later cell strips off.
count = 1
clinical_full_rows = [np.zeros(1536)]
for _, x in Main_UsrDoc_df.head(1000).iterrows():
    clinical_full_rows.append(Generate_1536Embedding_Clinical(x))
    print("Epoch "+ str(count) + " / 1000")
    count += 1
# With no users only the 1-D placeholder remains.
ClinicalBertMat_1536 = np.vstack(clinical_full_rows) if len(clinical_full_rows) > 1 else clinical_full_rows[0]

In [0]:
# Drop row 0, the all-zero placeholder that seeded the stack.
# Not idempotent: running this cell a second time removes a real data row.
ClinicalBertMat_1536 = ClinicalBertMat_1536[1:,:]

In [0]:
# Write the full-width Clinical BERT matrix to Drive as text, 8 decimals per value.
fileName = '/content/drive/My Drive/Colab Notebooks/ClinicalBERT_UserDoc_1536_Matrix.txt'
np.savetxt(fname=fileName, X=ClinicalBertMat_1536, fmt='%.8f')

In [0]:
#################### MultiView Approach using Deep Walk network representation ################

In [0]:
import tensorflow as tf
import keras 
from keras.models import Sequential,Model
from keras.layers import Dense,Dropout,Flatten,Input
from keras.layers import BatchNormalization,Activation
from keras.optimizers import Adam
from keras.losses import mean_squared_error

In [0]:
# Load the user-representation matrix: each row holds the 1536-dimensional representation of
# the user's tweets and about-info text plus the 100-dimensional DeepWalk representation.
# The purpose of this matrix is to define and train an autoencoder that generates a
# reduced, lower-dimensional vector.
MultiView_1636_mat = np.loadtxt('/content/drive/My Drive/Colab Notebooks/Embeddings/MultiView_1636_Matrix.txt')

In [0]:
# Fixed positional split: rows 0-999 are the users to encode (`main`), rows 1000-3499 train
# the autoencoder and the remaining rows validate it. These are slices of the loaded
# matrix, so in-place operations on them also change MultiView_1636_mat.
main,X_train,X_test = MultiView_1636_mat[:1000],MultiView_1636_mat[1000:3500],MultiView_1636_mat[3500:]

In [0]:
# Shuffle the row order of the training and validation parts in place (unseeded).
# No further splitting happens here; the split itself is done in the previous cell.
np.random.shuffle(X_train)
np.random.shuffle(X_test)

In [0]:
# Autoencoder 1636 -> 550 -> 1636 with a single hidden (bottleneck) layer.
# The generic names autoencoder / encoder / decoder are rebound by this cell.
encoding_dims = 550
input_dim = Input(shape=(1636,))
encoded = Dense(units=encoding_dims, activation='relu')(input_dim)
decoded = Dense(units=1636, activation='tanh')(encoded)
autoencoder = Model(inputs=input_dim, outputs=decoded)

# Encoder half: shares the input and bottleneck layer with the autoencoder.
encoder = Model(inputs=input_dim, outputs=encoded)

# Decoder half: re-applies the autoencoder's last layer to a standalone 550-wide input.
encoded_input = Input(shape=(encoding_dims,))
decoder_layer = autoencoder.layers[-1]
decoder = Model(inputs=encoded_input, outputs=decoder_layer(encoded_input))

# Reconstruction objective: mean squared error, optimised with Adadelta.
mse = tf.keras.losses.MeanSquaredError()
autoencoder.compile(loss=mse, optimizer='adadelta')

In [0]:
# Train the autoencoder to reproduce its own input; X_test is used for validation.
autoencoder.fit(
    x=X_train,
    y=X_train,
    validation_data=(X_test, X_test),
    batch_size=250,
    epochs=500,
    shuffle=True,
)

In [0]:
# Encode the first 1000 users (`main`) with the encoder half of the autoencoder.
result_1000 = encoder.predict(main)

In [0]:
# Display the dimensions of the encoded matrix.
result_1000.shape

In [0]:
# Write the encoded multi-view matrix to Drive as text, 8 decimals per value.
fileName = '/content/drive/My Drive/Colab Notebooks/MultiView_DeepWalk_1636to550_Matrix.txt'
np.savetxt(fname=fileName, X=result_1000, fmt='%.8f')

In [0]:
################ trying to improve benchmark #############

In [0]:
# Load the matrix that combines the reduced 300-dimensional user representation (from tweets
# and about-info text) with the 100-dimensional DeepWalk representation.
# The purpose of this matrix is to define an autoencoder that generates a reduced version of
# the 400-dimensional data.
MultiView_400_mat = np.loadtxt('/content/drive/My Drive/Colab Notebooks/Embeddings/MultiView_Biobert_reduced_400.txt')

In [0]:
# Fixed positional split: rows 0-999 are the users to encode (`main`), rows 1000-3499 train
# the autoencoder and the remaining rows validate it. This rebinds main / X_train / X_test
# from the 1636-wide experiment; the slices share memory with MultiView_400_mat.
main,X_train,X_test = MultiView_400_mat[:1000],MultiView_400_mat[1000:3500],MultiView_400_mat[3500:]

In [0]:
# Shuffle the row order of the training and validation parts in place (unseeded).
# No further splitting happens here; the split itself is done in the previous cell.
np.random.shuffle(X_train)
np.random.shuffle(X_test)

In [0]:
# Autoencoder 400 -> 300 -> 400 with a single hidden (bottleneck) layer.
# The generic names autoencoder / encoder / decoder are rebound by this cell.
encoding_dims = 300
input_dim = Input(shape=(400,))
encoded = Dense(units=encoding_dims, activation='relu')(input_dim)
decoded = Dense(units=400, activation='tanh')(encoded)
autoencoder = Model(inputs=input_dim, outputs=decoded)

# Encoder half: shares the input and bottleneck layer with the autoencoder.
encoder = Model(inputs=input_dim, outputs=encoded)

# Decoder half: re-applies the autoencoder's last layer to a standalone 300-wide input.
encoded_input = Input(shape=(encoding_dims,))
decoder_layer = autoencoder.layers[-1]
decoder = Model(inputs=encoded_input, outputs=decoder_layer(encoded_input))

# Reconstruction objective: mean squared error, optimised with Adadelta.
mse = tf.keras.losses.MeanSquaredError()
autoencoder.compile(loss=mse, optimizer='adadelta')

In [0]:
# Train the autoencoder to reproduce its own input; X_test is used for validation.
autoencoder.fit(
    x=X_train,
    y=X_train,
    validation_data=(X_test, X_test),
    batch_size=250,
    epochs=1000,
    shuffle=True,
)

In [0]:
# Encode the first 1000 users (`main`) with the encoder half of the autoencoder.
# This rebinds result_1000 from the 1636-wide experiment.
result_1000 = encoder.predict(main)

In [0]:
# Display the dimensions of the encoded matrix.
result_1000.shape

In [0]:
# Write the encoded 400 -> 300 multi-view matrix to Drive as text, 8 decimals per value.
fileName = '/content/drive/My Drive/Colab Notebooks/MultiView_DeepWalk_reduced_biobert_400to300_Matrix.txt'
np.savetxt(fname=fileName, X=result_1000, fmt='%.8f')