In [0]:
# The purpose of this notebook is to define the code for updating a user embedding,
# using either the user-doc approach or the post-doc approach.

In [0]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('popular')
from scipy import spatial
from nltk.corpus import stopwords
# Stop-word set used by removeStopWordsandLemmatize for membership tests.
stop_words = set(stopwords.words('english'))
# Characters stripped by removePunc. Written as a raw string so the backslash is
# an ordinary character instead of the invalid escape sequence "\," (which newer
# Python versions warn about); the resulting string value is unchanged.
punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
# Shared lemmatizer instance used by removeStopWordsandLemmatize.
lemmatizer = WordNetLemmatizer()

In [0]:
################ Cleaning the tweets #################
# lowercase
# remove http
# remove punctuations
# remove @
# remove #
# remove stopwords or lemmatize

def RemoveHTTP(tweet):
  """Remove http:// and https:// links from a tweet, keeping the text around them.

  The previous implementation returned only the text found before the first
  "https", so everything after a link was lost, plain "http://" links were
  never removed, and a link placed after a line break was not detected.

  Args:
    tweet: the tweet; it is converted with str() before matching.

  Returns:
    The text with every URL removed. When no URL is present the original
    object is returned untouched (not converted to str), as before.
  """
  text = str(tweet)
  # A URL is the scheme followed by everything up to the next whitespace.
  stripped = re.sub(r'https?://\S+', '', text)
  if stripped == text:
    return tweet
  return stripped

def removePunc(inputstr):
  """Return inputstr with every character listed in `punctuations` deleted.

  Letter case is left as it is; only the listed characters are dropped.
  """
  # Deletion table: no character mappings, only characters to remove.
  dropTable = str.maketrans('', '', punctuations)
  return inputstr.translate(dropTable)

def Clean(tweet):
  """Strip @mentions, '#' signs, the retweet marker and punctuation from a tweet.

  The retweet marker is now removed only when "RT" stands alone as a word.
  Previously every occurrence of the two letters was deleted, which also
  damaged upper-case words that contain them (e.g. "HEART" became "HEA").

  Args:
    tweet: the tweet; it is converted with str() before processing.

  Returns:
    The cleaned text, after a final pass through removePunc.
  """
  clean_tweet = re.sub(r'@\S+', '', str(tweet))      # drop @mentions
  clean_tweet = re.sub(r'#', '', clean_tweet)        # keep hashtag words, drop the '#'
  clean_tweet = re.sub(r'\bRT\b', '', clean_tweet)   # standalone retweet marker only
  return removePunc(clean_tweet)

def removeStopWordsandLemmatize(inputstr):
  """Tokenize the text, drop stop words, lemmatize the rest and re-join with spaces.

  NOTE(review): the stop-word test compares tokens exactly as they appear, so a
  capitalised token is presumably kept even when its lower-case form is a stop
  word - confirm whether that is intended.
  """
  lemmas = []
  for word in word_tokenize(inputstr):
    if word in stop_words:
      continue
    lemmas.append(lemmatizer.lemmatize(word))
  return ' '.join(lemmas)

In [0]:
def CompleteCleaning(tweet):
  """Run the full cleaning pipeline on one tweet.

  Order of the steps: RemoveHTTP, then Clean, then removeStopWordsandLemmatize.
  """
  return removeStopWordsandLemmatize(Clean(RemoveHTTP(tweet)))

In [0]:
import tensorflow as tf
import keras 
from keras.models import Sequential,Model
from keras.layers import Dense,Dropout,Flatten,Input
from keras.layers import BatchNormalization,Activation
from keras.optimizers import Adam
from keras.losses import mean_squared_error

In [0]:
# A pre-trained autoencoder reduces an input vector of size 1536 to 768 dimensions.
# Only the encoder part of that autoencoder is loaded here; the functions below
# use it to compress two concatenated embeddings back into a single vector.
from keras.models import load_model
# NOTE(review): the model is loaded through tf.keras, so the load_model name
# imported on the line above is not used in this cell.
encoder_1536to768 = tf.keras.models.load_model('/content/drive/My Drive/Colab Notebooks/tweets/encoder_1536to768.h5')

In [0]:
# Use the BioBERT model to generate the text embeddings.
!pip install biobert-embedding==0.1.2
from biobert_embedding.embedding import BiobertEmbedding
# Single shared embedder instance; the functions below call its
# sentence_vector() method on cleaned text.
biobert_model = BiobertEmbedding()

In [0]:
def generateSnippetEmbeding(tweetList):
  """Embed a collection of tweets and fold them into one vector (user-doc method).

  Every tweet is cleaned with CompleteCleaning and embedded with
  biobert_model.sentence_vector. The first embedding seeds a running vector;
  each following embedding is concatenated to the running vector and the pair
  is passed through encoder_1536to768, whose output becomes the new running
  vector.

  Args:
    tweetList: iterable of tweet texts. A bare string is treated as one tweet
      instead of being iterated character by character.

  Returns:
    numpy array with the final running vector (assumed to be 768-dimensional,
    going by the encoder's name - TODO confirm).

  Raises:
    IndexError: if tweetList contains no tweets (same exception type as before,
      now with an explicit message).
  """
  if isinstance(tweetList, str):
    tweetList = [tweetList]
  tweetEmbed = [biobert_model.sentence_vector(CompleteCleaning(tweet)) for tweet in tweetList]
  if len(tweetEmbed) == 0:
    raise IndexError("generateSnippetEmbeding needs at least one tweet")
  mainvec = tweetEmbed[0]
  for nextEmb in tweetEmbed[1:]:
    pair = np.concatenate((mainvec, nextEmb), axis=None)
    # The encoder expects a 2-D batch; feed one row instead of stacking the
    # same vector twice and discarding the second prediction.
    mainvec = encoder_1536to768.predict(pair.reshape(1, -1))[0]
  return np.asarray(mainvec)

In [0]:
def Generate_1536Embedding(rowd):
  """Build the combined user vector from tweet data and author information.

  The first part comes from generateSnippetEmbeding(rowd['Snippet']); the second
  part is the BioBERT sentence vector of the cleaned rowd['TRANS_AUTHOR_BIO'].
  Both parts are flattened and joined in that order (two 768-dimensional halves
  giving 1536 values, per the notebook's description).
  """
  tweetHalf = generateSnippetEmbeding(rowd['Snippet'])
  bioText = CompleteCleaning(rowd['TRANS_AUTHOR_BIO'])
  authorHalf = np.asarray(biobert_model.sentence_vector(bioText))
  return np.asarray(np.concatenate((tweetHalf, authorHalf), axis=None))

In [0]:
def ChangeAuthorEmbOnly(rowd,embed):
  """Replace only the author-information part of an existing combined vector.

  The first 768 values of `embed` (the tweet part) are kept; everything after
  them is discarded and replaced by the BioBERT sentence vector of the cleaned
  rowd['NewIntro'].
  """
  keptTweetHalf = embed[:768]
  newBio = CompleteCleaning(rowd['NewIntro'])
  newAuthorHalf = np.asarray(biobert_model.sentence_vector(newBio))
  return np.asarray(np.concatenate((keptTweetHalf, newAuthorHalf), axis=None))


In [0]:
def UpdateTweet_UserDoc_BioBert(row,embed):
  """Fold newly posted tweets into the tweet part of an existing combined vector.

  The first 768 values of `embed` are taken as the current tweet part and the
  remainder as the author part. The new tweets in row['newtweets'] are embedded
  with generateSnippetEmbeding, concatenated to the current tweet part and
  compressed by encoder_1536to768. The author part is appended unchanged.

  Args:
    row: mapping-like object providing row['newtweets'].
    embed: existing combined vector (tweet part first, author part after it).

  Returns:
    numpy array: updated tweet part followed by the original author part.
  """
  tweetEmb, authoremb = embed[:768], embed[768:]
  embedding_newtweet = generateSnippetEmbeding(row['newtweets'])
  merged = np.concatenate((tweetEmb, embedding_newtweet), axis=None)
  # The encoder expects a 2-D batch; feed one row instead of stacking the same
  # vector twice and discarding the second prediction.
  snipetEmb = np.asarray(encoder_1536to768.predict(merged.reshape(1, -1))[0])
  return np.asarray(np.concatenate((snipetEmb, authoremb), axis=None))


In [0]:
# Load the source spreadsheet from the mounted Drive folder.
df = pd.read_excel(io='/content/drive/My Drive/Colab Notebooks/20200507_Cardiology_HCP_Data.xlsx')

In [0]:
# Columns needed for the embeddings; the first one is the grouping key.
Embedding_Column_Name = ["AUTHOR_ID", "TRANS_AUTHOR_BIO","Account Type","Snippet"]
Main_DF = df[Embedding_Column_Name]
# One row per author, each remaining column collected into a list.
# The columns are selected with a list: selecting several GroupBy columns with a
# bare comma-separated tuple is deprecated and rejected by newer pandas versions.
Main_DF = Main_DF.groupby('AUTHOR_ID',as_index= False)[Embedding_Column_Name[1:]].agg(lambda x: list(x))

In [0]:
# Remove duplicate data
# Assign the de-duplicated lists back to the column: writing to the row objects
# yielded by iterrows() is not guaranteed to reach the DataFrame.
# dict.fromkeys keeps the first occurrence of each value in its original order,
# so the result is reproducible (set() ordering is arbitrary, and tweet order
# affects the sequential merge of embeddings).
for columName in Embedding_Column_Name[1:]:
    Main_DF[columName] = Main_DF[columName].apply(lambda values: list(dict.fromkeys(values)))

In [0]:
# Add empty placeholder columns for the updated author info and the new tweets.
for placeholderCol in ('NewIntro', 'newtweets'):
    Main_DF[placeholderCol] = ""

In [0]:
rowd = Main_DF.loc[Main_DF['AUTHOR_ID'] == 'BoniBlondie']   # Testing a user 

In [0]:
originalEmb = Generate_1536Embedding(rowd)

In [0]:
rowd['NewIntro'] = 'pilot firefighter'   # updating with a new author Info to test user

In [0]:
authorEmbNew = ChangeAuthorEmbOnly(rowd,originalEmb)  # Changing only the author info part of the vector.

In [0]:
# Updating the tweets for the user, treating this as new tweets posted by the user.
rowd['newtweets'][373] = ['RT @skathire As a cardiology consultant in the hospital, hereâ€™s my nomination for the heroes of the pandemic: 1. Nurses in the ER 2. Nurses in the ICU 3. Nurses on the floor 4. Nurses in the dialysis clinic 5. Nurses in the...']

In [0]:
# Updating the tweet vector of the embedding 
TweetEmbNew = UpdateTweet_UserDoc_BioBert(rowd,originalEmb)

In [0]:
# Generate the evaluation matrix; it can be downloaded to a local machine and
# evaluated there with the Jupyter notebook code.
# Rows, in order: original embedding, author-info update, tweet update.
# The rows are stacked directly, so no placeholder zero row has to be removed in
# a separate cell (re-running that cell would have dropped a real row).
# float64 matches the dtype the zero-row version produced.
Mat = np.vstack((originalEmb, authorEmbNew, TweetEmbNew)).astype(np.float64)

In [0]:
# Write the matrix to a text file on Drive, eight decimal places per value.
fileName = '/content/drive/My Drive/Colab Notebooks/BioBERT_UserDoc_update_testuser2_Matrix.txt'
np.savetxt(fname=fileName, X=Mat, fmt='%.8f')