In [0]:
# The purpose of the notebook is to generate embedding using different pretrained bert model and evaluate them locally on the colab environment

In [0]:
import nltk
#nltk.download('stopwords')
nltk.download('popular')

In [0]:
import pandas as pd
import numpy as np
import re
from gensim.models.doc2vec import Doc2Vec,TaggedDocument
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models.doc2vec import Doc2Vec
import random
import string
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import spatial
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
lemmatizer = WordNetLemmatizer()

In [0]:
def convert2lower(inputstr):
    return inputstr.lower()

def removeNum(inputstr):
    return re.sub(r'\d+','',inputstr)

def removePunc(inputstr):
    for x in inputstr.lower(): 
        if x in punctuations: 
            inputstr = inputstr.replace(x, "")
    return inputstr

def removeWhiteSpace(inputstr):
    return inputstr.strip()

def removeStopWordsandLemmatize(inputstr):
    token = word_tokenize(inputstr)
    result = [i for i in token if not i in stop_words]
    result = [lemmatizer.lemmatize(i) for i in result]
    return ' '.join(result)

def CompletePreprocessingofWord(tweet):
    if(tweet == ''):
      return tweet
    clean_tweet = convert2lower(tweet)
    clean_tweet = removeNum(clean_tweet)
    clean_tweet = removePunc(clean_tweet)
    clean_tweet = removeStopWordsandLemmatize(clean_tweet)
    clean_tweet = removeWhiteSpace(clean_tweet)
    return clean_tweet


In [0]:
    def getuniquehashtags(tweetlist):
        allhashtags = []
        for tweet in tweetlist:
            allhashtags.extend(re.findall(r"#(\w+)",str(tweet)))
        return list(set(allhashtags))
    
    def getUniqueMentionedUsers(tweetlist):
        allmentionedUser = []
        for tweet in tweetlist:
            allmentionedUser.extend(re.findall(r"@(\w+)",str(tweet)))
        return list(set(allmentionedUser))
    
    def getAllhashtags(tweetlist):
        allhashtags = []
        for tweet in tweetlist:
            allhashtags.extend(re.findall(r"#(\w+)",str(tweet)))
        return allhashtags
    
    def getAllMentionedUsers(tweetlist):
        allmentionedUser = []
        for tweet in tweetlist:
            allmentionedUser.extend(re.findall(r"@(\w+)",str(tweet)))
        return allmentionedUser
    
    def RemoveTags(tweet):
      tweet = re.sub('@','at ',str(tweet))    #[^\s]+
      tweet = re.sub('#','',str(tweet))     #[^\s]+
      tweet = re.sub('RT','',str(tweet))
      return tweet  
    
    def RemoveMentionedUserAndHashTags(tweet):
      tweet = re.sub('@[^\s]+','',str(tweet))    #[^\s]+
      tweet = re.sub('#[^\s]+','',str(tweet))     #[^\s]+
      tweet = re.sub('RT','',str(tweet))
      return tweet 

    def RemoveHTTP(tweet):
      clean_tweet = re.match('(.*?)http.*?\s?(.*?)',str(tweet))
      if(clean_tweet):
        return clean_tweet.group(1)
      else:
        return tweet

    def RemoveHttps(tweet):
        clean_tweet = re.match('(.*?)https.*?\s?(.*?)',str(tweet))
        if(clean_tweet):
          return clean_tweet.group(1)
        else:
          return tweet

    def RemoveWWW(tweetlist):
        l = []
        for tweet in tweetlist:    
            clean_tweet = re.match('(.*?)www.*?\s?(.*?)',str(tweet))
            if(clean_tweet):
                l.append(clean_tweet.group(1))
            else:
                l.append(tweet)
            #l.append(re.sub(r"http:\+","",str(tweet)))
        return l
    
    def DeEmojify(tweet):
        #print(tweet)
        if(tweet == ''):
          return tweet
        emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
        
        tweet.encode('ascii', 'ignore').decode('ascii')
        return tweet

In [0]:
Main_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/MainDF.csv')

In [0]:
Main_df = Main_df[["AUTHOR_ID","TRANS_AUTHOR_BIO","Account Type","Snippet"]]

In [0]:
Main_df.head()

In [0]:
#########Cleaning the text#####################
for _,x in Main_df.iterrows():
  x["TRANS_AUTHOR_BIO"] = RemoveTags(x["TRANS_AUTHOR_BIO"])
  x["TRANS_AUTHOR_BIO"] = DeEmojify(x["TRANS_AUTHOR_BIO"])
  x["TRANS_AUTHOR_BIO"] = CompletePreprocessingofWord(x["TRANS_AUTHOR_BIO"])    #removeAbb   must remove in case you want to drag org separately 

In [0]:
######### Cleaning the tweets##################
for _,x in Main_df.iterrows():
  x['Snippet'] = RemoveTags(x['Snippet'])
  x['Snippet'] = RemoveHTTP(x['Snippet'])
  x['Snippet'] = RemoveHttps(x['Snippet'])
  x['Snippet'] = DeEmojify(x['Snippet'])
  x['Snippet'] = CompletePreprocessingofWord(x['Snippet'])


In [0]:
#############Using Sentence BioBERT model to embed user individual column embedding #################

!pip install biobert-embedding==0.1.2

In [0]:
from biobert_embedding.embedding import BiobertEmbedding
biobert_model = BiobertEmbedding()

In [0]:
Main_df['SnippetEmbedding'] = ""
Main_df['Autho_Info_Embedding'] = ""

In [0]:
Main_df = Main_df.head(4280)

In [0]:
count = 0
for _,x in Main_df.iterrows():
  x['SnippetEmbedding'] = np.asarray(biobert_model.sentence_vector(x['Snippet']))
  x['Autho_Info_Embedding'] = np.asarray(biobert_model.sentence_vector(x['TRANS_AUTHOR_BIO']))
  count = count + 1
  print("epoch --" + str(count) + " / 4280")

In [0]:
Main_df.to_csv('/content/drive/My Drive/Colab Notebooks/BioBertEmbedd_DF.csv', header=True, index=False) 

In [0]:
################Build AutoEncoder to reduce dimension#######################
####AutoEncoder1 to reduce 1536 dimension to 300 dimensions -- since we have to concatenate 2 embeddings together #######

import tensorflow as tf
import keras 
from keras.models import Sequential,Model
from keras.layers import Dense,Dropout,Flatten,Input
from keras.layers import BatchNormalization,Activation
from keras.optimizers import Adam
from keras.losses import mean_squared_error


In [0]:
train_data_df = Main_df.tail(3000)
type(train_data_df['SnippetEmbedding'][1280])

In [0]:
#---- Creating Training Data-----
train_data_df = Main_df.tail(3000)
TotalMatrix = []
TotalMatrix = np.concatenate((TotalMatrix,train_data_df['SnippetEmbedding'][1280]), axis=None)
TotalMatrix = np.concatenate((TotalMatrix,train_data_df['Autho_Info_Embedding'][1280]), axis=None)
#-- test


In [0]:
TotalMatrix.shape

(3001, 1536)

In [0]:
for _,x in train_data_df.iterrows():
  vec = []
  vec = np.concatenate((vec,x['SnippetEmbedding']), axis=None)
  vec = np.concatenate((vec,x['Autho_Info_Embedding']), axis=None)
  TotalMatrix = np.vstack((TotalMatrix,vec))

In [0]:
np.random.shuffle(TotalMatrix)
training_data, test_data = TotalMatrix[:2500,:],TotalMatrix[2500:,:]

In [0]:
encoding_dims = 300
input_dim = Input(shape = (1536,))
encoded = Dense(encoding_dims, activation='relu')(input_dim)
decoded = Dense(1536, activation='tanh')(encoded)
autoencoder = Model(input_dim,decoded)

encoder = Model(input_dim,encoded)

encoded_input = Input(shape=(encoding_dims,))
# retrieve the last layer of the autoencoder model
decoder_layer = autoencoder.layers[-1]
# create the decoder model
decoder = Model(encoded_input, decoder_layer(encoded_input))

mse = tf.keras.losses.MeanSquaredError()
autoencoder.compile(optimizer='adadelta', loss=mse)

In [0]:
autoencoder.fit(training_data, training_data,
                epochs=1000,
                batch_size=250,
                shuffle=True,
                validation_data=(test_data, test_data))

In [0]:
from google.colab import files
decoder.save('/content/drive/My Drive/Colab Notebooks/BERT_1536to300_decoder.model')
autoencoder.save('/content/drive/My Drive/Colab Notebooks/BERT_1536to300_autoencoder.h5')
encoder.save('/content/drive/My Drive/Colab Notebooks/BERT_1536to300_encoder.h5')
encoder.save('/content/drive/My Drive/Colab Notebooks/BERT_1536to300_encoder.model')

In [0]:
from keras.models import load_model
encoder = load_model('/content/drive/My Drive/Colab Notebooks/BERT_1536to300_encoder.h5')

In [0]:
Main_df['Combined_Embedding'] = ""

In [0]:
count = 0
for _,x in Main_df.iterrows():
  vec = []
  vec = np.concatenate((vec,x['SnippetEmbedding']), axis=None)
  vec = np.concatenate((vec,x['Autho_Info_Embedding']), axis=None)
  mat = []
  mat = np.vstack((vec,vec))
  emb = encoder.predict(mat)
  x['Combined_Embedding'] = emb[0]
  count = count + 1
  print("epoch --" + str(count) + " / 4280")

In [0]:
train_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/TrainData_1.csv')

In [0]:
train_df = train_df.dropna()

In [0]:
for _,x in train_df.iterrows():
  rowd = Main_df.loc[Main_df['AUTHOR_ID'] == x['AUTHOR_ID']]
  ind = rowd.index.values.astype(int)[0]
  emb = Main_df.iloc[ind]['Combined_Embedding']

In [0]:
def TransferEmbedding(authorid):
  emb = np.zeros(300)
  rowd = Main_df.loc[Main_df['AUTHOR_ID'] == authorid]
  if(rowd.shape[0] > 0):
    ind = rowd.index.values.astype(int)[0]
    emb = Main_df.iloc[ind]['Combined_Embedding']
  return emb

In [0]:
train_df['Embedding'] = [TransferEmbedding(x['AUTHOR_ID']) for _,x in train_df.iterrows()]

In [0]:
Main_df.head()

In [0]:
###################Testing the Classification ######################
category1 = []
category2 = []
category3 = []
category4 = []
category5 = []
category6 = []
category7 = []
category8 = []

for _,trow in train_df.iterrows():
  if(trow['Label'] == 1.0):
    category1.append(trow['Embedding'])
  if(trow['Label'] == 2.0):
    category2.append(trow['Embedding'])
  if(trow['Label'] == 3.0):
    category3.append(trow['Embedding'])
  if(trow['Label'] == 4.0):
    category4.append(trow['Embedding'])
  if(trow['Label'] == 5.0):
    category5.append(trow['Embedding'])
  if(trow['Label'] == 6.0):
    category6.append(trow['Embedding'])
  if(trow['Label'] == 7.0):
    category7.append(trow['Embedding'])
  if(trow['Label'] == 8.0):
    category8.append(trow['Embedding'])


In [0]:
type(category1[0])

In [0]:
from scipy import spatial
def measureSimilarityMeanofUser(userEmb,category):
    sumsc = 0
    for user in category:
        res = 1 - spatial.distance.cosine(user,userEmb)
        sumsc = sumsc + res
    
    mean = sumsc / len(category)
    return mean
    
def getlabelforUser(userEmb):
    listScoreUser = []
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category1))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category2))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category3))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category4))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category5))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category6))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category7))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category8))
    
    m =  max(listScoreUser)
    
    return (listScoreUser.index(m) + 1)

In [0]:
labelList = []
TestDataList = []

def customSplit(userEmList,label):
  n = len(userEmList) - 10
  for i in range(n):
    labelList.append(label)
  TestDataList.extend(userEmList[10:])
  return userEmList[:10]


In [0]:
category1 = customSplit(category1,1)
category2 = customSplit(category2,2)
category3 = customSplit(category3,3)
category4 = customSplit(category4,4)
category5 = customSplit(category5,5)
category6 = customSplit(category6,6)
category7 = customSplit(category7,7)
category8 = customSplit(category8,8)

In [0]:
len(labelList)

In [0]:
len(TestDataList)

In [0]:
########Evaluations############

In [0]:
correct_prediction = 0
for i in range(len(TestDataList)):
  correct_label = labelList[i]
  prediction = getlabelforUser(TestDataList[i])
  if(correct_label == prediction):
    correct_prediction = correct_prediction + 1

correct_prediction

In [0]:
################ Reducing Bin to Nurse/ Doctors / organisation / Others #################

In [0]:
category1 = []
category2 = []
category3 = []
category4 = []

for _,trow in train_df.iterrows():
  if(trow['Label'] == 1.0):
    category1.append(trow['Embedding'])
  if(trow['Label'] == 2.0):
    category2.append(trow['Embedding'])
  if(trow['Label'] == 3.0):
    category2.append(trow['Embedding'])
  if(trow['Label'] == 4.0):
    category2.append(trow['Embedding'])
  if(trow['Label'] == 5.0):
    category4.append(trow['Embedding'])
  if(trow['Label'] == 6.0):
    category2.append(trow['Embedding'])
  if(trow['Label'] == 7.0):
    category3.append(trow['Embedding'])
  if(trow['Label'] == 8.0):
    category2.append(trow['Embedding'])

In [0]:
def getlabelforUser(userEmb):
    listScoreUser = []
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category1))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category2))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category3))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category4))
    
    m =  max(listScoreUser)
    
    return (listScoreUser.index(m) + 1)

In [0]:
labelList = []
TestDataList = []

def customSplit(userEmList,label):
  n = len(userEmList) - 10
  for i in range(n):
    labelList.append(label)
  TestDataList.extend(userEmList[10:])
  return userEmList[:10]

In [0]:
category1 = customSplit(category1,1)
category2 = customSplit(category2,2)
category3 = customSplit(category3,3)
category4 = customSplit(category4,4)

In [0]:
len(labelList)

In [0]:
correct_prediction = 0
for i in range(len(TestDataList)):
  correct_label = labelList[i]
  prediction = getlabelforUser(TestDataList[i])
  if(correct_label == prediction):
    correct_prediction = correct_prediction + 1

correct_prediction

In [0]:
###########Save the matrix #########################
MAT = []
MAT = Main_df['Combined_Embedding'][0]
for _,x in Main_df.iterrows():
  MAT = np.vstack((MAT,x['Combined_Embedding']))

In [0]:
fileName = '/content/drive/My Drive/Colab Notebooks/BioBERT_Combined_AutoEnc_Emb_Matrix.txt'
np.savetxt(fileName,MAT,fmt='%.8f')

In [0]:
#######################Testing for Full Embedding #####################

In [0]:
def MultiColumnTransferEmbedding(authorid):
  vec = []
  rowd = Main_df.loc[Main_df['AUTHOR_ID'] == authorid]
  if(rowd.shape[0] > 0):
    ind = rowd.index.values.astype(int)[0]
    emb1 = Main_df.iloc[ind]['SnippetEmbedding']
    emb2 = Main_df.iloc[ind]['Autho_Info_Embedding']
    vec = np.concatenate((vec,emb1),axis=None)
    vec = np.concatenate((vec,emb2),axis=None)
  if(len(vec) == 0):
    return np.zeros(1536)
  else:
    return vec

In [0]:
train_df['Embedding'] = [MultiColumnTransferEmbedding(x['AUTHOR_ID']) for _,x in train_df.iterrows()]

In [0]:
train_df['Embedding'][0].shape

In [0]:
####################################Bio Clinical-BERT Evaluation #################################

In [0]:
!pip install transformers

In [0]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

In [0]:
import torch

In [0]:
def GenerateEmbClinicalBERT(text):
  input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)
  output = model(input_ids)
  d1 = output[0].mean(1).detach().numpy()
  return d1

In [0]:
Clinical_DF = Main_df.copy()

In [0]:
#Clinical_DF['SnippetEmbedding'] = [GenerateEmbClinicalBERT(x['Snippet']) for _,x in Clinical_DF.iterrows()]
count = 1
for _,x in Clinical_DF.iterrows():
  x['SnippetEmbedding'] = GenerateEmbClinicalBERT(x['Snippet'])
  x['Autho_Info_Embedding'] = GenerateEmbClinicalBERT(x['TRANS_AUTHOR_BIO'])
  print("epoch  " + str(count) + " / 6000")
  count = count + 1

In [0]:
Clinical_DF = Clinical_DF.head(4280)

In [0]:
Clinical_DF['Combined_Embedding'] = ""          #Adding a new column to reduce the dimension of the two combined embedding

In [0]:
Clinical_DF.head()

In [0]:
from keras.models import load_model
encoder = load_model('/content/drive/My Drive/Colab Notebooks/BERT_1536to300_encoder.h5')

In [0]:
def ReduceLargerEmbedding_toCombined(DataFrm):
  count = 0
  for _,x in DataFrm.iterrows():
    vec = []
    vec = np.concatenate((vec,x['SnippetEmbedding']), axis=None)
    vec = np.concatenate((vec,x['Autho_Info_Embedding']), axis=None)
    mat = []
    mat = np.vstack((vec,vec))
    emb = encoder.predict(mat)
    x['Combined_Embedding'] = emb[0]
    count = count + 1
    print("epoch --" + str(count) + " / 4280")
  return DataFrm

In [0]:
Clinical_DF = ReduceLargerEmbedding_toCombined(Clinical_DF)

In [0]:
Clinical_DF.head()

In [0]:
#####----- Evaluation of Embedding ------#########

In [0]:
train_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/TrainData_1.csv')
train_df = train_df.dropna()

In [0]:
def TransferEmbedding(authorid,DataFrm):
  emb = np.zeros(300)
  rowd = DataFrm.loc[DataFrm['AUTHOR_ID'] == authorid]
  if(rowd.shape[0] > 0):
    ind = rowd.index.values.astype(int)[0]
    emb = DataFrm.iloc[ind]['Combined_Embedding']
  else:
    print('zero')
  return emb

In [0]:
train_df['Embedding'] = [TransferEmbedding(x['AUTHOR_ID'],Clinical_DF) for _,x in train_df.iterrows()]

In [0]:
###################Testing the Classification ######################
category1 = []
category2 = []
category3 = []
category4 = []
category5 = []
category6 = []
category7 = []
category8 = []

for _,trow in train_df.iterrows():
  if(trow['Label'] == 1.0):
    category1.append(trow['Embedding'])
  if(trow['Label'] == 2.0):
    category2.append(trow['Embedding'])
  if(trow['Label'] == 3.0):
    category3.append(trow['Embedding'])
  if(trow['Label'] == 4.0):
    category4.append(trow['Embedding'])
  if(trow['Label'] == 5.0):
    category5.append(trow['Embedding'])
  if(trow['Label'] == 6.0):
    category6.append(trow['Embedding'])
  if(trow['Label'] == 7.0):
    category7.append(trow['Embedding'])
  if(trow['Label'] == 8.0):
    category8.append(trow['Embedding'])


In [0]:
from scipy import spatial
def measureSimilarityMeanofUser(userEmb,category):
    sumsc = 0
    for user in category:
        res = 1 - spatial.distance.cosine(user,userEmb)
        sumsc = sumsc + res
    
    mean = sumsc / len(category)
    return mean
    
def getlabelforUser_8(userEmb):
    listScoreUser = []
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category1))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category2))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category3))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category4))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category5))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category6))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category7))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category8))
    
    m =  max(listScoreUser)
    
    return (listScoreUser.index(m) + 1)

In [0]:
labelList = []
TestDataList = []

def customSplit(userEmList,label):
  n = len(userEmList) - 10
  for i in range(n):
    labelList.append(label)
  TestDataList.extend(userEmList[10:])
  return userEmList[:10]


In [0]:
category1 = customSplit(category1,1)
category2 = customSplit(category2,2)
category3 = customSplit(category3,3)
category4 = customSplit(category4,4)
category5 = customSplit(category5,5)
category6 = customSplit(category6,6)
category7 = customSplit(category7,7)
category8 = customSplit(category8,8)

In [0]:
len(TestDataList)

219

In [0]:
correct_prediction_combined_8 = 0
for i in range(len(TestDataList)):
  correct_label = labelList[i]
  prediction = getlabelforUser_8(TestDataList[i])
  if(correct_label == prediction):
    correct_prediction_combined_8 = correct_prediction_combined_8 + 1

correct_prediction_combined_8

55

In [0]:
################ Reducing Bin to 4 #################

In [0]:
category1 = []
category2 = []
category3 = []
category4 = []

for _,trow in train_df.iterrows():
  if(trow['Label'] == 1.0):
    category1.append(trow['Embedding'])
  if(trow['Label'] == 2.0):
    category2.append(trow['Embedding'])
  if(trow['Label'] == 3.0):
    category2.append(trow['Embedding'])
  if(trow['Label'] == 4.0):
    category2.append(trow['Embedding'])
  if(trow['Label'] == 5.0):
    category4.append(trow['Embedding'])
  if(trow['Label'] == 6.0):
    category2.append(trow['Embedding'])
  if(trow['Label'] == 7.0):
    category3.append(trow['Embedding'])
  if(trow['Label'] == 8.0):
    category2.append(trow['Embedding'])

In [0]:
def getlabelforUser_4(userEmb):
    listScoreUser = []
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category1))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category2))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category3))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category4))
    
    m =  max(listScoreUser)
    
    return (listScoreUser.index(m) + 1)

In [0]:
labelList = []
TestDataList = []

def customSplit(userEmList,label):
  n = len(userEmList) - 10
  for i in range(n):
    labelList.append(label)
  TestDataList.extend(userEmList[10:])
  return userEmList[:10]

In [0]:
category1 = customSplit(category1,1)
category2 = customSplit(category2,2)
category3 = customSplit(category3,3)
category4 = customSplit(category4,4)

In [0]:
correct_prediction_combined_4 = 0
for i in range(len(TestDataList)):
  correct_label = labelList[i]
  prediction = getlabelforUser_4(TestDataList[i])
  if(correct_label == prediction):
    correct_prediction_combined_4 = correct_prediction_combined_4 + 1

correct_prediction_combined_4

68

In [0]:
len(TestDataList)

In [0]:
def MultiColumnTransferEmbedding(authorid,DataFrm):
  vec = []
  rowd = DataFrm.loc[DataFrm['AUTHOR_ID'] == authorid]
  if(rowd.shape[0] > 0):
    ind = rowd.index.values.astype(int)[0]
    emb1 = DataFrm.iloc[ind]['SnippetEmbedding']
    emb2 = DataFrm.iloc[ind]['Autho_Info_Embedding']
    vec = np.concatenate((vec,emb1),axis=None)
    vec = np.concatenate((vec,emb2),axis=None)
  if(len(vec) == 0):
    return np.zeros(1536)
  else:
    return vec

In [0]:
train_df['Embedding'] = [MultiColumnTransferEmbedding(x['AUTHOR_ID'],SciBERT_DF) for _,x in train_df.iterrows()]

In [0]:
train_df['Embedding'][0].shape

In [0]:
###################Testing the Classification ######################
category1 = []
category2 = []
category3 = []
category4 = []
category5 = []
category6 = []
category7 = []
category8 = []

for _,trow in train_df.iterrows():
  if(trow['Label'] == 1.0):
    category1.append(trow['Embedding'])
  if(trow['Label'] == 2.0):
    category2.append(trow['Embedding'])
  if(trow['Label'] == 3.0):
    category3.append(trow['Embedding'])
  if(trow['Label'] == 4.0):
    category4.append(trow['Embedding'])
  if(trow['Label'] == 5.0):
    category5.append(trow['Embedding'])
  if(trow['Label'] == 6.0):
    category6.append(trow['Embedding'])
  if(trow['Label'] == 7.0):
    category7.append(trow['Embedding'])
  if(trow['Label'] == 8.0):
    category8.append(trow['Embedding'])


In [0]:
labelList = []
TestDataList = []

def customSplit(userEmList,label):
  n = len(userEmList) - 10
  for i in range(n):
    labelList.append(label)
  TestDataList.extend(userEmList[10:])
  return userEmList[:10]


In [0]:
category1 = customSplit(category1,1)
category2 = customSplit(category2,2)
category3 = customSplit(category3,3)
category4 = customSplit(category4,4)
category5 = customSplit(category5,5)
category6 = customSplit(category6,6)
category7 = customSplit(category7,7)
category8 = customSplit(category8,8)

In [0]:
correct_prediction_All_8 = 0
for i in range(len(TestDataList)):
  correct_label = labelList[i]
  prediction = getlabelforUser(TestDataList[i])
  if(correct_label == prediction):
    correct_prediction_All_8 = correct_prediction_All_8 + 1

correct_prediction_All_8

57

In [0]:
category1 = []
category2 = []
category3 = []
category4 = []

for _,trow in train_df.iterrows():
  if(trow['Label'] == 1.0):
    category1.append(trow['Embedding'])
  if(trow['Label'] == 2.0):
    category2.append(trow['Embedding'])
  if(trow['Label'] == 3.0):
    category2.append(trow['Embedding'])
  if(trow['Label'] == 4.0):
    category2.append(trow['Embedding'])
  if(trow['Label'] == 5.0):
    category4.append(trow['Embedding'])
  if(trow['Label'] == 6.0):
    category2.append(trow['Embedding'])
  if(trow['Label'] == 7.0):
    category3.append(trow['Embedding'])
  if(trow['Label'] == 8.0):
    category2.append(trow['Embedding'])

In [0]:
labelList = []
TestDataList = []

def customSplit(userEmList,label):
  n = len(userEmList) - 10
  for i in range(n):
    labelList.append(label)
  TestDataList.extend(userEmList[10:])
  return userEmList[:10]

In [0]:
category1 = customSplit(category1,1)
category2 = customSplit(category2,2)
category3 = customSplit(category3,3)
category4 = customSplit(category4,4)

In [0]:
correct_prediction_All_4 = 0
for i in range(len(TestDataList)):
  correct_label = labelList[i]
  prediction = getlabelforUser(TestDataList[i])
  if(correct_label == prediction):
    correct_prediction_All_4 = correct_prediction_All_4 + 1

correct_prediction_All_4

68

In [0]:
#################Saving the Matrix for further evaluations later ##############

In [0]:
SciBERT_DF.head()

In [0]:
#Change the column one by one and change it
MAT = []
MAT = Clinical_DF['Autho_Info_Embedding'][0]
for _,x in Clinical_DF.iterrows():
  MAT = np.vstack((MAT,x['Autho_Info_Embedding']))

In [0]:
MAT.shape

In [0]:
fileName = '/content/drive/My Drive/Colab Notebooks/ClinalBERT_Autho_Info_Embedding_Matrix.txt'
np.savetxt(fileName,MAT,fmt='%.8f')

In [0]:
######################################## SCI BERT ########################################################

In [0]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")

In [0]:
def GenerateEmbSCiBERT(text):
  input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)
  output = model(input_ids)
  d1 = output[0].mean(1).detach().numpy()
  return d1

In [0]:
SciBERT_DF = Main_df.copy()

In [0]:
#Clinical_DF['SnippetEmbedding'] = [GenerateEmbClinicalBERT(x['Snippet']) for _,x in Clinical_DF.iterrows()]
count = 1
for _,x in SciBERT_DF.iterrows():
  x['SnippetEmbedding'] = GenerateEmbSCiBERT(x['Snippet'])
  x['Autho_Info_Embedding'] = GenerateEmbSCiBERT(x['TRANS_AUTHOR_BIO'])
  print("epoch  " + str(count) + " / 6000")
  count = count + 1

In [0]:
SciBERT_DF = SciBERT_DF.head(4280)

In [0]:
SciBERT_DF['Combined_Embedding'] = ""          #Adding a new column to reduce the dimension of the two combined embedding

In [0]:
SciBERT_DF.head()

In [0]:
from keras.models import load_model
encoder = load_model('/content/drive/My Drive/Colab Notebooks/BERT_1536to300_encoder.h5')

In [0]:
def ReduceLargerEmbedding_toCombined(DataFrm):
  count = 0
  for _,x in DataFrm.iterrows():
    vec = []
    vec = np.concatenate((vec,x['SnippetEmbedding']), axis=None)
    vec = np.concatenate((vec,x['Autho_Info_Embedding']), axis=None)
    mat = []
    mat = np.vstack((vec,vec))
    emb = encoder.predict(mat)
    x['Combined_Embedding'] = emb[0]
    count = count + 1
    print("epoch --" + str(count) + " / 4280")
  return DataFrm

In [0]:
SciBERT_DF = ReduceLargerEmbedding_toCombined(SciBERT_DF)

In [0]:
SciBERT_DF.head()

In [0]:
train_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/TrainData_1.csv')
train_df = train_df.dropna()

In [0]:
def TransferEmbedding(authorid,DataFrm):
  emb = np.zeros(300)
  rowd = DataFrm.loc[DataFrm['AUTHOR_ID'] == authorid]
  if(rowd.shape[0] > 0):
    ind = rowd.index.values.astype(int)[0]
    emb = DataFrm.iloc[ind]['Combined_Embedding']
  else:
    print('zero')
  return emb

In [0]:
train_df['Embedding'] = [TransferEmbedding(x['AUTHOR_ID'],SciBERT_DF) for _,x in train_df.iterrows()]

In [0]:
#########Rest all the process for evaluation run as same as above ##### Make changes accordingly#################

In [0]:
SciBERT_DF.head()

In [0]:
#Change the column one by one and change it
MAT = []
MAT = SciBERT_DF['Combined_Embedding'][0]
for _,x in SciBERT_DF.iterrows():
  MAT = np.vstack((MAT,x['Combined_Embedding']))

In [0]:
MAT.shape

(4281, 300)

In [0]:
fileName = '/content/drive/My Drive/Colab Notebooks/SciBERT_Combined_Embedding_Matrix.txt'
np.savetxt(fileName,MAT,fmt='%.8f')

In [0]:
#####################Classifying only on two categories HCP and Others ######################

In [0]:
category1 = []
category2 = []

for _,trow in train_df.iterrows():
  if(trow['Label'] == 5.0):
    category2.append(trow['Embedding'])
  else:
    category1.append(trow['Embedding'])

In [0]:
labelList = []
TestDataList = []

def customSplit(userEmList,label):
  n = len(userEmList) - 10
  for i in range(n):
    labelList.append(label)
  TestDataList.extend(userEmList[10:])
  return userEmList[:10]

In [0]:
def getlabelforUser_2(userEmb):
    listScoreUser = []
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category1))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category2))
    #listScoreUser.append(measureSimilarityMeanofUser(userEmb,category3))
    #listScoreUser.append(measureSimilarityMeanofUser(userEmb,category4))
    
    m =  max(listScoreUser)
    
    return (listScoreUser.index(m) + 1)

In [0]:
len(TestDataList)

279

In [0]:
category1 = customSplit(category1,1)
category2 = customSplit(category2,2)

In [0]:
correct_prediction_All_2 = 0
for i in range(len(TestDataList)):
  correct_label = labelList[i]
  prediction = getlabelforUser_2(TestDataList[i])
  if(correct_label == prediction):
    correct_prediction_All_2 = correct_prediction_All_2 + 1

correct_prediction_All_2

In [0]:
###################################### Creating Final Printing of Evaluations #############################
###########################################################################################################
###########################################################################################################

In [0]:
def getlabelforUser_4(userEmb):
    listScoreUser = []
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category1))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category2))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category3))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category4))
    
    m =  max(listScoreUser)
    
    return (listScoreUser.index(m) + 1)

In [0]:
def getlabelforUser_2(userEmb):
    listScoreUser = []
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category1))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category2))
    #listScoreUser.append(measureSimilarityMeanofUser(userEmb,category3))
    #listScoreUser.append(measureSimilarityMeanofUser(userEmb,category4))
    
    m =  max(listScoreUser)
    
    return (listScoreUser.index(m) + 1)

In [0]:
from scipy import spatial
def measureSimilarityMeanofUser(userEmb,category):
    sumsc = 0
    for user in category:
        res = 1 - spatial.distance.cosine(user,userEmb)
        sumsc = sumsc + res
    
    mean = sumsc / len(category)
    return mean
    
def getlabelforUser_8(userEmb):
    listScoreUser = []
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category1))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category2))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category3))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category4))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category5))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category6))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category7))
    listScoreUser.append(measureSimilarityMeanofUser(userEmb,category8))
    
    m =  max(listScoreUser)
    
    return (listScoreUser.index(m) + 1)

In [0]:
def customSplit(userEmList,label):
  n = len(userEmList) - 10
  for i in range(n):
    labelList.append(label)
  TestDataList.extend(userEmList[10:])
  return userEmList[:10]

In [0]:
def printResult_AllEvaluation(train_df):
  #Starting 8 bin categorisation and Evaluation 
  category1 = []
  category2 = []
  category3 = []
  category4 = []
  category5 = []
  category6 = []
  category7 = []
  category8 = []

  for _,trow in train_df.iterrows():
    if(trow['Label'] == 1.0):
      category1.append(trow['Embedding'])
    if(trow['Label'] == 2.0):
      category2.append(trow['Embedding'])
    if(trow['Label'] == 3.0):
      category3.append(trow['Embedding'])
    if(trow['Label'] == 4.0):
      category4.append(trow['Embedding'])
    if(trow['Label'] == 5.0):
      category5.append(trow['Embedding'])
    if(trow['Label'] == 6.0):
      category6.append(trow['Embedding'])
    if(trow['Label'] == 7.0):
      category7.append(trow['Embedding'])
    if(trow['Label'] == 8.0):
      category8.append(trow['Embedding'])

  labelList = []
  TestDataList = []
  category1 = customSplit(category1,1)
  category2 = customSplit(category2,2)
  category3 = customSplit(category3,3)
  category4 = customSplit(category4,4)
  category5 = customSplit(category5,5)
  category6 = customSplit(category6,6)
  category7 = customSplit(category7,7)
  category8 = customSplit(category8,8)

  correct_prediction_All_8 = 0
  for i in range(len(TestDataList)):
    correct_label = labelList[i]
    prediction = getlabelforUser(TestDataList[i])
    if(correct_label == prediction):
      correct_prediction_All_8 = correct_prediction_All_8 + 1

  print("Prediction for Embedding Classification when classifying 8 categories")
  print("correct classification = " + str(correct_prediction_All_8) + " / " + len(TestDataList) + " Mean : " + str(correct_prediction_All_8 / len(TestDataList)))
