In [1]:
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from transformers import BertTokenizer, TFBertModel
import tensorflow_addons as tfa
import matplotlib.pyplot as plt
%matplotlib inline

import nltk
from nltk.tokenize import sent_tokenize

from tensorflow.keras.optimizers import SGD
# Location of the annotated citation-sentence CSV on the local machine.
DATA_CSV_PATH = "/Users/revglue/study/main_work/my_working/my_data_set/my_data_set.csv"
# Read the whole file into a DataFrame and preview its first five rows.
df = pd.read_csv(DATA_CSV_PATH)
df.head()

2022-09-05 10:33:08.040814: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Unnamed: 0,Annotator,Paper,Cited-by,Follow-up,Citing Sentence,Tagged Sentence
0,A,A00-1043,C00-2140,0,Since we only use shallow methods for textual ...,Since we only use shallow methods for textual ...
1,A,A00-1043,P02-1057,0,Sentence simplification systems (Chandrasekar ...,Sentence simplification systems (GTREF) are ca...
2,A,A97-1011,W09-1118,1,Each token is represented using a fairly stand...,Each token is represented using a fairly stand...
3,A,A97-1011,A00-2017,1,The training and the test data were processed ...,The training and the test data were processed ...
4,A,A97-1011,C00-2099,0,The only other high-delity computational rend...,The only other high-_x005fdelity computationa...


In [2]:
def _citation_type_name(follow_up):
    """Translate a numeric 'Follow-up' code into its citation-type name."""
    if follow_up == 0:
        return 'Related work'
    if follow_up == 1:
        return 'Comparison'
    if follow_up == 2:
        return 'Using the work'
    # Any other value falls through to the last category.
    return 'Extending the work'


# Derive a human-readable citation type for every row.
df['Citation Type'] = df['Follow-up'].apply(_citation_type_name)

In [3]:
import math

def softmax(z):
    """Return the softmax of a sequence of numbers as a list of floats.

    Each output is exp(z_i) / sum_j exp(z_j). The largest input is subtracted
    from every value before exponentiating: the result is mathematically
    unchanged, but math.exp can no longer raise OverflowError on large inputs.

    Args:
        z: Iterable of real numbers (list, tuple, pandas Series, ...).

    Returns:
        A list with one weight per input value; the weights sum to 1.
        An empty input yields an empty list.
    """
    values = list(z)  # materialise once so generators can be traversed twice
    if not values:
        return []
    shift = max(values)
    exps = [math.exp(v - shift) for v in values]
    total = sum(exps)
    return [e / total for e in exps]

In [4]:
# Collapse the follow-up codes into two coarse classes:
# codes 0 and 1 become 'Incidental', every other value 'Important'.
df['Coarse Label'] = df['Follow-up'].apply(
    lambda follow_up: 'Important' if follow_up not in (0, 1) else 'Incidental'
)
# Softmax taken over the entire 'Follow-up' column (one weight per row).
df['normalized'] = softmax(df['Follow-up'])

df.head()

Unnamed: 0,Annotator,Paper,Cited-by,Follow-up,Citing Sentence,Tagged Sentence,Citation Type,Coarse Label,normalized
0,A,A00-1043,C00-2140,0,Since we only use shallow methods for textual ...,Since we only use shallow methods for textual ...,Related work,Incidental,0.000508
1,A,A00-1043,P02-1057,0,Sentence simplification systems (Chandrasekar ...,Sentence simplification systems (GTREF) are ca...,Related work,Incidental,0.000508
2,A,A97-1011,W09-1118,1,Each token is represented using a fairly stand...,Each token is represented using a fairly stand...,Comparison,Incidental,0.00138
3,A,A97-1011,A00-2017,1,The training and the test data were processed ...,The training and the test data were processed ...,Comparison,Incidental,0.00138
4,A,A97-1011,C00-2099,0,The only other high-delity computational rend...,The only other high-_x005fdelity computationa...,Related work,Incidental,0.000508


In [5]:
# The tokenizer and the encoder are both loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = TFBertModel.from_pretrained(BERT_CHECKPOINT)

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [6]:
def generate_training_data(df, ids, masks, tokenizer):
    """Tokenize every 'Tagged Sentence' and write the result into ids/masks.

    Row i of `ids` receives the token ids of the i-th sentence and row i of
    `masks` its attention mask. Both arrays are modified in place and also
    returned for convenience.

    Args:
        df: DataFrame with a 'Tagged Sentence' column of strings.
        ids: 2-D array with one row per sentence; its second dimension is the
            length every sentence is padded/truncated to.
        masks: 2-D array with the same shape as `ids`.
        tokenizer: Object exposing `encode_plus(...)` whose result has
            `input_ids` and `attention_mask` attributes.

    Returns:
        The (ids, masks) pair that was passed in, now filled.
    """
    sentences = df['Tagged Sentence']
    # Pad/truncate to the width of the output buffer rather than a separate
    # hard-coded constant, so the two can never disagree.
    max_length = ids.shape[1]
    # Wrap the sentences themselves (not enumerate(...)) so tqdm knows the
    # total and can show real progress instead of an open-ended counter.
    progress = tqdm(sentences, total=len(sentences), desc='Tokenizing')
    for row, sentence in enumerate(progress):
        # NOTE(review): the text is lower-cased before tokenization. If the
        # tokenizer comes from a cased checkpoint this discards casing the
        # model could use - confirm this is intended.
        encoded = tokenizer.encode_plus(
            sentence.lower(),
            max_length=max_length,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[row, :] = encoded.input_ids
        masks[row, :] = encoded.attention_mask
    return ids, masks

In [7]:
import numpy as np

# Pre-allocated buffers: one row per DataFrame row, 512 columns (the padded
# sequence length). Token ids and attention masks are integers, so allocate
# int32 instead of NumPy's float64 default; this halves the memory used and
# matches the dtype='int32' Keras inputs declared later in the notebook.
X_input_ids = np.zeros((len(df), 512), dtype=np.int32)
X_attn_masks = np.zeros((len(df), 512), dtype=np.int32)

In [8]:
from tqdm.auto import tqdm

# Fill both pre-allocated buffers; the function writes into the arrays it is
# given and hands the same arrays back.
X_input_ids, X_attn_masks = generate_training_data(
    df=df,
    ids=X_input_ids,
    masks=X_attn_masks,
    tokenizer=tokenizer,
)

0it [00:00, ?it/s]

In [9]:
# One-hot encode the 'Follow-up' code (4 classes): row k of the 4x4 identity
# matrix is exactly the one-hot vector for class k.
labels = np.eye(4)[df['Follow-up'].values]
# Pair each example's token ids and attention mask with its one-hot label.
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))

In [10]:
def CitationDatasetMapFunction(input_ids, attn_masks, labels):
    """Regroup one dataset element into a (features, labels) pair.

    The features are returned as a dict keyed by 'input_ids' and
    'attention_mask'; the labels are passed through untouched.
    """
    features = dict(input_ids=input_ids, attention_mask=attn_masks)
    return features, labels

In [11]:
# Regroup every element as ({'input_ids', 'attention_mask'}, labels).
dataset = dataset.map(CitationDatasetMapFunction)
# Shuffle ONCE. With the default reshuffle_each_iteration=True the order changes
# on every pass over the data, so the take()/skip() split below would pick
# different batches each epoch and validation examples would leak into training.
dataset = dataset.shuffle(10000, reshuffle_each_iteration=False).batch(16, drop_remainder=True)
p = 0.8  # fraction of the batches used for training
train_size = int((len(df)/16)*p)
# The first train_size batches train the model; the remaining ones validate it.
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

In [12]:
# Symbolic model inputs: token ids and attention mask (512 positions each),
# plus a single integer citation-type id per example.
input_ids = tf.keras.layers.Input(
    name='input_ids', shape=(512,), dtype='int32'
)
attn_masks = tf.keras.layers.Input(
    name='attention_mask', shape=(512,), dtype='int32'
)
citation_type_ids = tf.keras.layers.Input(
    name='citation_type_ids', shape=(1,), dtype='int32'
)

In [13]:
# Look up a 30-dimensional vector for each of the 4 possible citation-type ids;
# the model summary reports this layer's output as (None, 1, 30).
citation_type_embedding = tf.keras.layers.Embedding(input_dim=4, output_dim=30)(citation_type_ids)
# Drop the length-1 axis so the embedding is 2-D.
citation_type_embedding = tf.keras.backend.squeeze(citation_type_embedding, axis=1)

In [14]:
# Run the pretrained encoder on the symbolic inputs. Output element 0 is the
# per-token activation tensor (3D); element 1 is the pooled output (2D).
bert_outputs = model(input_ids, attention_mask=attn_masks)
bert_embds = bert_outputs[1]
# Join the pooled text representation with the citation-type embedding.
citation_type_and_text = tf.keras.layers.Concatenate()([bert_embds, citation_type_embedding])

In [15]:
# Hidden projection: 1024 ReLU units on top of the concatenated features.
intermediate_layer = tf.keras.layers.Dense(
    units=1024, activation='relu', name='intermediate_layer'
)(citation_type_and_text)
# Output head: softmax turns the 4 class scores into probabilities.
output_layer = tf.keras.layers.Dense(
    units=4, activation='softmax', name='output_layer'
)(intermediate_layer)

In [16]:
# Assemble the three inputs and the softmax head into one Keras model.
# NOTE(review): the model declares a 'citation_type_ids' input, but
# CitationDatasetMapFunction only emits 'input_ids' and 'attention_mask' -
# confirm how this third input is meant to be fed before training.
citation_model = tf.keras.Model(
    inputs=[input_ids, attn_masks, citation_type_ids],
    outputs=output_layer,
)
citation_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 citation_type_ids (InputLayer)  [(None, 1)]         0           []                               
                                                                                                  
 input_ids (InputLayer)         [(None, 512)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 1, 30)        120         ['citation_type_ids[0][0]']      
                                                                                              