In [1]:
!pip install pretty-confusion-matrix
!pip install transformers
!pip install pytorch-transformers
!pip install -U sentence-transformers

Collecting pretty-confusion-matrix
  Downloading pretty_confusion_matrix-0.1.1-py3-none-any.whl (9.6 kB)
Collecting flake8<4.0.0,>=3.9.2
  Downloading flake8-3.9.2-py2.py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.1/73.1 kB[0m [31m676.0 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting pre-commit<3.0.0,>=2.12.1
  Downloading pre_commit-2.20.0-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.5/199.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting black<22.0,>=21.5b0
  Downloading black-21.12b0-py3-none-any.whl (156 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.7/156.7 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting tomli<2.0.0,>=0.2.6
  Downloading tomli-1.2.3-py3-none-any.whl (12 kB)
Collecting pyflakes<2.4.0,>=2.3.0
  Downloading pyflakes-2.3.1-py2.py3-none-any.whl (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import numpy as np 
import pandas as pd 
import csv
import pickle
import matplotlib.pyplot as plt

In [3]:
import tensorflow as tf
from keras.layers import Bidirectional, Input, Dense, Layer, Dropout, LSTM, Embedding, Flatten
from keras.models import Sequential, Model
from tensorflow.python.keras.callbacks import EarlyStopping
from keras import backend as K

In [4]:
POLITENESS_LEVELS = 5
EPOCHS = 30
MAXLEN = 768 # Since SciBERT returns 768 embeddings vector
LSTM_UNITS = 256
is_BiLSTM = True # Flag to automate other pre-processing for With or Without BiLSTM variants
VOCAB_LEN = 1853
EMBEDDING_DIMENSION = 300

In [30]:
start_path = '/kaggle/input/iitpolitenesslevels/'

X_val = pd.read_csv(start_path+'val.csv')
y_val = pd.read_csv(start_path+'y_val.csv') 

# LOAD EMBEDS
LOAD_PATH = start_path+'SCIBERT'+'_val.pickle'
with open(LOAD_PATH, 'rb') as handle:
    sci_val_embeds = pickle.load(handle)
    sci_val_embeds = np.array([row[0] for row in sci_val_embeds ])
    handle.close()
 
LOAD_PATH = start_path+'HATE-BERT'+'_val.pickle'
with open(LOAD_PATH, 'rb') as handle:
    hate_val_embeds = pickle.load(handle)
    hate_val_embeds = np.array([row[0] for row in hate_val_embeds ])
    handle.close()
    
LOAD_PATH = start_path+'TOXIC-BERT'+'_val.pickle'
with open(LOAD_PATH, 'rb') as handle:
    toxic_val_embeds = pickle.load(handle)
    toxic_val_embeds = np.array([row[0] for row in toxic_val_embeds ])
    handle.close()


LOAD_PATH = start_path+'Tokennized_Processed_X_val-BiLSTM.csv'
custom_val_embeds = pd.read_csv(LOAD_PATH)



In [31]:
X_val

Unnamed: 0,review,HIMP,IMP,N,P,HP,Tone
0,"The approach presented is simple , clearly pre...",0,0,1,0,0,3.0
1,One minor suggestion for improving readability...,0,0,1,0,0,3.0
2,"It is at best of little value and, in the wors...",0,1,0,0,0,2.0
3,"1.Are the samples sequenced? If yes, it will b...",0,0,0,0,1,5.0
4,Quality : Im intrigued by but a little uncomfo...,0,1,0,0,0,2.0
...,...,...,...,...,...,...,...
279,This leads to the proposed algorithm called DA...,0,0,0,1,0,4.0
280,I also think that the authors might benefit fr...,0,0,0,1,0,4.0
281,"From this perspective , I wish to see more mat...",0,0,1,0,0,3.0
282,Usually climate studies do not show a good met...,0,0,1,0,0,3.0


## LOAD MODEL

In [32]:
class Attention(Layer):
    
    def __init__(self, return_sequences=True, **kwargs):
        super(Attention,self).__init__()
        self.return_sequences = return_sequences
        super(Attention, self).__init__(**kwargs)

    def get_config(self):
        config = super(Attention, self).get_config().copy()
        config.update({
            'return_sequences': self.return_sequences , 
        })
        return config


    def build(self, input_shape):
        
        self.W=self.add_weight(name="att_weight", shape=(input_shape[-1],1),
                               initializer="normal")
        self.b=self.add_weight(name="att_bias", shape=(input_shape[1],1),
                               initializer="zeros")
        
        super(Attention,self).build(input_shape)
        
    def call(self, x):
        e =K.squeeze(K.tanh(K.dot(x,self.W)+self.b),axis=-1)
        a =K.softmax(e)
        a=K.expand_dims(a,axis=-1)
        output = x*a
        
        return K.sum(output, axis=1) 

In [33]:
# LOAD MODEL
from keras.models import load_model
def loadModel(name, PATH, X, y):
    model = load_model(PATH, custom_objects={'Attention': Attention})
    print(name+" MODEL LOADED\n\n")
    model.evaluate(X, y)
    
    return model

### A) HATE-BERT

In [34]:
PATH = '/kaggle/input/iitpolitenesslevels/Politeness_HATE-BERT.h5'
hateBert_model = loadModel('HATE-BERT', PATH, hate_val_embeds, y_val)

HATE-BERT MODEL LOADED




### B) SCIBERT

In [35]:
PATH = '/kaggle/input/iitpolitenesslevels/Politeness_SCIBERT.h5'
sciBert_model = loadModel('SCIBERT', PATH, sci_val_embeds, y_val)

SCIBERT MODEL LOADED




### C) TOXIC-BERT

In [36]:
PATH = '/kaggle/input/iitpolitenesslevels/Politeness_TOXIC-BERT.h5'
toxicBert_model = loadModel('TOXIC-BERT', PATH, toxic_val_embeds, y_val)

TOXIC-BERT MODEL LOADED




### D) Custom Embed

In [41]:
PATH = '/kaggle/input/iitpolitenesslevels/Politeness_Custom-Embedding-BiLSTM.h5'
custom_model = loadModel('Custom Embed', PATH, custom_val_embeds, y_val)

Custom Embed MODEL LOADED




# VALIDATION

In [66]:
def adjustIndex(arr):
    return [x+1 for x in arr]

In [67]:
y_pred_SCI = sciBert_model.predict(sci_val_embeds)
y_pred_HATE = hateBert_model.predict(hate_val_embeds)
y_pred_TOXIC = toxicBert_model.predict(toxic_val_embeds)
y_pred_Custom = toxicBert_model.predict(custom_val_embeds)

y_pred_SCI_idx = adjustIndex(np.argmax(y_pred_SCI, axis=1))
y_pred_HATE_idx = adjustIndex(np.argmax(y_pred_HATE, axis=1))
y_pred_TOXIC_idx = adjustIndex(np.argmax(y_pred_TOXIC, axis=1))
y_pred_Custom_idx = adjustIndex(np.argmax(y_pred_Custom, axis=1))

y_val_true_idx = adjustIndex(np.argmax(y_val.values, axis=1))

In [68]:
# CONCATENATE RESULTS
results = pd.DataFrame()
results['reviews'] = X_val.review
results['True_Val'] = y_val_true_idx
results['SciBert'] = y_pred_SCI_idx
results['HateBert'] = y_pred_HATE_idx
results['ToxicBert'] = y_pred_TOXIC_idx
results['CudstomEmbed'] = y_pred_Custom_idx
results

Unnamed: 0,reviews,True_Val,SciBert,HateBert,ToxicBert,CudstomEmbed
0,"The approach presented is simple , clearly pre...",3,5,4,4,3
1,One minor suggestion for improving readability...,3,5,4,4,5
2,"It is at best of little value and, in the wors...",2,2,2,2,2
3,"1.Are the samples sequenced? If yes, it will b...",5,5,5,5,2
4,Quality : Im intrigued by but a little uncomfo...,2,5,5,4,1
...,...,...,...,...,...,...
279,This leads to the proposed algorithm called DA...,4,4,4,4,1
280,I also think that the authors might benefit fr...,4,4,4,3,5
281,"From this perspective , I wish to see more mat...",3,2,5,2,5
282,Usually climate studies do not show a good met...,3,3,4,3,3


In [69]:
results.to_csv('baselines_val_labels.csv', index=False)

In [63]:
correct = []
for index, row in results.iterrows():
    labels = ['True_Val', 'SciBert', 'HateBert', 'ToxicBert', 'CudstomEmbed']
#     print(row)
    if sum(row[labels].values)/(len(labels)-1) == row['True_Val']:
        correct.append(row)
        
pd.DataFrame(correct)

Unnamed: 0,reviews,True_Val,SciBert,HateBert,ToxicBert,CudstomEmbed
10,"In fact , it is not difficult to design exampl...",2,2,2,2,0
11,I showed this paper to my nurses and they agre...,1,1,1,1,0
17,"Cite newer, relevant references, especially th...",4,4,4,4,0
19,This is very bad .,1,1,1,1,0
20,"Please, consider carefully my comments in the ...",4,4,4,4,0
...,...,...,...,...,...,...
261,"If such were present , Id rate this paper sign...",1,0,1,2,0
262,I congratulate the author for the work,4,4,4,4,0
265,Judging from the description of the experiment...,2,2,2,2,0
278,This looks like a work of pure fantasy.,1,1,1,1,0
