In [None]:
import numpy as np
import pandas as pd
import os

In [None]:
# Install the modelling stack (transformers, fastai, blurr) and the ONNX tooling used for export/quantization.
# NOTE(review): only `onnx` is version-pinned, and both `onnxruntime` and `onnxruntime-gpu` are
# requested — confirm which build actually gets imported and consider pinning the rest.
! pip install -q transformers[sentencepiece] fastai ohmeow-blurr nbdev
! pip install -q onnxruntime onnx==1.10.0 onnxruntime-gpu onnxruntime_tools

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoConfig
from fastai.text.all import *
from blurr.text.data.all import *
from blurr.text.modeling.all import *

In [None]:
import ast
# Load the paper dump, drop incomplete rows and the unused `link` column.
data = (
    pd.read_csv('/kaggle/input/ieee-xplore-paper-details/papers.csv')
    .dropna()
    .reset_index(drop=True)
    .drop(columns='link')
)

# Model input is the title immediately followed by the abstract (vectorized; no row-wise apply).
# NOTE(review): there is no separator between title and abstract, so the last title word fuses
# with the first abstract word. Left unchanged because the pretrained learner presumably saw the
# same format — confirm before changing.
data['text'] = data['Title'] + data['abstract']
data = data.drop(columns=['Title', 'abstract'])

# `keywords` is read as text; literal_eval turns each cell back into a Python object
# (iterated as a collection of keywords further down) without executing arbitrary code.
data['keywords'] = data['keywords'].map(ast.literal_eval)

# Show a preview only instead of the whole frame.
data.head()

In [None]:
# Count the occurrences of every keyword across all papers.
# dict.get replaces the former bare `except:`, which would also have hidden unrelated errors.
keyword_count = {}
for paper_keywords in data['keywords']:
    for keyword in paper_keywords:
        keyword_count[keyword] = keyword_count.get(keyword, 0) + 1
print(len(keyword_count))


In [None]:
# Keep only the keywords that occur strictly more than `threshold` times.
threshold = 20
common_keywords = []
for kw, n_occurrences in keyword_count.items():
    if n_occurrences > threshold:
        common_keywords.append(kw)
len(common_keywords)

In [None]:
import json
# Class index -> keyword, in the order the common keywords were collected; saved for later use.
ohe_map = dict(enumerate(common_keywords))
with open("keys_encoded.json", "w") as fp:
    json.dump(ohe_map, fp)

In [None]:
keywords = []
# A set makes the membership test below O(1) per keyword.
common_keywords = set(common_keywords)
# For every paper, keep only the keywords that are in the common set (original order preserved).
refined_keys = [
    [key for key in paper_keys if key in common_keywords]
    for paper_keys in data['keywords']
]

In [None]:
no_match = []  # positions of papers having no common keywords
ohe_refined = []
for row_pos, keys in enumerate(refined_keys):
    # One slot per class, in ohe_map order: 1 when the paper carries that keyword, else 0.
    ohe = [1 if key in keys else 0 for key in ohe_map.values()]
    ohe_refined.append(ohe)
    if not any(ohe):
        no_match.append(row_pos)

In [None]:
# Add the multi-hot target column and replace each keyword list with its filtered version.
data = data.assign(keywords_ohe=ohe_refined, keywords=refined_keys)

In [None]:
# Remove the papers that ended up with no common keyword (rows are addressed by index label).
data = data.drop(index=no_match)

In [None]:
# Hold out 10% of the papers for validation. A fixed seed makes the split, and therefore every
# metric reported below, reproducible across kernel restarts.
# NOTE(review): the learner loaded later was trained elsewhere; unless it used this exact split,
# some "validation" rows may have been seen during training — confirm.
SPLIT_SEED = 42
splitter = RandomSplitter(valid_pct=0.1, seed=SPLIT_SEED)
train_ids, valid_ids = splitter(data)
len(train_ids), len(valid_ids)

In [None]:
# Renumber the rows 0..n-1 (the old index is kept as an `index` column) and pick the
# train/validation rows by the ids produced by the splitter.
data = data.reset_index()
train_df, valid_df = data.loc[train_ids], data.loc[valid_ids]

In [None]:
# Load the previously exported learner that is used for inference and ONNX export below.
# NOTE(security): fastai's load_learner unpickles the file, which can execute arbitrary code —
# only load .pkl files from a trusted source.
model_path = "/kaggle/input/ieee-keyword-predictiontrained-models/keyword-predictor.pkl"
learner_inf = load_learner(model_path)

In [None]:
# Smoke test: predicted keyword labels for a hand-written sample text.
sample_text = "KNN is a state-of-the-art machine learning algorithm. this paper presents a new knn algorithm"
learner_inf.blurr_predict(sample_text)[0]['labels']

In [None]:
# data

In [None]:
from sklearn import metrics

def metric_measures(test_df, preds):
    """Print multi-label classification metrics for a set of predictions.

    Args:
        test_df: DataFrame with a ``keywords_ohe`` column holding one
            multi-hot target vector per row.
        preds: Sequence of multi-hot prediction vectors, one per row of
            ``test_df`` and in the same row order.

    Returns:
        None. Accuracy and micro/macro F1 are printed.
    """
    targets = [np.asarray(target) for target in test_df['keywords_ohe'].to_list()]
    outputs = [np.asarray(pred) for pred in preds]

    # For multi-label input scikit-learn reports subset accuracy:
    # a row only counts when every label matches.
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')

    # The accuracy was previously computed but silently discarded.
    print(f"Accuracy (exact match) = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

    return

In [None]:
# Preview the validation rows (the bare iterrows() call only displayed a generator object).
valid_df.head()

In [None]:
# Reverse lookup: keyword -> class index.
ohe_map_rev = {keyword: class_idx for class_idx, keyword in ohe_map.items()}

In [None]:
from tqdm.notebook import tqdm
# Predict every validation text with the fastai/blurr learner and encode the returned
# labels as a multi-hot vector with one slot per common keyword.
preds = []
n_classes = len(common_keywords)
for _, row in tqdm(valid_df.iterrows(), total=len(valid_df)):
    predicted_labels = learner_inf.blurr_predict(row['text'])[0]['labels']
    pred_genres = [0] * n_classes
    for label in predicted_labels:
        pred_genres[ohe_map_rev[label]] = 1
    preds.append(pred_genres)

preds[0][:20]

# Compression

In [None]:
# List the files in the current working directory.
!ls

In [None]:
# Create an empty placeholder file in the current directory.
# NOTE(review): the export below writes /kaggle/working/keyword-classifier.onnx itself, so this
# is presumably unnecessary — confirm and remove.
! touch keyword-classifier.onnx

In [None]:
# Export the wrapped Hugging Face model (in eval mode) to ONNX.
classifier = learner_inf.model.hf_model.eval()

# Dummy batch of one sequence with 512 token ids, used only to trace the graph.
dummy_input_ids = torch.LongTensor([[0] * 512])

# Batch size and sequence length of the input stay dynamic; only the batch axis of the output does.
onnx_dynamic_axes = {
    'input_ids': {0: 'batch_size', 1: 'sequence_len'},
    'output': {0: 'batch_size'},
}

torch.onnx.export(
    classifier,
    dummy_input_ids,
    '/kaggle/working/keyword-classifier.onnx',
    input_names=['input_ids'],
    output_names=['output'],
    opset_version=13,
    dynamic_axes=onnx_dynamic_axes,
)

In [None]:
# Create an empty placeholder file in the current directory.
# NOTE(review): quantize_dynamic below is given the output path and presumably creates the file
# itself — confirm and remove.
! touch keyword-classifier-quantized.onnx

In [None]:
# NOTE(review): onnxruntime is already requested by the install cell at the top of the notebook;
# this repeat looks redundant — confirm and remove.
! pip install onnxruntime

In [None]:

from onnxruntime.quantization import quantize_dynamic, QuantType

# Source (float) model and destination (quantized) model.
onnx_model_path, quantized_onnx_model_path = (
    '/kaggle/working/keyword-classifier.onnx',
    '/kaggle/working/keyword-classifier-quantized.onnx',
)

# Dynamic quantization with unsigned 8-bit weights.
quantize_dynamic(onnx_model_path, quantized_onnx_model_path, weight_type=QuantType.QUInt8)

In [None]:
# The model has already been quantized with exactly these arguments, so repeating the call only
# wastes time. Report the effect of the compression instead.
onnx_size_mb = os.path.getsize(onnx_model_path) / 1e6
quantized_size_mb = os.path.getsize(quantized_onnx_model_path) / 1e6
print(f"ONNX model size: {onnx_size_mb:.1f} MB")
print(f"Quantized ONNX model size: {quantized_size_mb:.1f} MB")

In [None]:
import onnxruntime as rt
from transformers import AutoTokenizer
import torch

# NOTE(review): assumes the learner was fine-tuned from this checkpoint, so that the token ids
# match what the exported model expects — confirm.
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

# Keywords in class-index order (ohe_map_rev preserves the insertion order of ohe_map).
class_labels = list(ohe_map_rev.keys())

# Name the execution provider explicitly: ONNX Runtime builds that ship more than one provider
# (e.g. onnxruntime-gpu) refuse to create a session without it.
inf_session = rt.InferenceSession(
    '/kaggle/working/keyword-classifier.onnx',
    providers=['CPUExecutionProvider'],
)
input_name = inf_session.get_inputs()[0].name
output_name = inf_session.get_outputs()[0].name

In [None]:
# Predict every validation text with the ONNX session and encode the result as multi-hot vectors.
preds = []
for _, row in tqdm(valid_df.iterrows(), total=valid_df.shape[0]):
    # Let the tokenizer truncate to 512 tokens: slicing the id list with [:512] would cut off
    # the closing special token of long texts.
    input_ids = tokenizer(row['text'], truncation=True, max_length=512)['input_ids']

    # Batch of one; int64 is spelled out so the dtype does not depend on the platform default.
    batch = np.asarray([input_ids], dtype=np.int64)
    logits = torch.FloatTensor(inf_session.run([output_name], {input_name: batch})[0])

    # Independent sigmoid per class; a class is predicted when its probability is >= 0.5.
    masks = torch.sigmoid(logits) >= 0.5
    labels = [class_labels[pos] for pos, is_on in enumerate(masks[0]) if is_on]

    pred_genres = [0] * len(ohe_map_rev)
    for label in labels:
        pred_genres[ohe_map_rev[label]] = 1
    preds.append(pred_genres)

In [None]:
# Metrics for the predictions currently held in `preds` (here: the unquantized ONNX model).
metric_measures(valid_df, preds) 

In [None]:
# Persist the keyword -> class-index lookup.
with open("keys_encoded_rev.json", "w") as rev_file:
    json.dump(ohe_map_rev, rev_file)

In [None]:
import onnxruntime as rt
from transformers import AutoTokenizer
import torch

# NOTE(review): assumes the learner was fine-tuned from this checkpoint, so that the token ids
# match what the exported model expects — confirm.
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

# Keywords in class-index order (ohe_map_rev preserves the insertion order of ohe_map).
class_labels = list(ohe_map_rev.keys())

# Name the execution provider explicitly: ONNX Runtime builds that ship more than one provider
# (e.g. onnxruntime-gpu) refuse to create a session without it.
inf_session = rt.InferenceSession(
    '/kaggle/working/keyword-classifier-quantized.onnx',
    providers=['CPUExecutionProvider'],
)
input_name = inf_session.get_inputs()[0].name
output_name = inf_session.get_outputs()[0].name

In [None]:
# Predict every validation text with the quantized ONNX session and encode the result as
# multi-hot vectors.
preds = []
for _, row in tqdm(valid_df.iterrows(), total=valid_df.shape[0]):
    # Let the tokenizer truncate to 512 tokens: slicing the id list with [:512] would cut off
    # the closing special token of long texts.
    input_ids = tokenizer(row['text'], truncation=True, max_length=512)['input_ids']

    # Batch of one; int64 is spelled out so the dtype does not depend on the platform default.
    batch = np.asarray([input_ids], dtype=np.int64)
    logits = torch.FloatTensor(inf_session.run([output_name], {input_name: batch})[0])

    # Independent sigmoid per class; a class is predicted when its probability is >= 0.5.
    masks = torch.sigmoid(logits) >= 0.5
    labels = [class_labels[pos] for pos, is_on in enumerate(masks[0]) if is_on]

    # `encode_genre_types` is not defined anywhere in this notebook (NameError);
    # the keyword -> index lookup used everywhere else is `ohe_map_rev`.
    pred_genres = [0] * len(ohe_map_rev)
    for label in labels:
        pred_genres[ohe_map_rev[label]] = 1
    preds.append(pred_genres)

In [None]:
# Metrics for the predictions currently held in `preds` (here: the quantized ONNX model).
metric_measures(valid_df, preds) 