In [1]:
# Colab-only helper: mount Google Drive at /content/drive so the project files
# referenced by the cells below can be read through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
!pip install tensorflow-addons
!pip install Keras-Preprocessing

Collecting Keras-Preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Keras-Preprocessing
Successfully installed Keras-Preprocessing-1.1.2


In [19]:
import pandas as pd
import pickle
import numpy as np

import tensorflow as tf
import keras
import tensorflow_addons as tfa
from keras.models import load_model
from tensorflow.keras import layers
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences


In [20]:
# Project folder on the mounted Drive. Other cells build file names with
# `FILE_PATH + "<name>"`, so the trailing "/" is required.
FILE_PATH = "/content/drive/MyDrive/Colab Notebooks/Arxiv Topic Classification/first 1 million/"

In [21]:
# Load test_data.csv from the project folder into a DataFrame.
test_data = pd.read_csv(FILE_PATH + "test_data.csv")

In [22]:
class TransformerBlock(layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward network.

    Each sub-layer is wrapped as ``LayerNorm(residual + Dropout(sublayer(x)))``.

    Args:
        embed_dim: Size of the last axis of the input; also used as ``key_dim``
            of the attention layer and as the output size of the feed-forward
            network.
        num_heads: Number of attention heads.
        ff_dim: Hidden size of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
        **kwargs: Standard ``Layer`` arguments (``name``, ``dtype``, ...). They
            must be accepted so that ``from_config`` can rebuild the layer when
            a saved model is loaded.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: Tensor whose last axis has size ``embed_dim``.
            training: Forwarded to the dropout layers; ``None`` lets Keras
                decide from the surrounding context.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # Self-attention: the sequence is both query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

In [23]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of rows in the position-embedding table.
            NOTE(review): inputs longer than this would index past the table;
            the notebook pads/truncates to a fixed length, presumably this one
            - confirm.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Size of each embedding vector.
        **kwargs: Standard ``Layer`` arguments (``name``, ``dtype``, ...). They
            must be accepted so that ``from_config`` can rebuild the layer when
            a saved model is loaded.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the token ids in ``x`` and add a position embedding per step.

        The position ids run from 0 to the size of the last axis of ``x``.
        """
        seq_len = tf.shape(x)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        position_vectors = self.pos_emb(position_ids)
        token_vectors = self.token_emb(x)
        # Position vectors broadcast over the leading axes of the token vectors.
        return token_vectors + position_vectors

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update(
            {
                "maxlen": self.maxlen,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

In [25]:
# Location of the saved model file (.h5) inside the project folder.
model_path = FILE_PATH + "model2.h5"
# Maps the names passed to load_model onto the Python objects implementing them:
# the two custom layers defined in this notebook and the tensorflow-addons
# Hamming-loss metric (passed as an instance configured for multilabel mode).
custom_objects = {"TransformerBlock": TransformerBlock,
                  "TokenAndPositionEmbedding": TokenAndPositionEmbedding,
                  "HammingLoss" : tfa.metrics.HammingLoss(mode='multilabel')}
# `loaded_model` is read as a global by predict() and by the step-by-step demo cell.
loaded_model = load_model(model_path, custom_objects=custom_objects)

In [38]:
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the stop-word corpus; nothing is downloaded when it is already present.
nltk.download("stopwords")

# One stemmer instance shared by every text_preprocess call.
ps = PorterStemmer()

# Stored as a frozenset so the per-token `word not in stop_word_collection`
# test in text_preprocess is a hash lookup instead of a scan over a list.
# The set of words filtered out is unchanged.
stop_word_collection = frozenset(stopwords.words('english'))

def text_preprocess(text):
  """Clean a raw string into space-separated, stemmed tokens.

  The steps run in this order:
    1. delete every character listed in string.punctuation
    2. replace numbers and words containing a digit with a space
    3. lower-case the text
    4. drop words found in stop_word_collection
    5. stem the remaining words with ps

  Input:
    text: raw input string

  Output:
    the cleaned string, tokens joined by single spaces
  """
  # Step 1: a translation table that maps each punctuation character to nothing
  punctuation_free = text.translate(str.maketrans('', '', string.punctuation))

  # Step 2: numbers and digit-bearing words become blanks
  digit_free = re.sub(r'\w*\d\w*', ' ', punctuation_free).strip()

  # Steps 3-5: lower-case, filter stop words, stem, and re-join
  kept_words = [token for token in digit_free.lower().split()
                if token not in stop_word_collection]
  return ' '.join(ps.stem(token) for token in kept_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [110]:
def predict(text, tokenizer, labels_list, preprocessed = False, top_k = 3,
            model = None, max_pad_length = None):
  """
  Return the top_k most probable labels for a single text.

  Input:
    text: input text
    tokenizer: word tokenizer for the text (must provide texts_to_sequences)
    labels_list: list of all the labels we have in our data, in the same order
      as the model outputs
    preprocessed: Whether the input data is already processed or not; when
      False the text is first passed through text_preprocess
    top_k: no. of labels to be returned along with their probabilities
      (None returns every label)
    model: model used for prediction; defaults to the notebook-level
      `loaded_model`
    max_pad_length: length the token sequence is padded/truncated to; defaults
      to the notebook-level `MAX_PAD_LENGTH`

  Output:
    list of (label, probability) tuples, highest probability first; labels
    with equal probability keep their order from labels_list

  Raises:
    ValueError: if top_k is negative, or if the model returns a different
      number of scores than there are labels
  """
  if top_k is not None and top_k < 0:
    raise ValueError(f"top_k must be non-negative, got {top_k}")

  # Fall back to the notebook globals so existing calls keep working unchanged.
  if model is None:
    model = loaded_model
  if max_pad_length is None:
    max_pad_length = MAX_PAD_LENGTH

  if not preprocessed:
    text = text_preprocess(text)
  text_sequence = tokenizer.texts_to_sequences([text])
  text_padded = pad_sequences(text_sequence, maxlen = max_pad_length, padding = "post", truncating = "post")

  # A batch of one text was submitted, so only the first row is relevant.
  predictions = list(model.predict(text_padded)[0])
  if len(predictions) != len(labels_list):
    raise ValueError(
      f"model returned {len(predictions)} scores but labels_list has "
      f"{len(labels_list)} entries"
    )

  # sorted() is stable even with reverse=True, so ties stay in label order.
  ranked_indices = sorted(range(len(predictions)), key=predictions.__getitem__, reverse=True)
  return [(labels_list[i], predictions[i]) for i in ranked_indices[:top_k]]

In [112]:
# Step 1: Load the labels list and the tokenizer from the project folder.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load these files if they come from a trusted source.
with open(FILE_PATH + 'label_name.pkl', 'rb') as handle:
  labels_list = pickle.load(handle)
with open(FILE_PATH + 'tokenizer.pkl', 'rb') as handle:
  tokenizer = pickle.load(handle)

# Step 2: Predict the labels for the test row with index label 0.
# NOTE(review): X_test, and MAX_PAD_LENGTH (read inside predict), are assigned in
# cells that appear further down this notebook, so this cell fails on a fresh
# top-to-bottom run unless those cells are executed first.
# preprocessed = True skips text_preprocess; presumably the "text" column was
# already cleaned when the CSV was written - confirm.
text = X_test[0]
predict(text, tokenizer, labels_list = labels_list, preprocessed = True, top_k = 5)




[('hep-ph', 0.9601804),
 ('hep-ex', 0.22531293),
 ('astro-ph', 0.19884947),
 ('astro-ph.HE', 0.055537507),
 ('nucl-th', 0.020627443)]

In [113]:
# Classify a raw abstract pasted as a string. preprocessed = False makes predict
# run text_preprocess on it before tokenizing.
text = """This study significantly concentrates on cryogenic InP HEMT high-frequency circuit analysis using quantum theory to find how the transistor nonlinearity can affect the quantum correlation of the modes generated. Firstly, the total Hamiltonian of the circuit is derived, and the dynamic equation of the motion contributed is examined using the Heisenberg-Langevin equation. Using the nonlinear Hamiltonian, some components are attached to the intrinsic internal circuit of InP HEMT to address the circuit characteristics fully. The components attached are arisen due to the nonlinearity effects. As a result, the theoretical calculations show that the states generated in the circuit are mixed, and no pure state is produced. Accordingly, the modified circuit generates the two-mode squeezed thermal state, which means one can focus on calculating the Gaussian quantum discord to evaluate quantum correlation. It is also found that the nonlinearity factors (addressed as the nonlinear components in the circuit) can intensely influence the squeezed thermal state by which the quantum discord is changed. Finally, as the primary point, it is concluded that although it is possible to enhance the quantum correlation between modes by engineering the nonlinear components; however, attaining quantum discord greater than unity, """
predict(text, tokenizer, labels_list = labels_list, preprocessed = False, top_k = 5)



[('quant-ph', 0.93055),
 ('cond-mat.mes-hall', 0.20567559),
 ('physics.optics', 0.07040421),
 ('cond-mat.other', 0.057682544),
 ('cond-mat.stat-mech', 0.03836287)]

In [88]:
# Every column after the first one is treated as a label column.
label_name = list(test_data.columns[1:])
# Features are the "text" column; targets are the label columns.
X_test = test_data["text"]
y_test = test_data[label_name]

In [92]:
## Explanation steps of the prediction function
# Sequence length used for padding/truncation, both here and (as a global) inside predict.
# NOTE(review): presumably this has to match the length used at training time - confirm.
MAX_PAD_LENGTH = 210
text = "quantum physics is difficult."
# 1) Clean the raw sentence: punctuation, digit-bearing words and stop words are
#    removed, the text is lower-cased and the remaining words are stemmed.
text = text_preprocess(text)
print(text)
# 2) Convert the cleaned text into a list of integer token ids.
text_sequence = tokenizer.texts_to_sequences([text])
print(text_sequence)
# 3) Pad at the end (or truncate at the end) to exactly MAX_PAD_LENGTH ids.
text_padded = pad_sequences(text_sequence, maxlen = MAX_PAD_LENGTH, padding = "post", truncating = "post")
print(text_padded)
# 4) Run the model on the single padded sequence and print its raw output;
#    predict() pairs these values with labels_list by position.
print("Prediction: ")
print(loaded_model.predict(text_padded))

quantum physic difficult
[[23, 131, 1282]]
[[  23  131 1282    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0