In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
!pip install tensorflow-addons
!pip install Keras-Preprocessing

[0mCollecting tensorflow-addons
  Downloading tensorflow_addons-0.21.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (612 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m612.1/612.1 kB[0m [31m680.9 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:02[0m
[?25hCollecting typeguard<3.0.0,>=2.7
  Using cached typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow-addons
Successfully installed tensorflow-addons-0.21.0 typeguard-2.13.3
Collecting Keras-Preprocessing
  Using cached Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
Collecting numpy>=1.9.1
  Downloading numpy-1.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m0m eta [36m0:00:01[0m:01[0m0:01[0m
[?25hInstalling collected packages: numpy, Keras-Preprocessing
Successfully

In [3]:
import pandas as pd
import pickle
import numpy as np

import nltk
import re
nltk.download("stopwords")
from nltk.stem import PorterStemmer
ps = PorterStemmer()
from nltk.corpus import stopwords
stop_word_collection = stopwords.words('english')
import string
from tqdm import tqdm

import tensorflow as tf
import keras
import tensorflow_addons as tfa
from keras.models import load_model
from tensorflow.keras import layers
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nirajan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm
2023-07-19 18:48:04.300005: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-19 18:48:04.538854: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-19 18:48:04.539987: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dep

In [4]:
def text_preprocess(text):
  # Remove all punctuations
  text = ''.join(c for c in text if c not in string.punctuation)

  # Remove all numbers and words containing numbers
  text = re.sub(r'\w*\d\w*', ' ', text).strip()

  # Changes to lower case
  text = text.lower()

  # Remove all stop words
  text = ' '. join(word for word in text.split() if word not in stop_word_collection)

  # Stemming of all words
  text = [ps.stem(word) for word in text.split()]
  text = ' '.join(text)
  return text

class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

def getFeatures(text, tokenizer, model, preprocessed = False, MAX_PAD_LENGTH = 210):
  """
  Input:
    text: input text
    tokenizer: word tokenizer for the text
    preprocessed: Whether the input data is already processed or not

  Output:
    top_k no. of labels along with their corresponding probabilities
  """
  if not preprocessed:
    text = text_preprocess(text)
  text_sequence = tokenizer.texts_to_sequences([text])
  text_padded = pad_sequences(text_sequence, maxlen = MAX_PAD_LENGTH, padding = "post", truncating = "post")
  output = model.predict(text_padded)
  output = output.flatten()
  # return the sigmoid of the output layer
  return 1 / (1 + np.exp(-output))


In [6]:
FILE_PATH = ""
MAX_PAD_LENGTH = 210

with open(FILE_PATH + 'tokenizer.pkl', 'rb') as handle:
  tokenizer = pickle.load(handle)


model_path = FILE_PATH + "model2.h5"
custom_objects = {"TransformerBlock": TransformerBlock,
                  "TokenAndPositionEmbedding": TokenAndPositionEmbedding,
                  "HammingLoss" : tfa.metrics.HammingLoss(mode='multilabel')}
loaded_model = load_model(model_path, custom_objects=custom_objects)

# Create a new model that takes the input and outputs from the second last layer
second_last_layer_model = keras.Model(inputs=loaded_model.input, outputs=loaded_model.layers[-3].output)

In [7]:
second_last_layer_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 210)]             0         
                                                                 
 token_and_position_embeddin  (None, 210, 32)          806720    
 g (TokenAndPositionEmbeddin                                     
 g)                                                              
                                                                 
 transformer_block (Transfor  (None, 210, 32)          10656     
 merBlock)                                                       
                                                                 
 global_average_pooling1d (G  (None, 32)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dropout_2 (Dropout)         (None, 32)                0     

In [9]:
first_1_million_data = pd.read_csv(FILE_PATH + "first 1 million.csv")
print(first_1_million_data.info())
first_1_million_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999999 entries, 0 to 999998
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   categories  999999 non-null  object
 1   text        999999 non-null  object
dtypes: object(2)
memory usage: 15.3+ MB
None


Unnamed: 0,categories,text
0,hep-ph,calcul prompt diphoton product cross section t...
1,math.CO cs.CG,sparsitycertifi graph decomposit describ new a...
2,physics.gen-ph,evolut earthmoon system base dark matter field...
3,math.CO,determin stirl cycl number count unlabel acycl...
4,math.CA math.FA,dyadic lambdaalpha lambdaalpha paper show comp...


In [10]:
NUM = 0
text_data = first_1_million_data["text"][NUM*100_000:NUM*100_000+100_000]
training_mat_file = "training_matrix_" + str(NUM) + ".pkl"

training_matrix_0.pkl


In [11]:
NUM = 0
text_data = first_1_million_data["text"][NUM*100_000:NUM*100_000+100_000]
training_mat_file = "training_matrix_" + str(NUM) + ".pkl"

results = []
for text in tqdm(text_data):
  result = getFeatures(text, tokenizer=tokenizer, model=second_last_layer_model, preprocessed=True, MAX_PAD_LENGTH=210)
  results.append(result)

  0%|                                                | 0/100000 [00:00<?, ?it/s]



  0%|                                     | 1/100000 [00:00<10:44:14,  2.59it/s]



  0%|                                      | 3/100000 [00:00<4:04:54,  6.81it/s]



  0%|                                      | 5/100000 [00:00<2:48:11,  9.91it/s]



  0%|                                      | 7/100000 [00:00<2:18:49, 12.00it/s]



  0%|                                      | 9/100000 [00:00<2:04:10, 13.42it/s]



  0%|                                     | 11/100000 [00:00<1:55:55, 14.38it/s]



  0%|                                     | 13/100000 [00:01<1:56:24, 14.31it/s]



  0%|                                     | 15/100000 [00:01<1:52:56, 14.75it/s]



  0%|                                     | 17/100000 [00:01<1:55:05, 14.48it/s]



  0%|                                     | 19/100000 [00:01<1:55:51, 14.38it/s]



  0%|                                     | 21/100000 [00:01<1:53:04, 14.74it/s]



  0%|                                     | 23/100000 [00:01<1:50:31, 15.08it/s]



  0%|                                     | 25/100000 [00:01<1:52:54, 14.76it/s]



  0%|                                     | 27/100000 [00:02<1:49:43, 15.19it/s]



  0%|                                     | 29/100000 [00:02<1:45:08, 15.85it/s]



  0%|                                     | 31/100000 [00:02<1:42:23, 16.27it/s]



  0%|                                     | 33/100000 [00:02<1:40:25, 16.59it/s]



  0%|                                     | 35/100000 [00:02<1:38:49, 16.86it/s]



  0%|                                     | 37/100000 [00:02<1:36:45, 17.22it/s]



  0%|                                     | 39/100000 [00:02<1:37:30, 17.09it/s]



  0%|                                     | 41/100000 [00:02<1:42:19, 16.28it/s]



  0%|                                     | 43/100000 [00:03<1:42:04, 16.32it/s]



  0%|                                     | 45/100000 [00:03<1:39:03, 16.82it/s]



  0%|                                     | 47/100000 [00:03<1:40:04, 16.65it/s]



  0%|                                     | 49/100000 [00:03<1:43:29, 16.10it/s]



  0%|                                     | 51/100000 [00:03<1:43:32, 16.09it/s]



  0%|                                     | 53/100000 [00:03<1:46:08, 15.69it/s]



  0%|                                     | 55/100000 [00:03<1:42:00, 16.33it/s]



  0%|                                     | 57/100000 [00:03<1:42:05, 16.32it/s]



  0%|                                     | 59/100000 [00:04<1:45:31, 15.78it/s]



  0%|                                     | 61/100000 [00:04<1:47:16, 15.53it/s]



  0%|                                     | 63/100000 [00:04<1:45:04, 15.85it/s]



  0%|                                     | 65/100000 [00:04<1:41:21, 16.43it/s]



  0%|                                     | 67/100000 [00:04<1:38:09, 16.97it/s]



  0%|                                     | 69/100000 [00:04<1:37:48, 17.03it/s]



  0%|                                     | 71/100000 [00:04<1:36:36, 17.24it/s]



  0%|                                     | 73/100000 [00:04<1:36:45, 17.21it/s]



  0%|                                     | 75/100000 [00:04<1:39:17, 16.77it/s]



  0%|                                     | 77/100000 [00:05<1:37:23, 17.10it/s]



  0%|                                     | 79/100000 [00:05<1:38:53, 16.84it/s]



  0%|                                     | 81/100000 [00:05<1:36:58, 17.17it/s]



  0%|                                     | 83/100000 [00:05<1:38:27, 16.91it/s]



  0%|                                     | 85/100000 [00:05<1:38:21, 16.93it/s]



  0%|                                     | 87/100000 [00:05<1:36:41, 17.22it/s]



  0%|                                     | 89/100000 [00:05<1:37:30, 17.08it/s]



  0%|                                     | 91/100000 [00:05<1:35:45, 17.39it/s]



  0%|                                     | 93/100000 [00:05<1:34:21, 17.65it/s]



  0%|                                     | 95/100000 [00:06<1:35:54, 17.36it/s]



  0%|                                     | 97/100000 [00:06<1:36:00, 17.34it/s]



  0%|                                     | 99/100000 [00:06<1:41:26, 16.41it/s]



  0%|                                    | 101/100000 [00:06<1:39:08, 16.80it/s]



  0%|                                    | 103/100000 [00:06<1:37:34, 17.06it/s]



  0%|                                    | 105/100000 [00:06<1:36:27, 17.26it/s]



  0%|                                    | 107/100000 [00:06<1:36:21, 17.28it/s]



  0%|                                    | 109/100000 [00:06<1:36:33, 17.24it/s]



  0%|                                    | 111/100000 [00:07<1:36:03, 17.33it/s]



  0%|                                    | 113/100000 [00:07<1:35:26, 17.44it/s]



  0%|                                    | 115/100000 [00:07<1:34:13, 17.67it/s]



  0%|                                    | 117/100000 [00:07<1:34:21, 17.64it/s]



  0%|                                    | 119/100000 [00:07<1:35:17, 17.47it/s]



  0%|                                    | 121/100000 [00:07<1:33:56, 17.72it/s]



  0%|                                    | 123/100000 [00:07<1:33:15, 17.85it/s]



  0%|                                    | 125/100000 [00:07<1:34:12, 17.67it/s]



  0%|                                    | 127/100000 [00:07<1:45:48, 15.73it/s]



  0%|                                    | 129/100000 [00:08<1:45:30, 15.78it/s]



  0%|                                    | 131/100000 [00:08<1:43:02, 16.15it/s]



  0%|                                    | 133/100000 [00:08<1:39:21, 16.75it/s]



  0%|                                    | 135/100000 [00:08<1:37:08, 17.13it/s]



  0%|                                    | 137/100000 [00:08<1:42:52, 16.18it/s]



  0%|                                    | 139/100000 [00:08<1:50:32, 15.06it/s]



  0%|                                    | 141/100000 [00:08<1:46:21, 15.65it/s]



  0%|                                    | 143/100000 [00:08<1:42:16, 16.27it/s]



  0%|                                    | 145/100000 [00:09<1:39:03, 16.80it/s]



  0%|                                    | 147/100000 [00:09<1:38:58, 16.82it/s]



  0%|                                    | 149/100000 [00:09<1:36:04, 17.32it/s]



  0%|                                    | 151/100000 [00:09<1:34:44, 17.56it/s]



  0%|                                    | 153/100000 [00:09<1:33:11, 17.86it/s]



  0%|                                    | 155/100000 [00:09<1:32:40, 17.96it/s]



  0%|                                    | 157/100000 [00:09<1:33:10, 17.86it/s]



  0%|                                    | 159/100000 [00:09<1:32:13, 18.04it/s]



  0%|                                    | 161/100000 [00:09<1:31:36, 18.17it/s]



  0%|                                    | 163/100000 [00:10<1:30:57, 18.29it/s]



  0%|                                    | 165/100000 [00:10<1:37:01, 17.15it/s]



  0%|                                    | 167/100000 [00:10<1:39:31, 16.72it/s]



  0%|                                    | 169/100000 [00:10<1:38:37, 16.87it/s]



  0%|                                    | 171/100000 [00:10<1:36:50, 17.18it/s]



  0%|                                    | 173/100000 [00:10<1:39:00, 16.80it/s]



  0%|                                    | 175/100000 [00:10<1:39:32, 16.71it/s]



  0%|                                    | 177/100000 [00:10<1:42:17, 16.27it/s]



  0%|                                    | 179/100000 [00:11<1:41:35, 16.38it/s]



  0%|                                    | 181/100000 [00:11<1:40:41, 16.52it/s]



  0%|                                    | 183/100000 [00:11<1:40:55, 16.48it/s]



  0%|                                    | 185/100000 [00:11<1:42:07, 16.29it/s]



  0%|                                    | 187/100000 [00:11<1:45:21, 15.79it/s]



  0%|                                    | 189/100000 [00:11<1:41:21, 16.41it/s]



  0%|                                    | 191/100000 [00:11<1:39:23, 16.74it/s]



  0%|                                    | 193/100000 [00:11<1:41:59, 16.31it/s]



  0%|                                    | 195/100000 [00:12<1:45:00, 15.84it/s]



  0%|                                    | 197/100000 [00:12<1:41:46, 16.34it/s]



  0%|                                    | 199/100000 [00:12<1:39:33, 16.71it/s]



  0%|                                    | 201/100000 [00:12<1:37:57, 16.98it/s]



  0%|                                    | 203/100000 [00:12<1:39:04, 16.79it/s]



  0%|                                    | 205/100000 [00:12<1:36:52, 17.17it/s]



  0%|                                    | 207/100000 [00:12<1:36:39, 17.21it/s]



  0%|                                    | 209/100000 [00:12<1:34:45, 17.55it/s]



  0%|                                    | 211/100000 [00:12<1:35:23, 17.43it/s]



  0%|                                    | 213/100000 [00:13<1:37:43, 17.02it/s]



  0%|                                    | 215/100000 [00:13<1:35:56, 17.33it/s]



  0%|                                    | 217/100000 [00:13<1:35:45, 17.37it/s]



  0%|                                    | 219/100000 [00:13<1:38:20, 16.91it/s]



  0%|                                    | 221/100000 [00:13<1:45:17, 15.79it/s]



  0%|                                    | 223/100000 [00:13<1:45:22, 15.78it/s]



  0%|                                    | 225/100000 [00:13<1:43:37, 16.05it/s]



  0%|                                    | 227/100000 [00:13<1:41:59, 16.31it/s]



  0%|                                    | 229/100000 [00:14<1:40:05, 16.61it/s]



  0%|                                    | 231/100000 [00:14<1:39:54, 16.64it/s]



  0%|                                    | 233/100000 [00:14<1:40:20, 16.57it/s]



  0%|                                    | 235/100000 [00:14<1:38:37, 16.86it/s]



  0%|                                    | 237/100000 [00:14<1:39:29, 16.71it/s]



  0%|                                    | 239/100000 [00:14<1:37:16, 17.09it/s]



  0%|                                    | 241/100000 [00:14<1:36:07, 17.30it/s]



  0%|                                    | 243/100000 [00:14<1:37:51, 16.99it/s]



  0%|                                    | 245/100000 [00:14<1:36:36, 17.21it/s]



  0%|                                    | 247/100000 [00:15<1:35:34, 17.40it/s]



  0%|                                    | 249/100000 [00:15<1:34:06, 17.67it/s]



  0%|                                    | 251/100000 [00:15<1:36:31, 17.22it/s]



  0%|                                    | 253/100000 [00:15<1:36:45, 17.18it/s]



  0%|                                    | 255/100000 [00:15<1:35:56, 17.33it/s]



  0%|                                    | 257/100000 [00:15<1:35:21, 17.43it/s]



  0%|                                    | 259/100000 [00:15<1:34:50, 17.53it/s]



  0%|                                    | 261/100000 [00:15<1:35:26, 17.42it/s]



  0%|                                    | 263/100000 [00:16<1:36:23, 17.24it/s]



  0%|                                    | 265/100000 [00:16<1:35:54, 17.33it/s]



  0%|                                    | 267/100000 [00:16<1:36:09, 17.29it/s]



  0%|                                    | 269/100000 [00:16<1:35:08, 17.47it/s]



  0%|                                    | 271/100000 [00:16<1:37:35, 17.03it/s]



  0%|                                    | 273/100000 [00:16<1:37:29, 17.05it/s]



  0%|                                    | 275/100000 [00:16<1:38:26, 16.88it/s]



  0%|                                    | 277/100000 [00:16<1:35:25, 17.42it/s]



  0%|                                    | 279/100000 [00:16<1:35:36, 17.38it/s]



  0%|                                    | 281/100000 [00:17<1:40:12, 16.58it/s]



  0%|                                    | 283/100000 [00:17<1:45:33, 15.74it/s]



  0%|                                    | 285/100000 [00:17<1:43:07, 16.12it/s]



  0%|                                    | 287/100000 [00:17<1:40:55, 16.47it/s]



  0%|                                    | 289/100000 [00:17<1:39:23, 16.72it/s]



  0%|                                    | 291/100000 [00:17<1:41:00, 16.45it/s]



  0%|                                    | 293/100000 [00:17<2:02:38, 13.55it/s]



  0%|                                    | 295/100000 [00:18<2:12:46, 12.52it/s]



  0%|                                    | 297/100000 [00:18<2:02:02, 13.62it/s]



  0%|                                    | 299/100000 [00:18<1:54:38, 14.50it/s]



  0%|                                    | 301/100000 [00:18<1:47:30, 15.46it/s]



  0%|                                    | 303/100000 [00:18<1:42:37, 16.19it/s]



  0%|                                    | 305/100000 [00:18<1:43:45, 16.01it/s]



  0%|                                    | 307/100000 [00:18<1:43:10, 16.10it/s]



  0%|                                    | 309/100000 [00:18<1:44:07, 15.96it/s]



  0%|                                    | 311/100000 [00:19<1:42:08, 16.27it/s]



  0%|                                    | 313/100000 [00:19<1:40:08, 16.59it/s]



  0%|                                    | 315/100000 [00:19<1:38:50, 16.81it/s]



  0%|                                    | 317/100000 [00:19<1:41:58, 16.29it/s]


KeyboardInterrupt: 

In [None]:
resulting_matrix = np.vstack(training_matrix)
resulting_matrix.shape

In [53]:
with open(FILE_PATH + training_mat_file, 'wb') as handle:
  pickle.dump(resulting_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)

(10, 256)