In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# !pip install tensorflow-addons
# !pip install Keras-Preprocessing

In [3]:
import pandas as pd
import pickle
import numpy as np

import nltk
import re
nltk.download("stopwords")
from nltk.stem import PorterStemmer
ps = PorterStemmer()
from nltk.corpus import stopwords
stop_word_collection = stopwords.words('english')
import string
from tqdm import tqdm

import tensorflow as tf
import keras
import tensorflow_addons as tfa
from keras.models import load_model
from tensorflow.keras import layers
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

import contextlib
import io

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nirajan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2023-07-19 19:46:14.830309: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-19 19:46:15.041097: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-19 19:46:15.043394: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlo

In [4]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

def getFeatures(text, tokenizer, model, preprocessed = False, MAX_PAD_LENGTH = 210):
    """
    Input:
    text: input text
    tokenizer: word tokenizer for the text
    preprocessed: Whether the input data is already processed or not

    Output:
    top_k no. of labels along with their corresponding probabilities
    """
    text_sequence = tokenizer.texts_to_sequences([text])
    text_padded = pad_sequences(text_sequence, maxlen = MAX_PAD_LENGTH, padding = "post", truncating = "post")
    output = model.predict(text_padded)
    output = output.flatten()
    # return the sigmoid of the output layer
    return 1 / (1 + np.exp(-output))


In [5]:
FILE_PATH = ""
MAX_PAD_LENGTH = 210

with open(FILE_PATH + 'tokenizer.pkl', 'rb') as handle:
  tokenizer = pickle.load(handle)

model_path = FILE_PATH + "model2.h5"
custom_objects = {"TransformerBlock": TransformerBlock,
                  "TokenAndPositionEmbedding": TokenAndPositionEmbedding,
                  "HammingLoss" : tfa.metrics.HammingLoss(mode='multilabel')}
loaded_model = load_model(model_path, custom_objects=custom_objects)

# Create a new model that takes the input and outputs from the second last layer
second_last_layer_model = keras.Model(inputs=loaded_model.input, outputs=loaded_model.layers[-3].output)
second_last_layer_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 210)]             0         
                                                                 
 token_and_position_embeddin  (None, 210, 32)          806720    
 g (TokenAndPositionEmbeddin                                     
 g)                                                              
                                                                 
 transformer_block (Transfor  (None, 210, 32)          10656     
 merBlock)                                                       
                                                                 
 global_average_pooling1d (G  (None, 32)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dropout_2 (Dropout)         (None, 32)                0     

In [6]:
%%time
first_1_million_data = pd.read_csv(FILE_PATH + "first 1 million.csv")
print(first_1_million_data.info())
first_1_million_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999999 entries, 0 to 999998
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   categories  999999 non-null  object
 1   text        999999 non-null  object
dtypes: object(2)
memory usage: 15.3+ MB
None
CPU times: user 18.2 s, sys: 1.6 s, total: 19.8 s
Wall time: 19.8 s


Unnamed: 0,categories,text
0,hep-ph,calcul prompt diphoton product cross section t...
1,math.CO cs.CG,sparsitycertifi graph decomposit describ new a...
2,physics.gen-ph,evolut earthmoon system base dark matter field...
3,math.CO,determin stirl cycl number count unlabel acycl...
4,math.CA math.FA,dyadic lambdaalpha lambdaalpha paper show comp...


In [7]:
%%time
X = first_1_million_data["text"]
text_sequence = tokenizer.texts_to_sequences(X)
text_padded = pad_sequences(text_sequence, maxlen = MAX_PAD_LENGTH, padding = "post", truncating = "post")

CPU times: user 3min 12s, sys: 5.03 s, total: 3min 18s
Wall time: 3min 18s


In [10]:
with open(FILE_PATH + "1_million_data_tokenized.pkl", 'wb') as handle:
  pickle.dump(text_padded, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:
result = second_last_layer_model.predict(text_padded)

2023-07-19 19:50:47.317613: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 839999160 exceeds 10% of free system memory.


  145/31250 [..............................] - ETA: 47:29

KeyboardInterrupt: 

In [None]:
training_matrix = np.array(results)
resulting_matrix = np.vstack(training_matrix)
resulting_matrix.shape

In [None]:
with open(FILE_PATH + training_mat_file, 'wb') as handle:
  pickle.dump(resulting_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)