In [1]:
# Make Google Drive available inside the Colab VM; all artifacts are read
# from and written to paths below this mount point.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!pip install tensorflow-addons
!pip install Keras-Preprocessing

Collecting tensorflow-addons
  Downloading tensorflow_addons-0.21.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (612 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/612.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m612.1/612.1 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.7 (from tensorflow-addons)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow-addons
Successfully installed tensorflow-addons-0.21.0 typeguard-2.13.3
Collecting Keras-Preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Keras-Preprocessing
Successfully installed Keras-Preprocessing-1.1.2


In [3]:
# Data handling and (de)serialization.
import pandas as pd
import pickle
import numpy as np

# Text preprocessing helpers (NLTK). The stopword corpus is requested every
# time this cell runs.
# NOTE(review): `ps` and `stop_word_collection` are not referenced in the
# visible cells of this notebook -- confirm whether they are still needed.
import nltk
import re
nltk.download("stopwords")
from nltk.stem import PorterStemmer
ps = PorterStemmer()
from nltk.corpus import stopwords
stop_word_collection = stopwords.words('english')
import string
from tqdm import tqdm

# Model stack: TensorFlow / Keras plus TensorFlow Addons (used for the
# HammingLoss metric when the saved model is reloaded).
import tensorflow as tf
import keras
import tensorflow_addons as tfa
from keras.models import load_model
from tensorflow.keras import layers
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

import contextlib
import io

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [4]:
class TransformerBlock(layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalization.

    Args:
        embed_dim: Output width of the feed-forward network; it is also passed
            as the per-head ``key_dim`` of the attention layer. Because of the
            residual additions, the last axis of the input is expected to have
            this size.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate used after the attention and feed-forward sub-layers.
        **kwargs: Standard ``Layer`` keyword arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Remember the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        # Sub-layers are created in the same order as before so that weights
        # saved from earlier versions of this class still line up.
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: Tensor used as both query and value of the self-attention.
            training: Forwarded to the dropout layers. ``None`` (the default)
                lets Keras decide, so the layer can also be called without
                passing the flag explicitly.

        Returns:
            The layer-normalized output of the feed-forward sub-layer.
        """
        # Self-attention sub-layer with residual connection.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        # Position-wise feed-forward sub-layer with residual connection.
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created on load."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of rows in the position-embedding table; sequences fed
            to the layer should not be longer than this.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Width of both embeddings.
        **kwargs: Standard ``Layer`` keyword arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Remember the constructor arguments so get_config() can report them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids ``x`` and add the embedding of each position.

        Args:
            x: Tensor of token ids; the sequence runs along the last axis.

        Returns:
            Token embeddings plus position embeddings (the latter are
            broadcast over the leading axes).
        """
        # Use the runtime sequence length rather than the constructor's maxlen,
        # so the position indices always match the actual input.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created on load."""
        config = super().get_config()
        config.update(
            {
                "maxlen": self.maxlen,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config


In [5]:
FILE_PATH = "/content/drive/MyDrive/Colab Notebooks/Arxiv Topic Classification/first 1 million/"
MAX_PAD_LENGTH = 210

# NOTE(security): pickle.load can execute arbitrary code while unpickling.
# Only load artifacts that were produced by this project.
with open(FILE_PATH + 'tokenizer.pkl', 'rb') as handle:
  tokenizer = pickle.load(handle)

with open(FILE_PATH + '1_million_data_tokenized.pkl', 'rb') as handle:
  text_padded = pickle.load(handle)

# Fail early, with a readable message, if the cached matrix was padded to a
# different length than the one this notebook is configured for.
assert text_padded.shape[-1] == MAX_PAD_LENGTH, (
    f"expected sequences padded to {MAX_PAD_LENGTH}, got {text_padded.shape[-1]}"
)

model_path = FILE_PATH + "model2.h5"
# The custom layers and the TFA metric must be supplied so the saved model
# can be deserialized.
custom_objects = {"TransformerBlock": TransformerBlock,
                  "TokenAndPositionEmbedding": TokenAndPositionEmbedding,
                  "HammingLoss" : tfa.metrics.HammingLoss(mode='multilabel')}
loaded_model = load_model(model_path, custom_objects=custom_objects)

# Build a feature extractor that shares the loaded model's input and stops at
# an intermediate layer. Note that the index below selects the THIRD layer
# from the end; the variable name `second_last_layer_model` is kept because
# later cells refer to it.
FEATURE_LAYER_INDEX = -3
second_last_layer_model = keras.Model(
    inputs=loaded_model.input,
    outputs=loaded_model.layers[FEATURE_LAYER_INDEX].output,
)
second_last_layer_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 210)]             0         
                                                                 
 token_and_position_embeddin  (None, 210, 32)          806720    
 g (TokenAndPositionEmbeddin                                     
 g)                                                              
                                                                 
 transformer_block (Transfor  (None, 210, 32)          10656     
 merBlock)                                                       
                                                                 
 global_average_pooling1d (G  (None, 32)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dropout_2 (Dropout)         (None, 32)                0     

In [7]:
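# Disabled: loads the raw CSV. It is only needed by the (also disabled)
# tokenization cell, because the tokenized matrix is loaded from a pickle.
# %%time
# first_1_million_data = pd.read_csv(FILE_PATH + "first 1 million.csv")
# print(first_1_million_data.info())
# first_1_million_data.head()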

In [8]:
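# Disabled: tokenizes and pads the raw text into `text_padded`. An earlier
# cell loads `text_padded` from '1_million_data_tokenized.pkl' instead
# (presumably a cached result of this step -- confirm before re-enabling).
# %%time
# X = first_1_million_data["text"]
# text_sequence = tokenizer.texts_to_sequences(X)
# text_padded = pad_sequences(text_sequence, maxlen = MAX_PAD_LENGTH, padding = "post", truncating = "post")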

In [9]:
# Sanity check: container type and dimensions of the loaded matrix.
(type(text_padded), text_padded.shape)

(numpy.ndarray, (999999, 210))

In [10]:
# Peek at the first ten rows of the loaded matrix.
text_padded[:10]

array([[ 103, 2535, 4041, ...,    0,    0,    0],
       [ 118,  819,  134, ...,    0,    0,    0],
       [ 198,    4,   19, ...,    0,    0,    0],
       ...,
       [ 139,   58,  822, ...,    0,    0,    0],
       [2111,  377,   51, ...,    0,    0,    0],
       [ 442, 2504,   27, ...,    0,    0,    0]], dtype=int32)

In [11]:
# Run every padded sequence through the truncated model to obtain one
# feature vector per row.
result = second_last_layer_model.predict(x=text_padded)



In [12]:
# Dimensions of the extracted feature matrix.
np.shape(result)

(999999, 256)

In [13]:
# Persist the extracted features next to the other artifacts.
training_matrix_path = FILE_PATH + "training_matrix.pkl"
with open(training_matrix_path, 'wb') as out_file:
  pickle.dump(result, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
# Feature vector of the first row.
result[0, :]

array([7.0664340e-01, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       1.5395887e+00, 6.2673742e-01, 4.0908137e-01, 0.0000000e+00,
       0.0000000e+00, 5.6090295e-01, 0.0000000e+00, 3.4681901e-01,
       1.9824886e+00, 0.0000000e+00, 1.6002525e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 8.2170612e-01, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 1.9631212e+00, 0.0000000e+00,
       2.5378770e-01, 0.0000000e+00, 4.4007987e-01, 7.2128522e-01,
       9.3962312e-01, 4.4063973e-01, 6.2789693e-02, 0.0000000e+00,
       0.0000000e+00, 1.3168695e+00, 0.0000000e+00, 7.5801082e-02,
       0.0000000e+00, 7.5998110e-01, 2.7897589e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       2.2241051e-01, 5.5359441e-01, 0.0000000e+00, 1.2722372e+00,
       1.9652365e+00, 8.4403473e-01, 0.0000000e+00, 0.0000000e+00,
       4.0458524e-01, 4.3253946e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 1.0995998e+00, 8.7778348e-01, 5.1723379e