In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install tensorflow-addons
!pip install Keras-Preprocessing

Collecting tensorflow-addons
  Downloading tensorflow_addons-0.21.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (612 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/612.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━[0m [32m450.6/612.1 kB[0m [31m13.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m612.1/612.1 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.7 (from tensorflow-addons)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow-addons
Successfully installed tensorflow-addons-0.21.0 typeguard-2.13.3
Collecting Keras-Preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages:

In [3]:
import pandas as pd
import pickle
import numpy as np

import nltk
import re
nltk.download("stopwords")
from nltk.stem import PorterStemmer
ps = PorterStemmer()
from nltk.corpus import stopwords
stop_word_collection = stopwords.words('english')
import string
from tqdm import tqdm

import tensorflow as tf
import keras
import tensorflow_addons as tfa
from keras.models import load_model
from tensorflow.keras import layers
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

import contextlib
import io

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [4]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions


In [6]:
FILE_PATH = "/content/drive/MyDrive/Colab Notebooks/Arxiv Topic Classification/first 1 million/"

MODEL_PATH = "/content/drive/MyDrive/Colab Notebooks/Arxiv Topic Classification/first 1 million/Model Data/"

MAX_PAD_LENGTH = 210

with open(FILE_PATH + 'tokenizer.pkl', 'rb') as handle:
  tokenizer = pickle.load(handle)

## Only For training purpose
# with open(FILE_PATH + '1_million_data_tokenized.pkl', 'rb') as handle:
#   text_padded = pickle.load(handle)

model_path = MODEL_PATH + "model2.h5"
custom_objects = {"TransformerBlock": TransformerBlock,
                  "TokenAndPositionEmbedding": TokenAndPositionEmbedding,
                  "HammingLoss" : tfa.metrics.HammingLoss(mode='multilabel')}
loaded_model = load_model(model_path, custom_objects=custom_objects)

# Create a new model that takes the input and outputs from the second last layer
second_last_layer_model = keras.Model(inputs=loaded_model.input, outputs=loaded_model.layers[-3].output)
second_last_layer_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 210)]             0         
                                                                 
 token_and_position_embeddin  (None, 210, 32)          806720    
 g_1 (TokenAndPositionEmbedd                                     
 ing)                                                            
                                                                 
 transformer_block_1 (Transf  (None, 210, 32)          10656     
 ormerBlock)                                                     
                                                                 
 global_average_pooling1d (G  (None, 32)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dropout_2 (Dropout)         (None, 32)                0     

## Recommender System Training Data

In [None]:
# %%time
# first_1_million_data = pd.read_csv(FILE_PATH + "first 1 million.csv")
# print(first_1_million_data.info())
# first_1_million_data.head()

In [None]:
# %%time
# X = first_1_million_data["text"]
# text_sequence = tokenizer.texts_to_sequences(X)
# text_padded = pad_sequences(text_sequence, maxlen = MAX_PAD_LENGTH, padding = "post", truncating = "post")

In [None]:
type(text_padded), text_padded.shape

(numpy.ndarray, (999999, 210))

In [None]:
text_padded[0:10]

array([[ 103, 2535, 4041, ...,    0,    0,    0],
       [ 118,  819,  134, ...,    0,    0,    0],
       [ 198,    4,   19, ...,    0,    0,    0],
       ...,
       [ 139,   58,  822, ...,    0,    0,    0],
       [2111,  377,   51, ...,    0,    0,    0],
       [ 442, 2504,   27, ...,    0,    0,    0]], dtype=int32)

In [None]:
result = second_last_layer_model.predict(text_padded)



In [None]:
result.shape

(999999, 256)

In [None]:
with open(FILE_PATH + "training_matrix.pkl", 'wb') as handle:
  pickle.dump(result, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [24]:
# result[0]

## Building Recommendation System

In [27]:
PROJECT_PATH = "/content/drive/MyDrive/Colab Notebooks/Arxiv Topic Classification/"
DATABASE_PATH = "original_first_1_million.csv"
database = pd.read_csv(PROJECT_PATH + DATABASE_PATH)
print(database.info())
database.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999999 entries, 0 to 999998
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id          999999 non-null  float64
 1   submitter   999904 non-null  object 
 2   authors     999999 non-null  object 
 3   title       999999 non-null  object 
 4   comments    781295 non-null  object 
 5   doi         548198 non-null  object 
 6   abstract    999999 non-null  object 
 7   date        999999 non-null  object 
 8   categories  999999 non-null  object 
dtypes: float64(1), object(8)
memory usage: 68.7+ MB
None


Unnamed: 0,id,submitter,authors,title,comments,doi,abstract,date,categories
0,704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version",10.1103/PhysRevD.76.013009,A fully differential calculation in perturba...,2008-11-26,hep-ph
1,704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,"We describe a new algorithm, the $(k,\ell)$-...",2008-12-13,math.CO cs.CG
2,704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,The evolution of Earth-Moon system is descri...,2008-01-13,physics.gen-ph
3,704.0004,David Callan,David Callan,A determinant of Stirling cycle numbers counts...,11 pages,,We show that a determinant of Stirling cycle...,2007-05-23,math.CO
4,704.0005,Alberto Torchinsky,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,,In this paper we show how to compute the $\L...,2013-10-15,math.CA math.FA


In [28]:
TRAINING_MAT_PATH = "/content/drive/MyDrive/Colab Notebooks/Arxiv Topic Classification/first 1 million/Recommender/training_matrix.pkl"
with open(TRAINING_MAT_PATH, 'rb') as handle:
  training_matrix = pickle.load(handle)

In [29]:
def text_preprocess(text):
  # Remove all punctuations
  text = ''.join(c for c in text if c not in string.punctuation)

  # Remove all numbers and words containing numbers
  text = re.sub(r'\w*\d\w*', ' ', text).strip()

  # Changes to lower case
  text = text.lower()

  # Remove all stop words
  text = ' '. join(word for word in text.split() if word not in stop_word_collection)

  # Stemming of all words
  text = [ps.stem(word) for word in text.split()]
  text = ' '.join(text)
  return text


def getFeatures(text, tokenizer, model, preprocessed = False, MAX_PAD_LENGTH = 210):
  """
  Input:
    text: input text
    tokenizer: word tokenizer for the text
    preprocessed: Whether the input data is already processed or not

  Output:
    top_k no. of labels along with their corresponding probabilities
  """
  if not preprocessed:
    text = text_preprocess(text)
  text_sequence = tokenizer.texts_to_sequences([text])
  text_padded = pad_sequences(text_sequence, maxlen = MAX_PAD_LENGTH, padding = "post", truncating = "post")
  output = model.predict(text_padded)
  output = output.flatten()
  return output


def cosine_similarity(input_data, matrix):
    # Calculate the dot product between the input data and the matrix
    dot_product = np.dot(input_data, matrix.T)

    # Calculate the norms of the input data and each column of the matrix
    input_norm = np.linalg.norm(input_data)
    matrix_norms = np.linalg.norm(matrix, axis=1)

    # Calculate the cosine similarity using broadcasting
    similarity = dot_product / (input_norm * matrix_norms + 1e-10)  # Adding a small value to avoid division by zero
    return similarity



In [34]:
text = """Quantum secure metrology protocols harness quantum effects to probe remote systems with enhanced precision and security. Traditional QSM protocols require multi-partite entanglement, which limits its near-term implementation due to technological constraints. This paper proposes a QSM scheme that employs Bell pairs to provide unconditional security while offering precision scaling beyond the standard quantum limit. We provide a detailed comparative performance analysis of our proposal under multiple attacks. We found that the employed controlled encoding strategy is far better than the parallel encoding of multi-partite entangled states with regard to the secrecy of the parameter. We also identify and characterize an intrinsic trade-off relationship between the maximum achievable precision and security under the limited availability of resources. The dynamic scalability of the proposed protocol makes it suitable for large-scale network sensing scenarios."""

input_features = getFeatures(text, tokenizer=tokenizer, model=second_last_layer_model, preprocessed = False, MAX_PAD_LENGTH = 210)

similarity_scores = cosine_similarity(input_features, training_matrix)

print(similarity_scores)
k = 3
indices_of_top_k = np.argsort(-similarity_scores.flatten())[:k]
print(indices_of_top_k)
database.iloc[indices_of_top_k]


[0.38333166 0.3229128  0.26520443 ... 0.4952625  0.45745072 0.25854546]
[753675 657270 250449]


Unnamed: 0,id,submitter,authors,title,comments,doi,abstract,date,categories
753675,1607.06124,Claudia Benedetti,"D. Tamascelli, C. Benedetti, S. Olivares, M. G...",Characterization of qubit chains by Feynman pr...,"8 pages, 5 figures",10.1103/PhysRevA.94.042129,We address the characterization of qubit cha...,2016-11-17,quant-ph
657270,1509.02631,Fumitaka Nakamura,"Fumitaka Nakamura (NAOJ), Hideo Ogawa (Osaka P...",Z45: A New 45-GHz Band Dual-Polarization HEMT ...,"32 pages, 12 figures, accepted by PASJ",10.1093/pasj/psv088,We developed a dual-linear-polarization HEMT...,2015-10-14,astro-ph.IM astro-ph.GA astro-ph.SR
250449,1103.3181,Farhad Jafarpour Hamadani,"Vahid Fayaz, Farhad H. Jafarpour, Seyedeh Razi...",One-transit paths and steady-state of a non-eq...,"8 pages, 2 figures",10.1088/1742-5468/2010/12/P12009,We have shown that the partition function of...,2011-03-17,cond-mat.stat-mech cond-mat.dis-nn


In [35]:
text = """A review is provided here about the thermal effects on optical chirality. To this goal, chiral objects dispersed in an embedding fluid are examined for their magnetoelectric coupling. Thermal effects on several chiral meta-atoms and their ensembles are examined. To this goal, DNA-like helical structures are examined in detail. The mechanical aspect of thermo-elasticity is reviewed along with transverse deformations while drawing analogies from condensed-matter physics. In this respect, the chirality-induced spin selection is reviewed along with the temperature-mediated electron–phonon interactions. A wide range of materials, such as polymers and biological cells, are also examined for temperature effects. A transition temperature delineating a sign flip in the chirality parameter is identified as well. Chirality-associated functionalities such as ratchet motions, switching, and modulations are investigated for their respective thermal effects. Issues of fabricating chiral meta-atoms are also discussed.
"""
input_features = getFeatures(text, tokenizer=tokenizer, model=second_last_layer_model, preprocessed = False, MAX_PAD_LENGTH = 210)

similarity_scores = cosine_similarity(input_features, training_matrix)

print(similarity_scores)
k = 3
indices_of_top_k = np.argsort(-similarity_scores.flatten())[:k]
print(indices_of_top_k)
database.iloc[indices_of_top_k]

[0.53613585 0.1842106  0.6398513  ... 0.81467605 0.22573382 0.50131905]
[103796 516331 104652]


Unnamed: 0,id,submitter,authors,title,comments,doi,abstract,date,categories
103796,901.2554,Stanley Haan,"S.L. Haan, Z.S. Smith, K.N. Shomsky and P.W. P...",Electron Drift Directions in Strong-Field Doub...,"12 pages, 9 figures; submitted to J.Phys.B on ...",10.1088/0953-4075/42/13/134009,Longitudinal momentum spectra and electron d...,2015-05-13,physics.atom-ph
516331,1404.3278,Nuno Freitas,Nuno Freitas and Panagiotis Tsaknias,Criteria for p-ordinarity of families of ellip...,6 pages,,Let $K_i$ be a number field for all $i \in \...,2014-04-15,math.NT
104652,901.341,Yanwei Ma,"Yanpeng Qi, Xianping Zhang, Zhaoshun Gao, Zhiy...",Superconductivity of powder-in-tube Sr0.6K0.4F...,"15 pages, 6 figures",10.1016/j.physc.2009.03.008,Nb-sheathed Sr0.6K0.4Fe2As2 superconducting ...,2015-05-13,cond-mat.supr-con


In [36]:
text = """In recent years, there has been considerable interest in
graph structures arising in technological, sociological, and
scientific settings: computer networks (routers or autonomous
systems connected together); networks of users exchanging
e-mail or instant messages; citation networks and hyperlink
networks; social networks (who-trusts-whom, who-talks-towhom, and so forth); and countless more [24]. The study
of such networks has proceeded along two related tracks:
the measurement of large network datasets, and the development of random graph models that approximate the observed properties"""

input_features = getFeatures(text, tokenizer=tokenizer, model=second_last_layer_model, preprocessed = False, MAX_PAD_LENGTH = 210)

similarity_scores = cosine_similarity(input_features, training_matrix)

print(similarity_scores)
k = 3
indices_of_top_k = np.argsort(-similarity_scores.flatten())[:k]
print(indices_of_top_k)
database.iloc[indices_of_top_k]

[0.20142011 0.62319916 0.2776809  ... 0.27062237 0.24729116 0.10110712]
[305118 911103 400676]


Unnamed: 0,id,submitter,authors,title,comments,doi,abstract,date,categories
305118,1111.6882,Alexey Nikulov,"V. L. Gurtovoi, A. A. Burlakov, A. V. Nikulov,...",Multiple Current States of Two Phase-Coupled S...,"5 pages, 5 figures",10.1134/S1063776111140160,The states of two phase-coupled superconduct...,2015-06-03,cond-mat.supr-con
911103,1711.04093,Guangfeng Dong,Guangfeng Dong and Changjian Liu and Jiazhong ...,On the maximal saddle order of p:-q resonant s...,19 pages,,"In this paper, we obtain some estimations of...",2019-03-18,math.CA
400676,1301.4086,Gideon Schechtman,William B. Johnson and Gideon Schechtman,Subspaces of $L_p$ that embed into $L_p(\mu)$ ...,,,Enflo and Rosenthal proved that $\ell_p(\ale...,2013-01-18,math.FA


In [13]:
similarity_scores.shape

(999999,)

In [14]:
similarity_scores[0]

0.38333166

In [20]:
np.random.seed(42)  # For reproducibility
data = {
    'Column1': np.random.randint(1, 100, 10),
    'Column2': np.random.randn(10),
    'Column3': np.random.choice(['A', 'B', 'C'], 10)
}

# Create the DataFrame
df = pd.DataFrame(data)
df

Unnamed: 0,Column1,Column2,Column3
0,52,0.279041,B
1,93,1.010515,C
2,15,-0.580878,B
3,72,-0.52517,B
4,61,-0.57138,C
5,21,-0.924083,B
6,83,-2.612549,C
7,87,0.95037,C
8,75,0.816445,A
9,75,-1.523876,C


In [21]:
selected_rows = df.iloc[[1, 2, 5, 6]]


In [22]:
print(selected_rows)

   Column1   Column2 Column3
1       93  1.010515       C
2       15 -0.580878       B
5       21 -0.924083       B
6       83 -2.612549       C
