In [1]:
#pip uninstall tensorflow keras tf_keras tensorflow-intel -y

In [2]:
#pip install tensorflow==2.12.0 keras==2.12.0


# Importing Libraries

In [3]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow import keras
from sklearn.model_selection import train_test_split
from ast import literal_eval
import pandas as pd
import numpy as np

# Loading CSV File

In [4]:
arxiv_data = pd.read_csv("arxiv_data.csv")

In [5]:
arxiv_data.head()

Unnamed: 0,titles,summaries,terms
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']"
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']"
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']"
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV']
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","['cs.CV', 'cs.LG']"


# Data Cleaning and Preprocessing

In [6]:
arxiv_data.shape

(51774, 3)

In [7]:
arxiv_data.isnull().sum()

titles       0
summaries    0
terms        0
dtype: int64

In [8]:
arxiv_data.duplicated().sum()

12783

In [9]:
labels_column = arxiv_data['terms'].apply(literal_eval)
labels = labels_column.explode().unique()

In [10]:
print("labels :",labels)
print("Length of labels : " , len(labels))

labels : ['cs.CV' 'cs.LG' 'cs.AI' ... 'I.2.6; I.5.1; G.3'
 '92E10, 46M20, 94A08, 68U10, 44A12, 55R35' '92E10']
Length of labels :  1099


In [11]:
arxiv_data = arxiv_data[arxiv_data['titles'].duplicated()]

In [12]:
arxiv_data.shape

(12802, 3)

In [13]:
print(sum(arxiv_data['terms'].value_counts() == 1))
print(arxiv_data['terms'].nunique())


605
1038


In [14]:
arxiv_data_filtered = arxiv_data.groupby('terms').filter(lambda x : len(x) > 1)
arxiv_data_filtered.shape

(12197, 3)

In [15]:
arxiv_data_filtered['terms'] = arxiv_data_filtered['terms'].apply(lambda x: literal_eval(x))
arxiv_data_filtered['terms'].values[:3]

array([list(['stat.ML', 'cs.CV']), list(['cs.CV', 'cs.AI']),
       list(['cs.CV'])], dtype=object)

# Train Test Split

In [16]:
train_df , test_df = train_test_split(arxiv_data_filtered , test_size = 0.1 , stratify = arxiv_data_filtered['terms'].values ) 

In [17]:
train_df.shape , test_df.shape

((10977, 3), (1220, 3))

In [18]:
val_df = test_df.sample(frac = 0.5)
test_df.drop(val_df.index , inplace = True)

In [19]:
val_df.shape , test_df.shape

((610, 3), (610, 3))

In [20]:
train_df

Unnamed: 0,titles,summaries,terms
22157,Higher-Order Attribute-Enhancing Heterogeneous...,Graph neural networks (GNNs) have been widely ...,"[cs.LG, cs.SI]"
22315,Entity Context Graph: Learning Entity Represen...,Knowledge is captured in the form of entities ...,"[cs.LG, cs.CL, cs.IR]"
35369,Deep Generative Models with Learnable Knowledg...,The broad set of deep generative models (DGMs)...,"[cs.LG, cs.CL, cs.CV, stat.ML]"
38670,Spectral Temporal Graph Neural Network for Tra...,An effective understanding of the contextual e...,"[cs.CV, cs.AI, cs.LG, cs.RO]"
23646,"About Graph Degeneracy, Representation Learnin...",Graphs or networks are a very convenient way t...,"[cs.LG, stat.ML]"
...,...,...,...
45085,Unsupervised Image Noise Modeling with Self-Co...,Noise modeling lies in the heart of many image...,"[cs.CV, eess.IV]"
38732,Towards Efficient Cross-Modal Visual Textual R...,Cross-modal retrieval is an important function...,[cs.CV]
44452,Deep learning for time series classification,Time series analysis is a field of data scienc...,"[cs.LG, cs.AI, stat.ML]"
47776,Free-Lunch Saliency via Attention in Atari Agents,We propose a new approach to visualize salienc...,"[cs.LG, cs.AI, cs.CV]"


In [21]:
terms = tf.ragged.constant(train_df['terms'])
lookup = tf.keras.layers.StringLookup(output_mode = 'multi_hot')
lookup.adapt(terms)
vocab = lookup.get_vocabulary()

In [22]:
sample_label = train_df['terms'].iloc[0]
print(sample_label)
label_binarized = lookup([sample_label])
print(label_binarized)

['cs.LG', 'cs.SI']
tf.Tensor(
[[0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]], shape=(1, 159), dtype=float32)


In [23]:
max_seqlen = 150
batch_size = 120
padding_token = "<pad>"
auto = tf.data.AUTOTUNE

def make_dataset(dataframe, is_train=True):
    # Convert terms column to a tensor that can be passed to the lookup layer
    terms_ragged = tf.ragged.constant(dataframe['terms'].tolist())
    
    # Apply the lookup transformation to each term
    label_binary = lookup(terms_ragged).numpy()  # Returns a multi-hot encoded matrix
    
    # Ensure the shapes align for abstracts and label_binary
    assert len(dataframe['summaries'].values) == label_binary.shape[0], "Mismatch in lengths of abstracts and labels"

    # Create dataset
    dataset = tf.data.Dataset.from_tensor_slices((dataframe['summaries'].values, label_binary))
    dataset = dataset.shuffle(batch_size * 10) if is_train else dataset

    return dataset.batch(batch_size)
    

    
train_dataset = make_dataset(train_df , is_train = True)
val_dataset = make_dataset(val_df , is_train = False)
test_dataset = make_dataset(train_df , is_train = False)


print("Train dataset element spec:", train_dataset.element_spec)
print("Validation dataset element spec:", val_dataset.element_spec)
print("Test dataset element spec:", test_dataset.element_spec)


Train dataset element spec: (TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None, 159), dtype=tf.float32, name=None))
Validation dataset element spec: (TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None, 159), dtype=tf.float32, name=None))
Test dataset element spec: (TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None, 159), dtype=tf.float32, name=None))


In [24]:

def invert_multi_hot(encoded_labels):
    hot_indeces = np.argwhere(encoded_labels == 1.0)[... , 0]
    return np.take(vocab , hot_indeces)

text_batch , label_batch = next(iter(train_dataset))
for i , text in enumerate(text_batch[:5]):
    label = label_batch[i].numpy()[None , ...]
    print(text)
    print(invert_multi_hot(label[0]))

tf.Tensor(b'Modern data acquisition routinely produce massive amounts of event sequence\ndata in various domains, such as social media, healthcare, and financial\nmarkets. These data often exhibit complicated short-term and long-term temporal\ndependencies. However, most of the existing recurrent neural network based\npoint process models fail to capture such dependencies, and yield unreliable\nprediction performance. To address this issue, we propose a Transformer Hawkes\nProcess (THP) model, which leverages the self-attention mechanism to capture\nlong-term dependencies and meanwhile enjoys computational efficiency. Numerical\nexperiments on various datasets show that THP outperforms existing models in\nterms of both likelihood and event prediction accuracy by a notable margin.\nMoreover, THP is quite general and can incorporate additional structural\nknowledge. We provide a concrete example, where THP achieves improved\nprediction performance for learning multiple point processes wh

In [25]:
vocabulary = set()
train_df['summaries'].str.lower().str.split().apply(vocabulary.update)
vocabulary_size = len(vocabulary)

# Text Vectorization

In [26]:
text_vectorizer = layers.TextVectorization(max_tokens = vocabulary_size , ngrams = 2 , output_mode = 'tf_idf')
text_vectorizer.adapt(train_dataset.map(lambda text , label : text))

In [27]:
num_records = tf.data.experimental.cardinality(train_dataset).numpy()
print("Number of records in the training dataset:", num_records)


Number of records in the training dataset: 92


In [28]:
# Check the structure and type of the dataset before mapping
for text_sample, label_sample in train_dataset.take(1):
    print("Text sample type:", type(text_sample))
    print("Label sample type:", type(label_sample))
    print("Text sample:", text_sample.numpy())
    print("Label sample:", label_sample.numpy())

# Apply the text_vectorizer mapping
train_dataset = train_dataset.map(
    lambda features, label: (features, label),
    num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)


# Verify the mapping by checking one batch
for text_batch, label_batch in train_dataset.take(1):
    print("Text batch shape after mapping:", text_batch.shape)
    print("Label batch shape after mapping:", label_batch.shape)


Text sample type: <class 'tensorflow.python.framework.ops.EagerTensor'>
Label sample type: <class 'tensorflow.python.framework.ops.EagerTensor'>
Text sample: [b'The richness in the content of various information networks such as social\nnetworks and communication networks provides the unprecedented potential for\nlearning high-quality expressive representations without external supervision.\nThis paper investigates how to preserve and extract the abundant information\nfrom graph-structured data into embedding space in an unsupervised manner. To\nthis end, we propose a novel concept, Graphical Mutual Information (GMI), to\nmeasure the correlation between input graphs and high-level hidden\nrepresentations. GMI generalizes the idea of conventional mutual information\ncomputations from vector space to the graph domain where measuring mutual\ninformation from two aspects of node features and topological structure is\nindispensable. GMI exhibits several benefits: First, it is invariant to t

# Model Training

In [29]:
for text_batch, label_batch in train_dataset.take(1):
    print("Text sample type:", type(text_batch))
    print("Text sample shape:", text_batch.shape)
    print("Text sample content:", text_batch[0].numpy())


Text sample type: <class 'tensorflow.python.framework.ops.EagerTensor'>
Text sample shape: (120,)
Text sample content: b'Financial markets are a source of non-stationary multidimensional time series\nwhich has been drawing attention for decades. Each financial instrument has its\nspecific changing over time properties, making their analysis a complex task.\nImprovement of understanding and development of methods for financial time\nseries analysis is essential for successful operation on financial markets. In\nthis study we propose a volume-based data pre-processing method for making\nfinancial time series more suitable for machine learning pipelines. We use a\nstatistical approach for assessing the performance of the method. Namely, we\nformally state the hypotheses, set up associated classification tasks, compute\neffect sizes with confidence intervals, and run statistical tests to validate\nthe hypotheses. We additionally assess the trading performance of the proposed\nmethod on his

In [30]:
for x, y in train_dataset.take(1):
    print("Label shape:", y.shape)

Label shape: (120, 159)


In [31]:
import tensorflow as tf
from keras import layers, models
from keras.callbacks import EarlyStopping

# Vectorize text input
text_vectorizer = layers.TextVectorization(max_tokens=20000, ngrams=2, output_mode='tf_idf')
text_vectorizer.adapt(train_dataset.map(lambda x, y: x))  # Adapt to the training dataset

vocab_size = len(text_vectorizer.get_vocabulary())
embedding_dim = 128  # Choose an appropriate embedding dimension

# Ensure model architecture matches input shape expectations
model_1 = models.Sequential([
    text_vectorizer,  # Preprocess text input
    layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=None),
    layers.GlobalAveragePooling1D(),  # Convert 2D tensor to 1D
    layers.Dense(512, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(159, activation='sigmoid')  # Adjust output layer for multi-label classification
])

model_1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])

# Ensure labels remain in their original shape (batch_size, 159)
es = EarlyStopping(patience=5, restore_best_weights=True)
history = model_1.fit(train_dataset, validation_data=val_dataset, epochs=20, callbacks=[es])


Epoch 1/20
 6/92 [>.............................] - ETA: 2:21 - loss: 0.6818 - binary_accuracy: 0.6844

KeyboardInterrupt: 

In [32]:
#model_1.save('Models/model_1', save_format='tf')
import matplotlib.pyplot as plt

def plot_result(item):
    plt.plot(history.history[item], label=item)
    plt.plot(history.history["val_" + item], label="val_" + item)
    plt.xlabel("Epochs")
    plt.ylabel(item)
    plt.title("Train and Validation {} Over Epochs".format(item), fontsize=14)
    plt.legend()
    plt.grid()
    plt.show()


plot_result("loss")
plot_result("binary_accuracy")

NameError: name 'history' is not defined

In [33]:
import pickle
saved_text_vectorizer_config = text_vectorizer.get_config()
with open('Models/text_vectorizer_config.pkl' , 'wb') as f:
    pickle.dump(saved_text_vectorizer_config , f)

with open('Models/vocab.pkl','wb') as f:
    pickle.dump(vocab , f)

In [34]:
import pickle
from tensorflow.keras.layers import TextVectorization

# Assuming `text_vectorizer` is already adapted
text_vectorizer = TextVectorization()
text_vectorizer.adapt(train_dataset.map(lambda x, y: x))

# Save the configuration
with open('Models/text_vectorizer_config.pkl', 'wb') as f:
    pickle.dump(text_vectorizer.get_config(), f)

# Save the weights (vocabulary)
with open('Models/text_vectorizer_weights.pkl', 'wb') as f:
    pickle.dump(text_vectorizer.get_vocabulary(), f)

print("Configuration and weights saved successfully")


Configuration and weights saved successfully


In [39]:
import pickle
from tensorflow.keras.layers import TextVectorization

# Load the configuration
with open('Models/text_vectorizer_config.pkl', 'rb') as f:
    loaded_config = pickle.load(f)

# Create a new TextVectorization layer from the config
loaded_text_vectorizer = TextVectorization.from_config(loaded_config)

# Load the vocabulary and set it in the new layer
with open('Models/text_vectorizer_weights.pkl', 'rb') as f:
    loaded_vocab = pickle.load(f)
    loaded_text_vectorizer.set_vocabulary(loaded_vocab)

print("TextVectorization layer loaded successfully with the vocabulary")
loaded_model = keras.models.load_model("Models/model_1")

TextVectorization layer loaded successfully with the vocabulary


In [36]:
_ , acc1 = model_1.evaluate(test_dataset)
_ , acc1 = model_1.evaluate(val_dataset)



In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

def invert_multi_hot(encoded_labels):
    """Reverse a single multi-hot encoded label to a tuple of vocab terms."""
    try:
        hot_indices = np.argwhere(encoded_labels == 1.0)[..., 0]
        if hot_indices.size == 0:
            print("Warning: No active indices found in encoded labels.")
        return np.take(loaded_vocab, hot_indices)
    except IndexError as e:
        print(f"Index error during label lookup: {e}")
        return ['[UNK]']  # Return a placeholder for unexpected issues.

def predict_category(abstract, model, vectorizer, label_lookup):
    try:
        # Ensure the input is a list containing the abstract string
        preprocessed_abstract = [abstract]
        print("Step 1 - Shape of abstract:", np.shape(preprocessed_abstract))

        # Pass the input directly to the model without calling vectorizer outside of it
        preprocessed_abstract = tf.convert_to_tensor(preprocessed_abstract)
        print("Step 2 - Shape of preprocessed_abstract:", preprocessed_abstract.shape)

        # Make predictions using the loaded model (model should handle vectorization)
        predictions = loaded_model.predict(preprocessed_abstract)
        print("Shape of predictions:", predictions.shape)

        # Convert predictions to human-readable labels
        predicted_labels = label_lookup(np.round(predictions).astype(int)[0])

        return predicted_labels
    except Exception as e:
        print(f"An error occurred: {e}")
        return []

# Example usage
new_abstract = "Graph neural networks (GNNs) have been widely used to learn vector representation of graph-structured data and achieved better task performance than conventional methods..."
predicted_categories = predict_category(new_abstract, model_1, text_vectorizer, invert_multi_hot)
print("Predicted Categories:", predicted_categories)


Step 1 - Shape of abstract: (1,)
Step 2 - Shape of preprocessed_abstract: (1,)
Shape of predictions: (1, 159)
Predicted Categories: ['the' 'of']


# LLM Installation

In [7]:
pip install sentence-transformers

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [8]:
pip show sentence-transformers

Name: sentence-transformers
Version: 3.2.1
Summary: State-of-the-Art Text Embeddings
Home-page: 
Author: 
Author-email: Nils Reimers <info@nils-reimers.de>, Tom Aarsen <tom.aarsen@huggingface.co>
License: Apache 2.0
Location: C:\Users\VARSHITH\AppData\Roaming\Python\Python311\site-packages
Requires: huggingface-hub, Pillow, scikit-learn, scipy, torch, tqdm, transformers
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [9]:
pip install --upgrade jupyterlab ipywidgets


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [10]:
from sentence_transformers import SentenceTransformer , util
from tqdm.autonotebook import tqdm


  from tqdm.autonotebook import tqdm, trange


In [11]:
import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

In [12]:
model = SentenceTransformer('all-MiniLM-L6-v2')


In [13]:
sentences = arxiv_data['titles']

In [14]:
embeddings = model.encode(sentences)

In [15]:
embeddings.shape

(56181, 384)

In [16]:
c = 0

for sentence , embedding in zip(sentences , embeddings):
    print("Sentence : " , sentence)
    print("Embeddings : " , len(embedding))
    c = c + 1
    if c > 5:
        break;



Sentence :  Multi-Level Attention Pooling for Graph Neural Networks: Unifying Graph Representations with Multiple Localities
Embeddings :  384
Sentence :  Decision Forests vs. Deep Networks: Conceptual Similarities and Empirical Differences at Small Sample Sizes
Embeddings :  384
Sentence :  Power up! Robust Graph Convolutional Network via Graph Powering
Embeddings :  384
Sentence :  Releasing Graph Neural Networks with Differential Privacy Guarantees
Embeddings :  384
Sentence :  Recurrence-Aware Long-Term Cognitive Network for Explainable Pattern Classification
Embeddings :  384
Sentence :  Lifelong Graph Learning
Embeddings :  384


In [17]:
import pickle

with open("embeddings.pkl" , 'wb') as f:
    pickle.dump(embeddings , f)

with open("sentences.pkl" , 'wb') as f:
    pickle.dump(sentences , f)

with open("Trained_model.pkl" , 'wb') as f:
    pickle.dump(model , f)

# Reccomendation System

In [18]:
import pickle

embeddings = pickle.load(open("embeddings.pkl" , 'rb'))
tences = pickle.load(open("sentences.pkl" , 'rb'))
rec_model = pickle.load(open("Trained_model.pkl" , 'rb'))



In [19]:
import torch

def recommendation(input_paper):
    input_embedding = rec_model.encode(input_paper)
    cosine_scores = util.cos_sim(embeddings , input_embedding)
    #k = min(5, cosine_scores.size(0))
    top_similar_papers = torch.topk(cosine_scores , dim = 0 , k = 8 , sorted = True)

    paper_list = set()
    for i in top_similar_papers.indices:
        paper_list.add(tences[i.item()])

    return paper_list

tences

0        Multi-Level Attention Pooling for Graph Neural...
1        Decision Forests vs. Deep Networks: Conceptual...
2        Power up! Robust Graph Convolutional Network v...
3        Releasing Graph Neural Networks with Different...
4        Recurrence-Aware Long-Term Cognitive Network f...
                               ...                        
56176    Mining Spatio-temporal Data on Industrializati...
56177    Wav2Letter: an End-to-End ConvNet-based Speech...
56178    Deep Reinforcement Learning with Double Q-lear...
56179                          Generalized Low Rank Models
56180    Chi-square Tests Driven Method for Learning th...
Name: titles, Length: 56181, dtype: object

In [21]:
input_paper = input("Enter the title of the Research paper")

Enter the title of the Research paper Lifelong Graph Learning


In [22]:
recommend_paper = recommendation(input_paper)
print("We recommend to read this paper ................ \n")
for paper in sorted(recommend_paper):
    print(paper)

We recommend to read this paper ................ 

An Uncoupled Training Architecture for Large Graph Learning
Graph Learning with Loss-Guided Training
Graph-Based Continual Learning
Lifelong Graph Learning
Lifelong Learning of Graph Neural Networks for Open-World Node Classification
