## Loading Tools and Dataset

# Part 1

In [1]:
from tensorflow.keras import layers
from tensorflow import keras 
import tensorflow as tf
from sklearn.model_selection import train_test_split
from ast import literal_eval
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("../artifacts/raw/arxiv_data.csv")

In [3]:
new_df = pd.read_csv("../artifacts/raw/arxiv_data.csv")

In [4]:
df.head()

Unnamed: 0,titles,summaries,terms
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']"
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']"
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']"
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV']
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","['cs.CV', 'cs.LG']"


# Part 2 (Recommendation System)

In [5]:
df.drop(columns = ['terms', 'summaries'], inplace = True)

In [6]:
df

Unnamed: 0,titles
0,Survey on Semantic Stereo Matching / Semantic ...
1,FUTURE-AI: Guiding Principles and Consensus Re...
2,Enforcing Mutual Consistency of Hard Regions f...
3,Parameter Decoupling Strategy for Semi-supervi...
4,Background-Foreground Segmentation for Interio...
...,...
51769,Hierarchically-coupled hidden Markov models fo...
51770,Blinking Molecule Tracking
51771,Towards a Mathematical Foundation of Immunolog...
51772,A Semi-Automatic Graph-Based Approach for Dete...


### Provides pre trained models

In [7]:
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm





In [8]:
model =  SentenceTransformer('all-MiniLM-L6-v2')

In [9]:
sentences = df['titles']

In [10]:
# embeddings = model.encode(sentences)

In [11]:
# embeddings

In [12]:
# c = 0
# for sentence, embedding in zip(sentences, embeddings):
#     print(f"Sentence : {sentence}\n")
#     print(f"Embedding : {len(embedding)}")
#     if c > 5:
#         break
#     c += 1

In [13]:
# import pickle

# with open("../artifacts/models/embeddings.pkl", 'wb') as f:
#     pickle.dump(embeddings,f)

# with open("../artifacts/models/sentences.pkl", 'wb') as f:
#     pickle.dump(sentences,f)

# with open("../artifacts/models/rec_model.pkl", 'wb') as f:
#     pickle.dump(model,f)

In [14]:
# embeddings = pickle.load(open("../artifacts/models/embeddings.pkl", 'rb'))
# sentences = pickle.load(open("../artifacts/models/sentences.pkl", 'rb'))
# rec_model = pickle.load(open("../artifacts/models/rec_model.pkl", 'rb'))

# Top 5 recommendations

In [15]:
# import torch

# def recommendation(input_paper):
#     # Encode input and compute cosine similarity
#     input_embedding = rec_model.encode(input_paper)
#     cosine_scores = util.cos_sim(embeddings, input_embedding).squeeze(1)
    
#     # Sort all papers by similarity (descending)
#     sorted_indices = torch.argsort(cosine_scores, descending=True)
    
#     # Collect up to 5 unique papers
#     papers_list = []
#     seen = set()
    
#     for idx in sorted_indices:
#         title = sentences[idx.item()]
#         if title not in seen:
#             papers_list.append(title)
#             seen.add(title)
#         if len(papers_list) == 5:
#             break
    
#     return papers_list


In [16]:
# # exampel usage 2: (use this paper as input (BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding))
# input_paper = input("Enter the title of any paper you like")
# recommend_papers = recommendation(input_paper)


# print("We recommend to read this paper............")
# print("=============================================")
# for paper in recommend_papers:
#     print(paper)

In [17]:
new_df.shape

(51774, 3)

In [18]:
new_df.isna().sum()

titles       0
summaries    0
terms        0
dtype: int64

In [19]:
new_df.duplicated().sum()

np.int64(12783)

In [20]:
labels_columns = new_df['terms'].apply(literal_eval)
labels = labels_columns.explode().unique()

In [21]:
labels

array(['cs.CV', 'cs.LG', 'cs.AI', ..., 'I.2.6; I.5.1; G.3',
       '92E10, 46M20, 94A08, 68U10, 44A12, 55R35', '92E10'],
      shape=(1099,), dtype=object)

In [22]:
new_df = new_df[~new_df['titles'].duplicated()]

In [23]:
new_df.shape

(38972, 3)

In [24]:
print(sum(new_df['terms'].value_counts() == 1))
print(new_df['terms'].nunique())

2321
3157


In [25]:
arxive_data_filtered = new_df.groupby ('terms').filter(lambda x : len(x) > 1)

In [26]:
arxive_data_filtered.shape

(36651, 3)

In [27]:
arxive_data = new_df

In [28]:
new_arxive_data_filtered = arxive_data_filtered 

In [29]:
arxive_data['terms'] = arxive_data_filtered['terms'].apply(lambda x : literal_eval(x))

In [30]:
arxive_data_filtered['terms'].values[:3]

array(["['cs.CV', 'cs.LG']", "['cs.CV', 'cs.AI', 'cs.LG']",
       "['cs.CV', 'cs.AI']"], dtype=object)

# Train Test Split

In [31]:
train_df, test_df = train_test_split(arxive_data_filtered, test_size = 0.1, stratify = arxive_data_filtered['terms'].values, )

In [32]:
train_df.shape

(32985, 3)

In [33]:
test_df.shape

(3666, 3)

In [38]:
val_df = test_df.sample(frac = 0.5)
test_df.drop(val_df.index, inplace = True)

In [39]:
val_df.shape

(1833, 3)

In [40]:
test_df.shape

(1833, 3)

In [41]:
train_df.shape

(32985, 3)

In [42]:
terms = tf.ragged.constant(train_df['terms'])

In [44]:
lookup = tf.keras.layers.StringLookup(output_mode = 'multi_hot')

In [46]:
lookup.adapt(terms)
vocab = lookup.get_vocabulary()

In [43]:
terms

<tf.Tensor: shape=(32985,), dtype=string, numpy=
array([b"['stat.ML', 'cs.CV', 'cs.LG']", b"['cs.CV']",
       b"['cs.LG', 'stat.ML']", ..., b"['cs.LG', 'stat.ML']",
       b"['cs.CV']", b"['cs.CV']"], shape=(32985,), dtype=object)>

In [48]:
sample_label = train_df['terms'].iloc[5]
print(sample_label)
labels_binarized = lookup([sample_label])
labels_binarized

['cs.CV']


<tf.Tensor: shape=(837,), dtype=int64, numpy=
array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     

In [49]:
max_seqlen = 150
batch_size = 128
padding_token = "<pad>"
auto = tf.data.AUTOTUNE

In [None]:
def make_dataset(dataframe : pd.DataFrame, is_Train : bool = True):
    labels = tf.ragged.constant(dataframe['terms'].values)