<a href="https://colab.research.google.com/github/Srijith216/Deep_Vision/blob/main/DeepVision_task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab-only: mount Google Drive so files under /content/drive/MyDrive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the arXiv dataset CSV from the mounted Drive folder into a DataFrame.
data = pd.read_csv('/content/drive/MyDrive/DeepVision/arxiv_data.csv')

In [4]:
# (rows, columns) of the loaded frame.
data.shape

(51774, 3)

In [5]:
# Show five random rows; the fixed random_state keeps the preview identical
# across "Restart & Run All" instead of changing on every execution.
data.sample(5, random_state=42)

Unnamed: 0,titles,summaries,terms
943,Camera-trap images segmentation using multi-la...,The segmentation of animals from camera-trap i...,['cs.CV']
44626,Mining Spatio-temporal Data on Industrializati...,Despite the growing availability of big data i...,"['cs.CV', 'cs.IR']"
37346,From Known to Unknown: Knowledge-guided Transf...,Time series forecasting (TSF) is fundamentally...,['cs.LG']
51723,Molecular Structure Extraction From Documents ...,Chemical structure extraction from documents r...,"['cs.LG', 'physics.chem-ph']"
28940,PointHop: An Explainable Machine Learning Meth...,An explainable machine learning method for poi...,"['cs.CV', 'cs.LG']"


In [6]:
import nltk
import re
# Download NLTK's 'punkt' tokenizer data.
# NOTE(review): no other cell visible in this notebook uses nltk — confirm
# whether this download is still needed.
nltk.download(['punkt'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
# Training subset: a standalone single-column frame holding the first
# 10,000 titles (positional slice) of the loaded dataset.
train_set = data[['titles']].iloc[:10000].copy()

In [8]:
# Preview the first rows of the training subset.
train_set.head()

Unnamed: 0,titles
0,Survey on Semantic Stereo Matching / Semantic ...
1,FUTURE-AI: Guiding Principles and Consensus Re...
2,Enforcing Mutual Consistency of Hard Regions f...
3,Parameter Decoupling Strategy for Semi-supervi...
4,Background-Foreground Segmentation for Interio...


In [9]:
# Normalise each title: every non-letter becomes a space, runs of spaces are
# collapsed to one, and the result is trimmed and lower-cased.
clean_text = [
  re.sub(r' +', r' ', re.sub(r'[^a-zA-Z]', r' ', title)).strip().lower()
  for title in train_set['titles']
]

In [10]:
# Store the cleaned titles as a new column next to the raw ones, then preview.
train_set['clean_text'] = clean_text
train_set.head()

Unnamed: 0,titles,clean_text
0,Survey on Semantic Stereo Matching / Semantic ...,survey on semantic stereo matching semantic de...
1,FUTURE-AI: Guiding Principles and Consensus Re...,future ai guiding principles and consensus rec...
2,Enforcing Mutual Consistency of Hard Regions f...,enforcing mutual consistency of hard regions f...
3,Parameter Decoupling Strategy for Semi-supervi...,parameter decoupling strategy for semi supervi...
4,Background-Foreground Segmentation for Interio...,background foreground segmentation for interio...


In [11]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential

In [12]:
# Build the word -> integer-id vocabulary from the cleaned titles.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_set['clean_text'])
# +1 because, per the Keras Tokenizer docs, word ids start at 1 and id 0 is
# reserved (it is never assigned to a word), so ids span 0..len(word_index).
total_words = len(tokenizer.word_index) + 1

In [13]:
# Report the vocabulary size (number of distinct words plus one).
print(f"Total words: {total_words}")

Total words: 7868


In [14]:
# Expand every encoded title into all of its prefixes of length >= 2, so that
# each prefix can later be split into (context words, next word).
input_sequences = []
for title in train_set['clean_text']:
    encoded = tokenizer.texts_to_sequences([title])[0]
    input_sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))

print("Total input sequences: ", len(input_sequences))

Total input sequences:  86900


In [15]:
# Left-pad every prefix to the length of the longest one, so all rows share
# one width and the word to predict is always in the last column.
max_len = max(map(len, input_sequences))
input_sequences = np.array(
    pad_sequences(input_sequences, padding='pre', maxlen=max_len)
)

In [16]:
# Features are all columns but the last; the last column is the id of the
# word to predict, one-hot encoded over the whole vocabulary.
X = input_sequences[:, :-1]
label = input_sequences[:, -1]
y = tf.keras.utils.to_categorical(label, num_classes=total_words)

In [17]:
# Next-word classifier: 100-d embeddings -> bidirectional LSTM (150 units per
# direction) -> softmax over the whole vocabulary.
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_len-1))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 31, 100)           786800    
                                                                 
 bidirectional (Bidirectiona  (None, 300)              301200    
 l)                                                              
                                                                 
 dense (Dense)               (None, 7868)              2368268   
                                                                 
Total params: 3,456,268
Trainable params: 3,456,268
Non-trainable params: 0
_________________________________________________________________
None


In [18]:
# Train for 50 epochs on every (context, next word) pair.
# NOTE(review): no validation data is passed to fit(), so the accuracy it
# reports is training accuracy only.
history = model.fit(X, y, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [19]:
def predictNextN(word,n):
  """Greedily extend ``word`` with ``n`` predicted next words.

  Each step encodes the text generated so far with the notebook-level
  ``tokenizer``, left-pads it to the model input width (``max_len - 1``) and
  appends the word the model scores highest.

  Args:
    word: Seed text (one or more space-separated words).
    n: Number of words to generate.

  Returns:
    The seed text followed by the generated words, separated by spaces.
  """
  seed_text = word
  for _ in range(n):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_len-1, padding='pre')
    pred_x = model.predict(token_list)
    # The batch has a single row, so take the scalar class id of row 0.
    predicted = int(np.argmax(pred_x, axis=1)[0])
    # index_word is the tokenizer's own id -> word map, replacing a linear
    # scan over word_index on every step. Ids with no word (such as 0) map
    # to "", exactly as the scan did when it found no match.
    output_word = tokenizer.index_word.get(predicted, "")
    seed_text += " " + output_word
  return seed_text

In [20]:
def predictWithMiddle(word,n):
  """Grow text around ``word`` so the seed ends up near the middle.

  Each step predicts the next word for the current text, appends it, and then
  reverses the word order of the whole text. Because of that reversal,
  successive predictions land alternately on either side of the seed.

  Args:
    word: Seed text (one or more space-separated words).
    n: Number of words to generate.

  Returns:
    The generated text as a single space-separated string. With ``n == 0``
    the seed is returned unchanged.
  """
  seed_text = word
  for _ in range(n):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_len-1, padding='pre')
    pred_x = model.predict(token_list)
    # The batch has a single row, so take the scalar class id of row 0.
    predicted = int(np.argmax(pred_x, axis=1)[0])
    # Direct id -> word lookup instead of scanning word_index on every step;
    # ids with no word (such as 0) map to "".
    output_word = tokenizer.index_word.get(predicted, "")
    # Append the new word, then flip the word order for the next step.
    seed_list = (seed_text + " " + output_word).split()
    seed_text = ' '.join(seed_list[::-1])
  return seed_text

In [21]:
def predictPrevN(word,n):
  """Generate ``n`` words that are placed in front of ``word``.

  The seed always stays at the end of the text. Each step feeds
  "<generated words> <seed>" to the model, takes the top-scoring word, and
  rebuilds the generated part as: new word first, followed by the earlier
  generated words in reversed order (the ordering the original
  reverse-and-remove logic produced).

  The generated words are now tracked separately from the seed. The previous
  version called ``list.remove(seed)`` on the split text, which raised
  ValueError whenever the seed was not exactly one whitespace-free token
  (multi-word, padded or empty seeds). When no word could be decoded it also
  left a stray leading space in the result; that no longer happens.

  NOTE: the model is trained to predict the *next* word, so the words placed
  before the seed are next-word predictions, not true preceding-word
  predictions.

  Args:
    word: Seed text; kept verbatim as the final part of the result.
    n: Number of words to generate.

  Returns:
    The generated words followed by the seed, separated by spaces.
  """
  og_word = word
  prefix_words = []
  for _ in range(n):
    seed_text = ' '.join(prefix_words + [og_word])
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_len-1, padding='pre')
    pred_x = model.predict(token_list)
    # The batch has a single row, so take the scalar class id of row 0.
    predicted = int(np.argmax(pred_x, axis=1)[0])
    # Direct id -> word lookup instead of scanning word_index on every step;
    # ids with no word (such as 0) map to "".
    output_word = tokenizer.index_word.get(predicted, "")
    # Earlier words are reversed and the new word goes to the front.
    prefix_words.reverse()
    if output_word:
      prefix_words.insert(0, output_word)
  return ' '.join(prefix_words + [og_word])

In [22]:
# Demo: generate 10 words after the seed "attention".
predictNextN("attention",10)

'attention aware generative adversarial networks ata gans gans for self supervised'

In [23]:
# Demo: generate 10 words around the seed "attention".
predictWithMiddle("attention",10)

'manifolds on learning representation based attention aware domain adaptation for bayesian'

In [24]:
# Demo: generate 10 words in front of the seed "attention".
predictPrevN("attention",10)

'based based design efficient based aware for efficient neural model attention'

In [25]:
from google.colab import files

# Colab-only: open the browser upload widget. The uploaded file is later read
# from disk by name; `uploaded` itself is not used elsewhere in the visible
# notebook.
uploaded = files.upload()

Saving inputs.txt to inputs.txt


In [26]:
# Read the whitespace-separated seed words. The context manager closes the
# file handle, which the bare open(...).read() never closed explicitly.
with open("inputs.txt") as inputs_file:
  inputs = inputs_file.read().split()
inputs

['space',
 'medical',
 'network',
 'segmentation',
 'mri',
 'neural',
 'survey',
 'speech',
 'mutual',
 'segmentation']

In [27]:
# For every seed word, print the output of each of the three generators
# (10 generated words each), followed by a blank separator line.
for seed in inputs:
  print(f"Word:{seed}")
  for generate in (predictNextN, predictWithMiddle, predictPrevN):
    print(generate(seed, 10))
  print()

Word:space
space time neural irradiance fields for free viewpoint video object detection
visual for representations modal multi space time series with geometry embedding
on evaluation estimation object multi time fusion perception for estimation space

Word:medical
medical image segmentation based on multi modal convolutional neural network study
depth improve to classification image medical image synthesis using active learning
synthesis synthesis translation data image image image image representation against medical

Word:network
network agnostic knowledge transfer for medical image segmentation a survey and
training contrastive via learning representation network agnostic perceptual similarity for brain
in design selection for representation agnostic representation pruning via augmentation network

Word:segmentation
segmentation of microscopy data for finding nuclei in divergent images using
and images microscopy improve to segmentation of medical images using plant
for via and the

In [28]:
#tf.keras.backend.clear_session()