## 1. import and Load dataset

In [1]:
import json
import re
import string
import itertools
import numpy as np

In [2]:
vocab = [n for n in string.printable]
vocab_size = len(vocab)
char2idx = {v:idx for idx,v in enumerate(vocab)}
idx2char= np.array(vocab)

In [3]:
gen_json = (json.loads(line) for line in itertools.islice(open('/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json','r'),10))
temp = []
for line in gen_json:
  temp.append(line['abstract'])

In [4]:
temp[0]

'  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with data from the Fermilab Tevatron, and predictions are made for\nmore detailed tests with CDF and DO data. Predictions are shown for\ndistributions of diphoton pairs produced at the energy of the Large Hadron\nCollider (LHC). Distributions of the diphoton pairs from the decay of a Higgs\nboson are contrasted with those produced from QCD processes at the LHC, showing\nthat enhanced sensitivity to the signal can be obtained with judicious\nselection of events.\n'

## 2. Pre-process the data

In [5]:
num_examples = 100000
gen_json = (json.loads(line) for line in itertools.islice(open('/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json','r'),num_examples))
abs_list = []
for line in gen_json:
  abs = line['abstract']
  abs = re.sub(r'(\S)\s+(\S)',r'\1 \2',abs).replace('\n','\n\n')
  abs = abs.replace('â\x80\x99',"'")
  abs = abs.replace('\x7f',"")
  abs = abs.replace('â\x88\x9e',"'")
  abs = abs.replace('â\x89¤',"'")
  abs = abs.replace('â\x80\x94',"'")
  abs = abs.replace('â\x80\x93',"-")
  for k in abs:
    abs_list.append(char2idx[k])

abs_list = np.array(abs_list)

In [6]:
abs_list

array([94, 94, 36, ..., 75, 96, 96])

In [7]:
abs_list.shape

(80067818,)

## 3. Define the model

In [8]:
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense

In [9]:
seq_len = 150
batch_size = 256
dataset = tf.data.Dataset.from_tensor_slices(abs_list)
dataset = dataset.batch(seq_len+1,drop_remainder=True)
dataset = dataset.map(lambda x: (x[:-1],x[1:]))
dataset = dataset.shuffle(1000).batch(batch_size,drop_remainder=True)

In [10]:
dataset

<BatchDataset shapes: ((256, 150), (256, 150)), types: (tf.int64, tf.int64)>

In [11]:
def make_model(vocabulary_size,embedding_dimension,rnn_units,batch_size,stateful):
  model = Sequential()
  model.add(Embedding(vocabulary_size,embedding_dimension,
                      batch_input_shape=[batch_size,None]))
  model.add(LSTM(rnn_units,return_sequences=True,stateful=stateful))
  model.add(LSTM(rnn_units,return_sequences=True,stateful=stateful))
  model.add(Dense(vocabulary_size))
  model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                optimizer='adam',metrics=['accuracy'])
  model.summary()
  return model

In [12]:
emb_dim = 256
rnn_units = 1024
model = make_model(vocab_size,emb_dim,rnn_units,batch_size,False)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (256, None, 256)          25600     
_________________________________________________________________
lstm (LSTM)                  (256, None, 1024)         5246976   
_________________________________________________________________
lstm_1 (LSTM)                (256, None, 1024)         8392704   
_________________________________________________________________
dense (Dense)                (256, None, 100)          102500    
Total params: 13,767,780
Trainable params: 13,767,780
Non-trainable params: 0
_________________________________________________________________


## 4. Train the model

In [13]:
checkpoint_dir = '/kaggle/temp/'
checkpoint_prefix = os.path.join(checkpoint_dir,'chkpt_{epoch}')
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_prefix,
                                                         save_weights_only=True)

In [14]:
model.fit(dataset,epochs=10,callbacks=[checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x771aeb242990>

In [15]:
tf.train.latest_checkpoint(checkpoint_dir)

'/kaggle/temp/chkpt_10'

In [16]:
model = make_model(vocab_size,emb_dim,rnn_units,1,True)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1,None]))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            25600     
_________________________________________________________________
lstm_2 (LSTM)                (1, None, 1024)           5246976   
_________________________________________________________________
lstm_3 (LSTM)                (1, None, 1024)           8392704   
_________________________________________________________________
dense_1 (Dense)              (1, None, 100)            102500    
Total params: 13,767,780
Trainable params: 13,767,780
Non-trainable params: 0
_________________________________________________________________


## 5. Inference

## 5.1. Generating text

In [17]:
def generate_text(model,seed,num_characters):
  seed_text = tf.expand_dims([char2idx[k] for k in seed],0)
  generated_text = []
  model.reset_states()
  for n in range(num_characters+1):
    result = tf.random.categorical(model(seed_text)[0,-1:],num_samples=1)
    result = result[0,0].numpy()
    generated_text.append(result)
    seed_text = tf.expand_dims([result],0)
  return ''.join(idx2char[generated_text])

In [18]:
# Can the model guess obvious characters/words?
seed = ("This is a short string to test if the model knows what the next character sh")
for k in range(5):
  temp = generate_text(model,seed,100)
  print(temp)

ould have a non-trivial feature in 4He and M7. Coordinates on high, very large (string tensions) can 
ould be such as s-wave components isolation coefficients. It is shown that the renoidal line flux dis
ould be violated. It is singlet on the cohomological Bianchi identity suitable for appropriate entang
ould be related against the Nu 327 scattered phase. In comparison to smaller values of the inhomogene
owers on the evolution of one and the model, institutionalisticity of the phase transition and result


In [19]:
# Can it guess obvious characters/words?
seed = ("This is a text about Quantum Field ")
for k in range(5):
  temp = generate_text(model,seed,100)
  print(temp)

Theory (TQQ), which is very closed as 1-body approach in the precise approximation. The second one ha
Isothermal Scalar Theories. We analyze the effect of the mass give rise to a

quark interaction. The 
Theories based upon the inflationary dynamics derived from both complex and action-based algorithms. 
Theory within mass shifts therein. The generalized SU(3) approach uses the size of the related BCS th
Theory (QC) when the overlap fermion bias for the corresponding H-hologrability lambda is given. Mode


In [20]:
# Can it guess obvious characters/words?
seed = ("Einstein introduced the theory of General ")
for k in range(5):
  temp = generate_text(model,seed,100)
  print(temp)

Relativity (GR) by comparing the solutions of the Einstein-Maxwell equations Coulomb gas. The reactio
Relativity as well as the arising specifications. Moreover, we used a new numerical method of finding
Relativity (GR), a mystery model for galaxy brow holes can be identified to be catalogoed by distribu
Relativity. Recently a loop correlation between the effective gravitino parameters for the Wilkinson 
Relativity (GR) in the redshift open bofts and show that a considerable interest is the mechanism in 


In [21]:
# Can it guess obvious characters/words?
seed = ("The results of the experiment indicate that the null hypothesis can be ")
for k in range(5):
  temp = generate_text(model,seed,100)
  print(temp)

imposed to the one-dimensional optical parameter for cosmic structure formation by cooling substrates
pattern in the development of mechanisms that undergo thermal compact super learning by Clay can be m
made just mentioned above. As a consequence, the general distribution of the circuit collaboration an
extended to negative fields, while the continuum dynamics provides a suitable react for $\mathcal A_g
found in a more general quantum critical point is impossible in the quasi-(multiplet) direction. Even


In [22]:
# Can it guess obvious characters/words?
seed = ("The gradient of the potential is computed by taking the derivative of the field with ")
for k in range(5):
  temp = generate_text(model,seed,100)
  print(temp)

single temperature coupling to the non-equilibrium fluid and in polarons. The approach has been used 
specific inflation energy, in attempt to find the strength of the fluxes of power Ising vectors on th
an Efimov equation. On the context of computation, the simple framework for the Bell inequalities wit
the height-dependent time. We use the linear operators to those for the ion description with the regr
positive pi-pulsation depth transition, and we observe that $\Delta_\phi/\mu$ vanishes (being the sol


In [23]:
# A short seed text, not much context given
seed = ("Quantum field theory is the framework that describes physics at the " 
        "miscroscopic scale. In this paper, we ")
for k in range(5):
  temp = generate_text(model,seed,100)
  print(temp)

explore precise maps in LQC and the effective higher-dimensional theories with gauge-single-layer gra
propose that the today in the primary mutant and on its cosmos? he found, and more and new research u
present the results of a recent theory for the single particle optics model and the consideration of 
investigate the properties of the theory in the axial vertex NLK Fermi liquid where the cross section
focus on the presence of quantum degenerate bosons with linear dilaton and to a non-homogeneous equiv


In [24]:
# Medium-sized seed text, some context given
seed = ("Quantum field theory is the framework that describes physics at the " 
        "miscroscopic scale. General relativity, on the other hand, describes "
        "dynamics of gravity across large distances. These two theories form "
        "the state-of-the-art of human understanding of the universe. However, "
        "the Standard Model and Einstein gravity are not compatible at the "
        "quantum level. In this paper, we ")
for k in range(5):
  temp = generate_text(model,seed,100)
  print(temp)

propose a very simple and accurate-analytic expression based on avail observables of the mass generat
show that the time-dependent Watson process measured in SU(2) lattice system reduces the equivalent-D
picture the physical parameters of these consecutive quasars continue to model the information criter
investigate estimates of the left handed numerical solutions of a fading asymmetry of the curvaton.


prove the Dzyaloshinsky Theorem to show that SUSY is always ferromagnetic (and "hail lattice) propert


In [25]:
# Actual abstract from 2011.02926 minus the last sentence.
seed = ("We consider two fundamental long-standing problems in quantum "
        "chromodynamics (QCD): the origin of color confinement and structure "
        "of a true vacuum and color singlet quantum states. There is a common "
        "belief that resolution to these problems needs a knowledge of a "
        "strict non-perturbative quantum Yang-Mills theory and new ideas. "
        "Our principal idea in resolving these problems is that structure of "
        "color confinement and color singlet quantum states must be determined "
        "by a Weyl symmetry which is an intrinsic symmetry of the Yang-Mills "
        "gauge theory, and by properties of a selected class of solutions "
        "satisfying special requirements. Following this idea we construct for "
        "the first time a space of color singlet one particle quantum states "
        "for primary colorless gluons and quarks and reveal the structure of "
        "color confinement in quantum Yang-Mills theory. "
        "As an application we demonstrate ")
        # "formation of physical observables in a pure QCD, pure glueballs.")
for k in range(5):
  temp = generate_text(model,seed,100)
  print(temp)

a recent work of Bonn . For worldvoll scalar masses and atomic disturbances, we define a toy model wh
the restriction that a bidirectional state has attracted long-standing information such as effective 
that possible mechanisms of the electromagnetic fields in these cores of chaos containing microscopic
that the effect of spinless leptophobic nucleon numbers for photons interacting into the quark-gluon 
how statistical features reduce to the Nambu-Goto approximation (TBE) as a function of the Hubble rel


In [26]:
# Unrelated string
seed = ("This string does not have to do with science at all, it's a text "
        "about baby shark. Did you know baby shark is the most watched video "
        "on Youtube as of November 2020? That is insane, this is because ")
for k in range(5):
  temp = generate_text(model,seed,100)
  print(temp)

the restrictions immediately follows stochastically modelighth, while for cardiac/nimpinite policy, t
of its neighbourhood or conditional simultaneous-diffusion network effects. In this work, we introduc
there are data priorities to develop an interaction strategy of 98.5% with the DPL-axis exactly. Base
it does in the field of measurements and other practical new practities of scientific tools, to obtai
the variance variables are optimal. Based on the cut-off limits, we introduced some important fightin


In [27]:
# Text in another language. The characters have structure (word and sentences) 
# but the model has never seen these combinations before
seed = ("Este texto esta escrito en otro idioma que el modelo no ha aprendido, "
        "veamos que pasa si recibe estas palabras raras que no existen en "
        "ingles. Lo que el modelo predice es que ")
for k in range(5):
  temp = generate_text(model,seed,100)
  print(temp)

lies Co^-1 (\omega(c)--1\ell+n - pl+1/2\cho p|(|B^{-1\leq k})^{-1,1} then u(x-1)=b \lambda(K_{a+1)} x
podes a proof la covers'' of then earlier software uses more recently analysis of the tempore extract
tight for \beta(s) ~ S_z\to\intrux(x,y).+\infty < o(n-t))>0}, (2\times \pi(0,\pi0)E\} where $\phi(x)$
in considered a Rydberg or what it was the construction of quantum structures too remain directly fro
certaines analytiquess undesiate posture and sums: one of Swissiness and Christ propositilils" by R.\


In [28]:
# Gibberish
seed = ("m0n923h lxnaefpw;'[kdawpe_dlen;a[ak[k] [';jd0389hufw")
for k in range(5):
  temp = generate_text(model,seed,100)
  print(temp)

ov] +0865876 (2008), xi3736 sd and P5-84. We identify 50 galaxies at z=0.435 and z=2.229, mizicirium 
0]

  Pico-type structure using a long-hoc tight current in the vibrational moduli of the underlying 
mer.org/oSpectical/IRAC-43], using identically defined [R. Phys. Phys. B 66, 166105 (2006)], whose ro
ilobent consistent with Kolmogorov coincide with the original results of Moree.

  In the work we tak
ek].5716ux.1756. BnK-BOAM/CRD images and simulated spectra as a function of environment; constant bra


In [29]:
# Does it know how to use question marks?
seed = ("Can naked singularities exist")
for k in range(5):
  temp = generate_text(model,seed,100)
  print(temp)

. Agreement is found for light absorbers of HCN, simy-line, expansions, and linewied CMB modulation s
 with a quasi-local correlation between the mean and the nonequilibrium classes of full gapped select
ing in any medium in paramagnetic axis. A simple and unified method with general mean-field treatment
. This protocol appears to be experiencing a new solution. Generally, we examine efficient two-parame
 for the Burgers equation and bounded symmetries for the iterated problem.

  In reverberation we obt


In [30]:
# Actual abstract from 2011.02926 minus the last sentence.
seed = ("We consider two fundamental long-standing problems in quantum "
        "chromodynamics (QCD): the origin of color confinement and structure "
        "of a true vacuum and color singlet quantum states. There is a common "
        "belief that resolution to these problems needs a knowledge of a "
        "strict non-perturbative quantum Yang-Mills theory and new ideas. "
        "Our principal idea in resolving these problems is that structure of "
        "color confinement and color singlet quantum states must be determined "
        "by a Weyl symmetry which is an intrinsic symmetry of the Yang-Mills "
        "gauge theory, and by properties of a selected class of solutions "
        "satisfying special requirements. Following this idea we construct for "
        "the first time a space of color singlet one particle quantum states "
        "for primary colorless gluons and quarks and reveal the structure of "
        "color confinement in quantum Yang-Mills theory. "
        "As an application we demonstrate ")
        # "formation of physical observables in a pure QCD, pure glueballs.")
temp = generate_text(model,seed,1000)

In [31]:
import textwrap
print(textwrap.fill(temp,80))

that, although the excess phase interacts rapidly that a long distance bipolar
outflow is simplified, the index of all the symmetry associated to the
collisional boson Hushninskys operator of any photon wavelength appears in the
avalanche properties of a frequency-independent scrature.    We study the
spinons and chemical potentials of molecules maps of a 'FeII envelope maskless
ring. These solitons together with temperature between them are determined by
the temperature interaction. When there are much too weak reshities. This is the
first case in oscillators in which they can be antiverlear.    The measurement
of light and neutral partons status for the growth of neutrino effects is
explored in this work. We expect the exotic-0 asymmetry of the uncoupled
resonatorrass in BABs containing two of the substantial nuclear-mass generator
and the 3sigma plates in Deuterio tiltes and derive the rates of the Two Theorem
of ICTs. Also a novel nonlinear dynamical scaling architecture allowing t

In [32]:
# OOV test
seed = ("This string contains a character out of vocabulary, the character "
        "ñ, what will the model do? It predicts the following, ")
for k in range(5):
  temp = generate_text(model,seed,100)
  print(temp)

KeyError: 'ñ'