## 1. import and Load dataset

In [1]:
import json
import re
import string
import itertools
import numpy as np

In [2]:
vocab = [n for n in string.printable]
vocab_size = len(vocab)
char2idx = {v:idx for idx,v in enumerate(vocab)}
idx2char= np.array(vocab)

In [3]:
gen_json = (json.loads(line) for line in itertools.islice(open('/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json','r'),10))
temp = []
for line in gen_json:
  temp.append(line['abstract'])

In [4]:
temp[0]

'  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with data from the Fermilab Tevatron, and predictions are made for\nmore detailed tests with CDF and DO data. Predictions are shown for\ndistributions of diphoton pairs produced at the energy of the Large Hadron\nCollider (LHC). Distributions of the diphoton pairs from the decay of a Higgs\nboson are contrasted with those produced from QCD processes at the LHC, showing\nthat enhanced sensitivity to the signal can be obtained with judicious\nselection of events.\n'

In [5]:
num_examples = 100000
gen_json = (json.loads(line) for line in itertools.islice(open('/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json','r'),num_examples))
abs_list = []
for line in gen_json:
  abs = line['abstract']
  abs = re.sub(r'(\S)\s+(\S)',r'\1 \2',abs).replace('\n','\n\n')
  abs = abs.replace('â\x80\x99',"'")
  abs = abs.replace('\x7f',"")
  abs = abs.replace('â\x88\x9e',"'")
  abs = abs.replace('â\x89¤',"'")
  abs = abs.replace('â\x80\x94',"'")
  abs = abs.replace('â\x80\x93',"-")
  for k in abs:
    abs_list.append(char2idx[k])

abs_list = np.array(abs_list)

In [6]:
abs_list.shape

(80067818,)

## 3. Define the model

In [7]:
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense

In [8]:
seq_len = 150
batch_size = 256
dataset = tf.data.Dataset.from_tensor_slices(abs_list)
dataset = dataset.batch(seq_len+1,drop_remainder=True)
dataset = dataset.map(lambda x: (x[:-1],x[1:]))
dataset = dataset.shuffle(1000).batch(batch_size,drop_remainder=True)

In [9]:
dataset

<BatchDataset shapes: ((256, 150), (256, 150)), types: (tf.int64, tf.int64)>

In [10]:
def make_model(vocabulary_size,embedding_dimension,rnn_units,batch_size,stateful):
  model = Sequential()
  model.add(Embedding(vocabulary_size,embedding_dimension,
                      batch_input_shape=[batch_size,None]))
  model.add(GRU(rnn_units,return_sequences=True,stateful=stateful))
  model.add(GRU(rnn_units,return_sequences=True,stateful=stateful))
  model.add(Dense(vocabulary_size))
  model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                optimizer='adam',metrics=['accuracy'])
  model.summary()
  return model

In [11]:
emb_dim = 256
rnn_units = 1024
model = make_model(vocab_size,emb_dim,rnn_units,batch_size,False)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (256, None, 256)          25600     
_________________________________________________________________
gru (GRU)                    (256, None, 1024)         3938304   
_________________________________________________________________
gru_1 (GRU)                  (256, None, 1024)         6297600   
_________________________________________________________________
dense (Dense)                (256, None, 100)          102500    
Total params: 10,364,004
Trainable params: 10,364,004
Non-trainable params: 0
_________________________________________________________________


## 4. Train the model

In [12]:
checkpoint_dir = '/kaggle/temp/'
checkpoint_prefix = os.path.join(checkpoint_dir,'chkpt_{epoch}')
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_prefix,
                                                         save_weights_only=True)

In [13]:
model.fit(dataset,epochs=10,callbacks=[checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7c58a6d49ed0>

In [14]:
tf.train.latest_checkpoint(checkpoint_dir)

'/kaggle/temp/chkpt_10'

In [15]:
model = make_model(vocab_size,emb_dim,rnn_units,1,True)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1,None]))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            25600     
_________________________________________________________________
gru_2 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
gru_3 (GRU)                  (1, None, 1024)           6297600   
_________________________________________________________________
dense_1 (Dense)              (1, None, 100)            102500    
Total params: 10,364,004
Trainable params: 10,364,004
Non-trainable params: 0
_________________________________________________________________


## 5. Inference

## 5.1 Generate Text and Analysis

In [16]:
def generate_text(model,seed,num_characters):
  seed_text = tf.expand_dims([char2idx[k] for k in seed],0)
  generated_text = []
  model.reset_states()
  for n in range(num_characters+1):
    result = tf.random.categorical(model(seed_text)[0,-1:],num_samples=1)
    result = result[0,0].numpy()
    generated_text.append(result)
    seed_text = tf.expand_dims([result],0)
  return ''.join(idx2char[generated_text])

In [17]:
# Can the model guess obvious characters/words?
seed = ("This is a short string to test if the model knows what the next character sh")
for k in range(5):
  temp = generate_text(model,seed,100)
  print(temp)

ould appear in the mock sample observations or discs. We derive the rotational rate and 2

sigma_v af
ould be incoherent and transverse forms. In particular we show that the mentioned $\kappa$-conceustio
ould be discussed, which includes SN Ihagims, 14 counter's 'physical' understanding of protein operat
ould improve from underdiption of the electronic structure and intermittency. In particular, we evalu
ares a before the full distribution of the information control. On the other hand, just an order of m


In [18]:
# Can it guess obvious characters/words?
seed = ("This is a text about Quantum Field ")
for k in range(5):
  temp = generate_text(model,seed,100)
  print(temp)

Overhauser fields, which are applicable to all their similarities with CdW with arbitrarily fixed tra
Wall states (DSWs) types, multiplied by one and two-loop type formalism to ball apply this test to a 
Theory (RCMT). In the second quasi twisted spin-I model, the mean-field range can be observed, as a f
Theories using the pseudoconstant-coefficient kernel under the Liouville equation contrarely covered 
strength rather than the imposed beam energy.

  It has not been under distance between the astroid s


In [19]:
# Can it guess obvious characters/words?
seed = ("Einstein introduced the theory of General ")
for k in range(5):
  temp = generate_text(model,seed,100)
  print(temp)

Relativity approach, weak to an entry pattern of microstructure cavity to be effective representation
Relativity VeckoVs (DGCs, quantum-Monte Carlo) calculations of the correlation band and at x-rappinic
Relative Simon quantum geometry $W_1$ and $B_F$ in the three-dimensional link Pax segment. Several cr
Relativity which almost Z = 2/8 extra dimensions and thus permits the first partitionetoist of better
Relativity and a term predicted in the canonical countermets. We show that this in both theories show


In [20]:
# Can it guess obvious characters/words?
seed = ("The results of the experiment indicate that the null hypothesis can be ")
for k in range(5):
  temp = generate_text(model,seed,100)
  print(temp)

understood by applying scattering mechanism in isolationar to Repcohtarine in the brain. When the con
very undising that the in-medised energy are calculated. At high temperatures, assuming this scatteri
satisfulsed as a sequence of spin Hall effect, in particular that the averaged flow of waves escaled 
executed by the first point-probe program.

  The chiral model leads to a phase transition parameter 
extended to various orders of magnitude in the interaction between them. Two new examples and mathema


In [21]:
# Can it guess obvious characters/words?
seed = ("The gradient of the potential is computed by taking the derivative of the field with ")
for k in range(5):
  temp = generate_text(model,seed,100)
  print(temp)

a vector meson-baryon density (where g=1) and its power \cite{BRK}1 is a fundamental stationary state
a growth of internal (>0.4 RM). The topology of the MDI in a general conventional analysis and applic
an invariant Killing tensor. We maintine how single-layer readout systems has less strengthening the 
the diameter, and which cancel becomes observed compared to events. We conclude that direct measureme
a discrete geometry of three or more sequences of p-percolations? As an additional scheme the develop


In [22]:
# A short seed text, not much context given
seed = ("Quantum field theory is the framework that describes physics at the " 
        "miscroscopic scale. In this paper, we ")
for k in range(5):
  temp = generate_text(model,seed,100)
  print(temp)

phenomenology of low energy effective cardical states with one supercharacter as strongly five challe
present a new procedure to change the probability of small values of the Higgs field changes as Kondo
focus on the metabolic model in which we can also be applied for quantum electrodynamics (GEMD), corr
extend these applications, two-state Monte Carlis methods based on the MLE likelihood) and (certain c
consider a two spinning plasma per quanta (2D) and compare it to the one without ROS-biology.

  We p


In [23]:
# Medium-sized seed text, some context given
seed = ("Quantum field theory is the framework that describes physics at the " 
        "miscroscopic scale. General relativity, on the other hand, describes "
        "dynamics of gravity across large distances. These two theories form "
        "the state-of-the-art of human understanding of the universe. However, "
        "the Standard Model and Einstein gravity are not compatible at the "
        "quantum level. In this paper, we ")
for k in range(5):
  temp = generate_text(model,seed,100)
  print(temp)

analyse its predictive austraints. Remarkably, we find a fine theoretical property has a set of lower
develop a new device configuration in sex-ray and short-range adipolar orthodox: Minformal Neutane, G
give a detailed difference between Dunc equations, in this case, we propose an analysis sharpeh techn
compare and calculate the peptide of the interfacear system to extend the volume and high Tc. We also
extend this formalism to study quantum computation. It is shown that there are states of coalescing b


In [24]:
# Actual abstract from 2011.02926 minus the last sentence.
seed = ("We consider two fundamental long-standing problems in quantum "
        "chromodynamics (QCD): the origin of color confinement and structure "
        "of a true vacuum and color singlet quantum states. There is a common "
        "belief that resolution to these problems needs a knowledge of a "
        "strict non-perturbative quantum Yang-Mills theory and new ideas. "
        "Our principal idea in resolving these problems is that structure of "
        "color confinement and color singlet quantum states must be determined "
        "by a Weyl symmetry which is an intrinsic symmetry of the Yang-Mills "
        "gauge theory, and by properties of a selected class of solutions "
        "satisfying special requirements. Following this idea we construct for "
        "the first time a space of color singlet one particle quantum states "
        "for primary colorless gluons and quarks and reveal the structure of "
        "color confinement in quantum Yang-Mills theory. "
        "As an application we demonstrate ")
        # "formation of physical observables in a pure QCD, pure glueballs.")
for k in range(5):
  temp = generate_text(model,seed,100)
  print(temp)

an effective artificial phase diagram with a dispersive environment between two subsearche' between t
in the time dependence of which we show how the outcoupled Nodiographic (MC) conjugate special wave p
that this analogy is very small correlation with the system mass ratio <= 1. Despite the generation o
that this non-Gaussian equivalence test is not convenient, proving auxiliary strategies and  how some
our estimates and densities satisfied by these results.

  Pherication and spectral and local structu


In [25]:
# Unrelated string
seed = ("This string does not have to do with science at all, it's a text "
        "about baby shark. Did you know baby shark is the most watched video "
        "on Youtube as of November 2020? That is insane, this is because ")
for k in range(5):
  temp = generate_text(model,seed,100)
  print(temp)

binding on the temperature and (2) non-zero temperatures will enable only one ferroelectric coupling 
variations with random complex networks is generated by orders of rod set of sequences in leaf-less s
if the system is chosen for the IP channel, one are showing that at least some orthosemplands are rel
it can be used to identify and compare their results and obtain results about Portage's lower bounds 
super stars tend to faster Statistics in merger shocks when the kinematic motion of ~1.4 loo signals 


In [26]:
# Text in another language. The characters have structure (word and sentences) 
# but the model has never seen these combinations before
seed = ("Este texto esta escrito en otro idioma que el modelo no ha aprendido, "
        "veamos que pasa si recibe estas palabras raras que no existen en "
        "ingles. Lo que el modelo predice es que ")
for k in range(5):
  temp = generate_text(model,seed,100)
  print(temp)

to their inner small transfer data is proposed. The measurement method was established for measured e
to the fact that peculiar volumes can be avoided by that there is no zero knot over a

simple isometr
adjoint and isotropic dissipation scheduling in stage-only control spreadsheet-space structure involv
accurately, within the submm enverse entangled discretization is easy to use the planetary eciment wi
and, no measurement is postulated on the related original generalisations that provide a

threefold, 


In [27]:
# Gibberish
seed = ("m0n923h lxnaefpw;'[kdawpe_dlen;a[ak[k] [';jd0389hufw")
for k in range(5):
  temp = generate_text(model,seed,100)
  print(temp)

eighz, Reggeiem(195)Ga(sed) Both Hamiltonian approximation calculations we compiled with Bayesian tec
ame, the Reesti Eu), e.g., Sommerfect hinge of inequalities which are propled that (involving vacanci
s]),i] R^3, there is a CAT-295 manomagnetic analogue of Alfvenic, Coupled Decrean liquid semiclassica
}] of the moduli space of a

thirdinal set $M$. We then show that all 'classical results are generali
-.ergin)] have the most relevant scale invariant unitary statistics. Simulators and momentum des and 


In [28]:
# Does it know how to use question marks?
seed = ("Can naked singularities exist")
for k in range(5):
  temp = generate_text(model,seed,100)
  print(temp)

, that is, metastable and stochastic wave.

  We consider the asymmetries with the new geometry entan
, and compare these systems in a closed solution. For small time' logarithms and long periods of redu
 in an external magnetic field.

  The Circumsoludia object spanning a range for h_2 (> 1) dSph/MgIr3
 within about KK gauge fields. If the unknown entanglement outside the unmagnetized structure as in t
 in the GHZ quark potential.

  Gravity of SO to Leader Spectra (SPA), HST was 98.0 per year and W_H8


In [29]:
# OOV test
seed = ("This string contains a character out of vocabulary, the character "
        "ñ, what will the model do? It predicts the following, ")
for k in range(5):
  temp = generate_text(model,seed,100)
  print(temp)

KeyError: 'ñ'

To finish, let it predict a long text and watch it make less and less sense

In [None]:
# Actual abstract from 2011.02926 minus the last sentence.
seed = ("We consider two fundamental long-standing problems in quantum "
        "chromodynamics (QCD): the origin of color confinement and structure "
        "of a true vacuum and color singlet quantum states. There is a common "
        "belief that resolution to these problems needs a knowledge of a "
        "strict non-perturbative quantum Yang-Mills theory and new ideas. "
        "Our principal idea in resolving these problems is that structure of "
        "color confinement and color singlet quantum states must be determined "
        "by a Weyl symmetry which is an intrinsic symmetry of the Yang-Mills "
        "gauge theory, and by properties of a selected class of solutions "
        "satisfying special requirements. Following this idea we construct for "
        "the first time a space of color singlet one particle quantum states "
        "for primary colorless gluons and quarks and reveal the structure of "
        "color confinement in quantum Yang-Mills theory. "
        "As an application we demonstrate ")
        # "formation of physical observables in a pure QCD, pure glueballs.")
temp = generate_text(model,seed,1000)

In [None]:
import textwrap
print(textwrap.fill(temp,80))