# NLP - Basics

In [1]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

Using TensorFlow backend.


### Der Text, der durch das Netzwerk gelernt werden soll

In [2]:
# Training corpus: a short English physics passage that the network learns to
# continue word by word. The text is runtime data for the tokenizer, so it is
# kept verbatim (spelling included) -- editing it changes the vocabulary.
data_phd = """Before explaing how bound states can be naturally embedded in the framework of the
S-matrix, we shall first of all derive a generic expression for the bound state in terms of
the microscopic degrees of freedom of the underlying theory which is independent of any
asymptotic construction. Indeed, when computing observables connected to the interior of
the black hole we will make heavy use of this non-asymptotic construction.
As explained before, at the kinematical level all quantum states are identified by their
quantum numbers. In particular, from the point of view of representing a bound state in
terms of Fock eigenstates constructed from the weakly coupled degrees of freedom appearing in the microscopic Lagrangian, it is clear that only those Fock states have non-vanishing
overlap with the bound state which carry the same quantum numbers as the latter. In
other words, these states should have quantum numbers in accordance with the intrinsic
symmetries at work (such as gauge symmetries), and with the isometries characterizing
bound states in Minkowski space–time. Furthermore, the state has to be characterized
according to the Casimir operators of Minkowski, i.e. mass squared and spin. Including all these quantum numbers, collectively denoted as L, leads to a complete kinematic
characterization of the bound state in question.
"""

### Konvertierung der Sätze in numerisches Format

In [3]:
# Fit a word-level tokenizer on the corpus, then encode the corpus itself as a
# flat list of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data_phd])
encoded = tokenizer.texts_to_sequences([data_phd])[0]

word_index = tokenizer.word_index

# Show both ends of the id -> word table as a quick sanity check.
index_word_pairs = list(tokenizer.index_word.items())
print(index_word_pairs[:10])
print(index_word_pairs[-10:])

[(1, 'the'), (2, 'of'), (3, 'in'), (4, 'bound'), (5, 'states'), (6, 'state'), (7, 'quantum'), (8, 'to'), (9, 'as'), (10, 'numbers')]
[(113, 'spin'), (114, 'including'), (115, 'collectively'), (116, 'denoted'), (117, 'l'), (118, 'leads'), (119, 'complete'), (120, 'kinematic'), (121, 'characterization'), (122, 'question')]


In [4]:
# The whole corpus as a sequence of integer word ids, in reading order.
print(encoded)

[14, 34, 35, 4, 5, 36, 15, 37, 38, 3, 1, 39, 2, 1, 40, 41, 16, 42, 43, 2, 11, 44, 12, 45, 46, 47, 1, 4, 6, 3, 17, 2, 1, 18, 19, 2, 20, 2, 1, 48, 49, 21, 22, 50, 2, 51, 23, 24, 52, 53, 54, 55, 56, 8, 1, 57, 2, 1, 58, 59, 16, 60, 61, 62, 63, 2, 64, 25, 23, 24, 9, 65, 14, 26, 1, 66, 67, 11, 7, 5, 68, 69, 70, 71, 7, 10, 3, 72, 27, 1, 73, 2, 74, 2, 75, 12, 4, 6, 3, 17, 2, 28, 76, 77, 27, 1, 78, 79, 19, 2, 20, 80, 3, 1, 18, 81, 82, 22, 83, 84, 85, 86, 28, 5, 29, 25, 87, 88, 13, 1, 4, 6, 21, 89, 1, 90, 7, 10, 9, 1, 91, 3, 92, 93, 30, 5, 94, 29, 7, 10, 3, 95, 13, 1, 96, 31, 26, 97, 98, 9, 99, 31, 32, 13, 1, 100, 101, 4, 5, 3, 33, 102, 103, 1, 6, 104, 8, 15, 105, 106, 8, 1, 107, 108, 2, 33, 109, 110, 111, 112, 32, 113, 114, 11, 30, 7, 10, 115, 116, 9, 117, 118, 8, 12, 119, 120, 121, 2, 1, 4, 6, 3, 122]


### Da wir daran interessiert sind, aus einer Folge von beliebigen Wörtern aus unserem Text das Folgewort zu konstruieren, brauchen wir Labels für alle Wörter im Vokabular. Daher benötigen wir die Größe des Vokabulars. Diese geht auch in den ersten Layer unseres Netzwerkes ein. Die 1 wird addiert, weil der Tokenizer die Wort-Indizes ab 1 vergibt: Der größte Index entspricht der Anzahl der Wörter, und sowohl der Embedding-Layer als auch die später aufgerufene to_categorical-Funktion benötigen als Klassenanzahl den größten Index + 1.

In [5]:
# Word ids start at 1 (see the index_word listing), so ids range over
# 0..len(word_index); the number of classes is therefore len(word_index) + 1.
vocab_size = len(tokenizer.word_index) + 1
print('Anzahl an unterschiedlichen Wörtern: %d' % vocab_size)

Anzahl an unterschiedlichen Wörtern: 123


### Im nächsten Schritt konstruieren wir Teilsätze mit (in diesem Fall) 5 Wörtern. Dabei werden die ersten 4 Wörter jeweils die Rolle der Features und das letzte Wort die Rolle der vorherzusagenden Variable einnehmen.

In [6]:
# Slide a fixed-size window over the token ids: the first 4 ids of every
# window are the features, the 5th id is the word to predict.
window_size = 5  # 4 context tokens + 1 target token
if len(encoded) < window_size:
    # Without this check the max() below would fail on an empty list with a
    # much less helpful message.
    raise ValueError('Text too short: need at least %d tokens' % window_size)
sequences = [encoded[i - window_size + 1:i + 1]
             for i in range(window_size - 1, len(encoded))]
print('Total Sequences: %d' % len(sequences))

# All windows already share the same length, so no actual padding happens
# here; max_length is kept because the model definition uses it.
max_length = max(len(seq) for seq in sequences)
sequences = pad_sequences(sequences, maxlen=max_length)
print(sequences[0])
print(sequences[1])
print('Max Sequence Length: %d' % max_length)

# Split every window into features (all but the last id) and label (last id);
# the labels are one-hot encoded over the whole vocabulary.
sequences = array(sequences)
X, y = sequences[:, :-1], sequences[:, -1]
y = to_categorical(y, num_classes=vocab_size)
y.shape

Total Sequences: 209
[14 34 35  4  5]
[34 35  4  5 36]
Max Sequence Length: 5


(209, 123)

In [7]:
# Collect every run of 5 consecutive token ids: 4 context words -> 1 target word.
sequences = [encoded[end - 4:end + 1] for end in range(4, len(encoded))]
print('Anzahl an Wörterfolgen der Länge 5: %d' % len(sequences))

# Bring all windows to length 5 (filling on the left if one were shorter).
sequences = pad_sequences(sequences, padding='pre', maxlen=5)
print(sequences[0])
print(sequences[1])

# Features are the first 4 ids of each window, the label is the one-hot
# encoded last id.
sequences = array(sequences)
X = sequences[:, :-1]
y = to_categorical(sequences[:, -1], num_classes=vocab_size)
Anzahl an Wörterfolgen der Länge 5: 209
[14 34 35  4  5]
[34 35  4  5 36]


### Nun konstruieren wir ein neuronales Netzwerk zur Vorhersage von Folgewörtern. Dieses besteht aus einem initialen Embedding-Layer, der individuellen Wörtern Vektoren mit Gewichten zuordnet, die trainiert werden. Dieser Schicht folgen die rekurrenten Zellen (wir wählen Long-Short-Term-Memory-Architekturen), also das "Gedächtnis" des Netzwerks. Zuletzt gibt es einen Dense-Layer, der wie gewohnt auf den Output projiziert.

In [15]:
from keras.models import Model
import keras.layers as L

def build_model():
    """Assemble an Embedding -> LSTM network with the Keras functional API.

    Each of the ``vocab_size`` token ids is embedded as a 10-dimensional
    vector and the embedded sequence is fed to an LSTM with 30 units. The
    model ends at the LSTM output; no Dense/softmax layer is attached here.

    Returns:
        The uncompiled ``Model`` mapping the token-id input to the LSTM output.
    """
    token_ids = L.Input(shape=(None,))  # sequence length is left unspecified
    embedded = L.Embedding(vocab_size, 10)(token_ids)
    lstm_out = L.LSTM(30)(embedded)
    return Model(token_ids, lstm_out)
    

In [16]:
# Instantiate the functional-API model and print its layer/parameter table.
model = build_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding_3 (Embedding)      (None, None, 10)          1230      
_________________________________________________________________
lstm_3 (LSTM)                (None, 30)                4920      
Total params: 6,150
Trainable params: 6,150
Non-trainable params: 0
_________________________________________________________________


In [1]:
# Next-word classifier: Embedding -> LSTM -> softmax over the vocabulary.
model = Sequential()
# Every sample consists of max_length - 1 context tokens; the last token of a
# window is the label and is not fed to the network.
model.add(Embedding(vocab_size, 10, input_length=max_length-1))
# LSTM(30) without return_sequences emits one 30-dim vector per sample, so its
# output can go straight into the Dense layer -- no Flatten layer is needed.
model.add(LSTM(30))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would print an extra "None").
model.summary()

# The labels are one-hot vectors, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X, y, epochs=300, verbose=2)

NameError: name 'Sequential' is not defined

In [10]:
# Development aid: IPython help lookup for the Embedding layer (not needed to run the notebook).
?Embedding

### Schließlich wollen wir unser Modell testen. Dazu schreiben wir eine Methode, die anhand eines Inputs von 4 aufeinanderfolgenden Wörtern sukzessive den Rest des Textes generiert.

In [None]:
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
    """Greedily extend ``seed_text`` by ``n_words`` predicted words.

    In every step the text generated so far is tokenized, padded/truncated to
    ``max_length`` token ids, and passed to the model; the word with the
    highest predicted score is appended to the text.

    Args:
        model: Trained model whose ``predict`` returns one score per
            vocabulary index for each input sequence.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        max_length: Number of token ids fed to the model per step (the
            model's input length, not the length of the training windows).
        seed_text: Text to start from.
        n_words: Number of words to generate.

    Returns:
        ``seed_text`` followed by the generated words, separated by single
        spaces. An index without a matching word (e.g. the padding index 0)
        contributes an empty word.
    """
    # Build the id -> word table once instead of scanning word_index for
    # every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    in_text = seed_text
    for _ in range(n_words):
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
        # predict_classes() is no longer available in current Keras releases;
        # taking the argmax of predict() is the equivalent that works in old
        # and new versions alike.
        scores = model.predict(encoded, verbose=0)
        next_index = int(scores.argmax(axis=-1)[0])
        # map predicted word index to word and append it to the input
        in_text += ' ' + index_to_word.get(next_index, '')
    return in_text

# Qualitative check: let the model continue a 4-word seed from the corpus by
# 40 words. The 4 is the number of context tokens the model is fed per step.
print(generate_seq(model, tokenizer, 4, 'how bound states can', 40))