In [1]:
import pickle
import random
from pathlib import Path

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from nltk.tokenize import RegexpTokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Activation
from tensorflow.keras.optimizers import RMSprop

In [2]:
# Load the story corpus; every row of the `story` column is one text.
text4000 = pd.read_csv(".//Data/4000/4000-stories-VAD.csv")
# Drop missing rows and force `str` so the join below cannot fail on NaN.
texts = text4000["story"].dropna().astype(str).tolist()

# Join all stories into one string, separated by a sentinel word that marks
# the story boundaries inside the token stream.
trennzeichen = "trennzeichen"
joined_text = (" " + trennzeichen + " ").join(texts)

# Split the lower-cased text into word tokens (runs of \w characters).
tokenizer = RegexpTokenizer(r"\w+")
tokens = tokenizer.tokenize(joined_text.lower())

context_words = 10  # number of preceding words used to predict the next one
words_limiter = 189000  # upper bound on the number of training samples
input_words = []  # context windows, each a list of `context_words` tokens
next_words = []  # the word that follows each context window
counter = 0  # number of story separators that were skipped
i = 0
while i < len(tokens) - context_words:
    # Inspect the context window together with its target word.
    window = tokens[i:i + context_words + 1]
    if trennzeichen in window:
        # A sample must never span two stories. Checking the whole window
        # (not only the target position) also handles stories shorter than
        # `context_words`, where the separator would otherwise end up inside
        # the context. Restart right behind the first separator found.
        i += window.index(trennzeichen) + 1
        counter += 1
        continue

    input_words.append(window[:context_words])
    next_words.append(window[context_words])

    if len(next_words) >= words_limiter:
        break

    i += 1

print(len(next_words))
print(counter)
# Sanity checks: the separator must exist in the raw token stream but must
# not appear in any training sample.
if trennzeichen in tokens:
    print("Alles Gut. Das Trennzeichen ist in tokens enthalten.")
else:
    print("Fehler!!! Trennzeichen nicht in tokens!!!")
# `input_words` is a list of lists, so every window has to be searched;
# a plain `in` on the outer list would compare a string against lists and
# could never be True.
if any(trennzeichen in context for context in input_words):
    print("Fehler!!! Das Trennzeichen ist in input_words enthalten.!!!")
else:
    print("Alles Gut.")
if trennzeichen in next_words:
    print("Fehler!!! Das Trennzeichen ist in next_words enthalten.!!!")
else:
    print("Alles Gut.")

189000
197
Alles Gut. Das Trennzeichen ist in tokens enthalten.
Alles Gut.
Alles Gut.


In [3]:
# Disabled shortcut: load previously pickled `input_words` / `next_words`
# instead of rebuilding them from the corpus.
# NOTE(review): no visible cell writes these files under
# FormatedData/200000/RawData/, so they presumably come from another run — confirm.
# Only unpickle files you created yourself; pickle.load can execute arbitrary code.
# with open('FormatedData/200000/RawData/input_words.pickle', 'rb') as file:
#     input_words = pickle.load(file)
# with open('FormatedData/200000/RawData/next_words.pickle', 'rb') as file:
#     next_words = pickle.load(file)

In [4]:
# Flatten the list of context windows into a single 1-D array of words.
input_words_flat = np.concatenate(input_words).reshape(-1)

# Append the target words so the array covers every word of the samples.
combined_array = np.hstack((input_words_flat, next_words))

# Collapse the array into one long, space-separated string.
combined_string = " ".join(combined_array)

In [5]:
# Build the word -> integer vocabulary with the Keras tokenizer. This rebinds
# `tokenizer`, which previously referred to the NLTK RegexpTokenizer.
#
# The text is already split into \w+ tokens, so Keras' character filter is
# switched off. The default filter removes "_" (which \w matches); a token
# containing an underscore would then be split or dropped, and its sequence
# would no longer hold exactly one id per word.
tokenizer = Tokenizer(filters="")
tokenizer.fit_on_texts([combined_string])

word_index = tokenizer.word_index
# +1 because Keras never assigns index 0 to a word.
total_unique_words = len(word_index) + 1
print(total_unique_words)

13712


In [6]:
# Translate words into the integer ids assigned by the fitted tokenizer.
input_sequences = tokenizer.texts_to_sequences(input_words)
next_sequences = tokenizer.texts_to_sequences(next_words)

# Show the second sample (index 1) before and after the conversion.
print("input_words:", input_words[1], sep="\n")
print("next_word:", next_words[1], sep="\n")
print("Input Sequences:", input_sequences[1], sep="\n")
print("Next Sequences:", next_sequences[1], sep="\n")


input_words:
['i', 'if', 'you', 'don', 't', 'like', 'christmas', 'stories', 'don', 't']
next_word:
read
Input Sequences:
[9, 49, 13, 78, 30, 54, 284, 1754, 78, 30]
Next Sequences:
[335]


In [7]:
embedding_dim = 100  # dimensionality of the GloVe vectors in the file below
embeddings_index = {}  # word -> GloVe coefficient vector

# Parse the GloVe text file: each line is a word followed by its coefficients.
path = 'glove.6B/glove.6B.100d.txt'
with open(path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coeffs = np.array(values[1:], dtype=np.float32)
        embeddings_index[word] = coeffs

# One row per tokenizer index. The matrix is sized by the vocabulary, not by
# `words_limiter`: that value caps the number of training samples and is
# unrelated to the vocabulary size. The model passes this matrix to an
# Embedding layer with `input_dim=total_unique_words`, so the row count has
# to match that value exactly.
num_words = total_unique_words
embeddings_matrix = np.zeros((num_words, embedding_dim))  # row 0 stays zero

# Seeded generator so the fallback vectors are identical on every run.
rng = np.random.default_rng(42)

for word, index in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # Word unknown to GloVe: fall back to a small random vector.
        embedding_vector = rng.uniform(-0.25, 0.25, embedding_dim)
    embeddings_matrix[index] = embedding_vector


In [8]:
# Next-word model: frozen pre-computed embeddings, two stacked LSTMs and a
# softmax over the whole vocabulary.
# The embedding width and the sequence length reuse `embedding_dim` and
# `context_words` instead of repeating the literals 100 and 10, so the model
# cannot drift out of sync with the embedding matrix or the window size.
model = Sequential()
model.add(Embedding(
    input_dim=total_unique_words,
    output_dim=embedding_dim,
    weights=[embeddings_matrix],
    input_length=context_words,
    trainable=False,
))
# The first LSTM returns the full sequence so the second LSTM can consume it.
model.add(LSTM(128, recurrent_dropout=0.5, dropout=0.5, return_sequences=True))
model.add(LSTM(128, recurrent_dropout=0.5, dropout=0.5))
model.add(Dense(total_unique_words, activation="softmax"))

In [9]:
# Positional hold-out: the first 95 % of the samples are used for training,
# the remaining 5 % for validation.
split_index = int(0.95 * len(input_sequences))

x_train = input_sequences[:split_index]
x_val = input_sequences[split_index:]

y_train = next_sequences[:split_index]
y_val = next_sequences[split_index:]

In [10]:
# Show the first input/target pair of the training and the validation split.
print(x_train[0], y_train[0], x_val[0], y_val[0], sep="\n")

[396, 9, 49, 13, 78, 30, 54, 284, 1754, 78]
[30]
[9, 1732, 331, 16, 8, 233, 506, 2, 1716, 848]
[1057]


In [11]:
# Targets are integer word ids, hence the sparse categorical cross-entropy.
model.compile(
    optimizer=RMSprop(learning_rate=0.01),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train and keep only the metrics dictionary of the returned History object.
history = model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=1,
    shuffle=True,
    validation_data=(x_val, y_val),
).history

 113/1403 [=>............................] - ETA: 1:40 - loss: 7.1881 - accuracy: 0.0533

KeyboardInterrupt: 

In [None]:
# Persist the trained model and its training history.
# The model is written to Model2/, the directory it is later reloaded from
# and where the history and metadata are stored; the previous target
# 'E_Model/Model.h5' did not match the path used by load_model.
Path('Model2').mkdir(parents=True, exist_ok=True)  # saving fails if the folder is missing
model.save('Model2/Model.h5')
# Context manager so the file is flushed and closed deterministically.
with open("Model2/history.p", "wb") as file:
    pickle.dump(history, file)

In [None]:
# Store everything needed to encode inputs / decode predictions at inference.
# `unique_tokens` and `unique_token_index` were never defined in this notebook
# (NameError on a fresh kernel); they are derived from the tokenizer's
# `word_index` here.
# NOTE(review): ids follow the Keras tokenizer (index 0 is unused), so
# `unique_tokens[0]` is an empty placeholder — confirm that consumers of these
# pickles expect this layout.
unique_token_index = dict(word_index)  # word -> integer id
unique_tokens = [""] * (len(unique_token_index) + 1)  # integer id -> word
for token, index in unique_token_index.items():
    unique_tokens[index] = token

Path('Model2/MetaData').mkdir(parents=True, exist_ok=True)
with open('Model2/MetaData/context_words.pickle', 'wb') as file:
    pickle.dump(context_words, file)
with open('Model2/MetaData/unique_tokens.pickle', 'wb') as file:
    pickle.dump(unique_tokens, file)
with open('Model2/MetaData/unique_token_index.pickle', 'wb') as file:
    pickle.dump(unique_token_index, file)

In [None]:
# Reload the saved model and training history from disk.
model = load_model('Model2/Model.h5')
# pickle.load can execute arbitrary code: only open history files written by
# this notebook. The context manager closes the file handle again.
with open("Model2/history.p", "rb") as file:
    history = pickle.load(file)

# Training vs. validation accuracy per epoch. Markers are required because a
# run with a single epoch yields one point per series, which a plain line
# plot would not draw at all.
plt.plot(history['accuracy'], marker='o')
plt.plot(history['val_accuracy'], marker='o')
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')

In [None]:
# Training vs. validation loss per epoch. Markers keep the plot readable when
# the history holds a single epoch (one point per series draws no line).
plt.plot(history['loss'], marker='o')
plt.plot(history['val_loss'], marker='o')
plt.title('Model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')