In [1]:
# Code created using ChatGPT-4, no promises it works correctly

import glob
from lxml import etree
from nltk.tokenize import sent_tokenize, word_tokenize

def read_and_tokenize_polish_corpus(corpus_folder):
    tokenized_sentences = []

    # Iterate over all XML files in the corpus folder
    for file_path in glob.glob(f"{corpus_folder}/**/*.xml", recursive=True):
        # Parse the XML file
        tree = etree.parse(file_path)

        # Retrieve the XML namespaces and register a default namespace
        namespaces = tree.getroot().nsmap.copy()
        namespaces['default'] = namespaces.pop(None)

        # Extract all text content from the <ab> elements
        abs = tree.xpath("//default:ab", namespaces=namespaces)
        for ab in abs:
            text = " ".join(ab.xpath(".//text()"))

            # Tokenize sentences and words
            if text:
                for sentence in sent_tokenize(text, language="polish"):
                    tokenized_sentences.append(word_tokenize(sentence, language="polish"))

    return tokenized_sentences

polish_tokenized_sentences = read_and_tokenize_polish_corpus("polish_corpus")

In [3]:
import os
import xml.etree.ElementTree as ET

def parse_ccl(xml_string):
    root = ET.fromstring(xml_string)
    sentences = []

    for chunk in root.findall('chunk'):
        for sentence in chunk.findall('sentence'):
            sentence_text = ''

            for tok in sentence.findall('tok'):
                orth = tok.find('orth').text
                sentence_text += orth + ' '

            sentences.append(sentence_text.strip())

    return sentences


def process_directory(path):
    sentencesList = []
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.endswith('.xml'):
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8') as xml_file:
                    xml_string = xml_file.read()
                    sentences = parse_ccl(xml_string)
                    for sentence in sentences:
                        sentencesList.append(sentence)
    return sentencesList



def read_and_tokenize_polish_corpus_kpwr(corpus_folder):

    tokenized_sentences = []

    sentences = process_directory(corpus_folder)
    for sentence in sentences:
      tokenized_sentences.append(word_tokenize(sentence, language="polish"))

    return tokenized_sentences

corpus_folder = './polish_corpus_kpwr'
kpwr_tokenized_sentences = read_and_tokenize_polish_corpus_kpwr(corpus_folder)

In [2]:
import pickle

def save_tokenized_sentences(tokenized_sentences, file_path):
    with open(file_path, 'wb') as file:
        pickle.dump(tokenized_sentences, file)

def load_tokenized_sentences(file_path):
    with open(file_path, 'rb') as file:
        tokenized_sentences = pickle.load(file)
    return tokenized_sentences

In [11]:
save_tokenized_sentences(polish_tokenized_sentences, './polish_corpus/polish_tokenized_sentences.pkl')
save_tokenized_sentences(kpwr_tokenized_sentences, './polish_corpus/kpwr_tokenized_sentences.pkl')

tokenized_sentences = []
tokenized_sentences.extend(polish_tokenized_sentences)
tokenized_sentences.extend(kpwr_tokenized_sentences)
save_tokenized_sentences(tokenized_sentences, './polish_corpus/tokenized_sentences.pkl')

In [3]:
import pickle
tokenized_sentences = load_tokenized_sentences('./polish_corpus/tokenized_sentences.pkl')

In [14]:
import string

def preprocess_text(sentences):
    table = str.maketrans("", "", string.punctuation) # removing punctuation - it has to be a conscious decision
    tokenized_sentences = [[word.lower().translate(table) for word in sentence] for sentence in sentences]
    new_tokenized_sentences = []
    for sentence in tokenized_sentences:
        new_sentence = []
        for x in sentence:
            if x != '':
                new_sentence.append(x)
        new_tokenized_sentences.append(' '.join(new_sentence))

    removed_empty_sentences = []
    for sentence in new_tokenized_sentences:
        if len(sentence) > 1:
            removed_empty_sentences.append(sentence)

    return removed_empty_sentences

preprocessed_tokenized_sentences = preprocess_text(tokenized_sentences)

In [15]:
len(preprocessed_tokenized_sentences)

110438

In [16]:
# Word-level generation and prediction
# Toy-problem, not using the previous corpuses
# Import necessary libraries
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Input
from gensim.models import Word2Vec

# Load the Word2Vec model and text corpus data
word2vec_model = Word2Vec.load("word2vec_large_model.model")
# corpus = ['This is the second sentence to train the model', 'This is the first sentence', 'This is the third sentence just because']
corpus = preprocessed_tokenized_sentences
embedding_dim = 100


def extract_target_words(corpus):
    target_words = []
    for sentence in corpus:
        words = sentence.split()
        target_word = words[-1]  # Get the last word in the sentence
        target_words.append(target_word)
    return target_words

target_words = extract_target_words(corpus)

# Create target_vectors
target_vectors = np.zeros((len(target_words), embedding_dim))

for idx, word in enumerate(target_words):
    try:
        target_vectors[idx] = word2vec_model.wv[word]
    except KeyError:
        target_vectors[idx] = np.random.rand(embedding_dim)  # initialize randomly

# Transform each word in the corpus into a corresponding vector representation
corpus_vectors = []
for sentence in corpus:
    sentence_vectors = []
    for word in sentence.split():
        try:
            sentence_vectors.append(word2vec_model.wv[word])
        except KeyError:
            sentence_vectors.append(np.random.rand(100))  # initialize randomly
    corpus_vectors.append(sentence_vectors)

# Pad each sentence in the corpus to a fixed length, as required by the LSTM model
'''
Padding is performed by adding filler values (usually zeros) to the sequences until they reach the desired length.
Padding is necessary for LSTMs (Long Short-Term Memory) that process input sequences of a fixed length.


There are two common padding strategies:
Pre-padding: Filler values are added to the beginning of the sequence.
Post-padding: Filler values are added to the end of the sequence.

The max_len variable is set to the length of the longest sequence in the corpus. B
y padding all sequences to this length, we ensure that the input to the LSTM model is
standardized and can be processed correctly.

It's important to note that padding can affect model performance.
Very short sequences may be dominated by the padding values, which can introduce
noise and make it harder for the model to learn meaningful patterns.
On the other hand, excessively long padding can lead to increased computational
requirements and memory consumption.
Choosing an appropriate padding length based on the dataset and problem is crucial for achieving good performance.
'''

max_len = max(len(sentence) for sentence in corpus_vectors)
corpus_vectors_padded = np.zeros((len(corpus_vectors), max_len, embedding_dim), dtype='float32')

for i, sentence_vectors in enumerate(corpus_vectors):
    for j, vector in enumerate(sentence_vectors):
        corpus_vectors_padded[i, j, :] = vector

# Build an LSTM model in Keras with appropriate input and output layers
embedding_dim = 100
model = Sequential()
model.add(Input(shape=(max_len, embedding_dim)))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(embedding_dim))
model.compile(loss='mse', optimizer='adam')

# Train the LSTM model on the prepared data
model.fit(corpus_vectors_padded, target_vectors, epochs=1, batch_size=32, validation_split=0.1)



<keras.callbacks.History at 0x223a22c1850>

In [25]:
model.fit(corpus_vectors_padded, target_vectors, epochs=1, batch_size=32, validation_split=0.1)



<keras.callbacks.History at 0x2216aaa3190>

In [23]:
corpus[25]

'plama piętaka jedna spośród kilku najznakomitszych współczesnych powieści także ze względu na jej zaklasyfikowanie wraz z całą twórczością tego pisarza do nurtu wiejskiego nie ma w odbiorze powszechnym tej rangi jaką rzeczywiście posiada'

In [26]:
def preprocess_input_text(text, max_len):
    # Transform input text into vector representation
    input_vectors = []
    for word in text.split():
        try:
            input_vectors.append(word2vec_model.wv[word])
        except KeyError:
            input_vectors.append(np.random.rand(embedding_dim))  # initialize randomly

    # Pad the input text to the required length
    input_vectors_padded = np.zeros((1, max_len, embedding_dim), dtype='float32')
    for j, vector in enumerate(input_vectors):
        input_vectors_padded[0, j, :] = vector

    return input_vectors_padded

def generate_text(model, input_text, num_words_to_generate=10):
    generated_text = input_text

    for _ in range(num_words_to_generate):
        # Preprocess and pad the input text
        input_vectors_padded = preprocess_input_text(generated_text, max_len)

        # Predict the next word using the LSTM model
        prediction = model.predict(input_vectors_padded)
        predicted_vector = prediction[0]

        # Find the corresponding word for the predicted index
        predicted_word = word2vec_model.wv.most_similar(positive=[predicted_vector], topn=1)[0][0]

        # Append the predicted word to the input text
        generated_text += " " + predicted_word

    return generated_text


# Example usage
input_text = "kilku najznakomitszych"
num_words_to_generate = 4
generated_text = generate_text(model, input_text, num_words_to_generate)
print(generated_text)

kilku najznakomitszych kosova kosova kosova kosova
