In [1]:
import pathlib
import os
import xml.etree.ElementTree as ET
import nltk
import string
import pickle

from tqdm import tqdm
from nltk.tokenize import sent_tokenize, word_tokenize

# 'punkt' provides the sentence tokenizer models that read_tokenized_sentences
# loads ('tokenizers/punkt/polish.pickle').
nltk.download('punkt')
# NOTE(review): no part-of-speech tagging is visible in this notebook — confirm
# that this download is still needed.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cubix\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\cubix\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
def save_tokenized_sentences(tokenized_sentences, file_path):
    """Pickle ``tokenized_sentences`` to ``file_path``.

    Args:
        tokenized_sentences: Object to serialise (in this notebook, a list of
            token lists).
        file_path: Destination file. It is overwritten if it already exists;
            missing parent directories are created first.
    """
    # open(..., 'wb') does not create directories, so make sure the target
    # folder exists before writing.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'wb') as file:
        pickle.dump(tokenized_sentences, file)

def load_tokenized_sentences(file_path):
    """Read back the object stored in the pickle file at ``file_path``.

    Args:
        file_path: Path of a pickle file, e.g. one written by
            ``save_tokenized_sentences``.

    Returns:
        The unpickled object.
    """
    # Only unpickle files you created yourself: pickle can execute arbitrary
    # code while loading.
    with open(file_path, 'rb') as pickle_file:
        return pickle.load(pickle_file)

def read_tokenized_sentences(folder_path):
    """Collect whitespace-tokenized sentences from a corpus folder.

    Every direct child of ``folder_path`` that contains a
    ``text_structure.xml`` file is parsed. The text of each TEI ``<p>``
    element is split into sentences with NLTK's Polish Punkt model, and each
    sentence is split on whitespace (punctuation stays attached to the words).

    Args:
        folder_path: Corpus root directory; also used as the progress-bar label.

    Returns:
        A list with one list of string tokens per sentence, in traversal order.
    """
    # Loop-invariant: load the sentence splitter once instead of once per paragraph.
    tokenizer = nltk.data.load('tokenizers/punkt/polish.pickle')
    paragraph_query = ".//{http://www.tei-c.org/ns/1.0}p"

    paths = list(pathlib.Path(folder_path).iterdir())
    tokenized_sentences = []

    for path in tqdm(paths, desc=folder_path):
        file_path = os.path.join(path, "text_structure.xml")
        if not os.path.isfile(file_path):
            # No document here (this also skips stray plain files in the root).
            continue

        root = ET.parse(file_path).getroot()
        for element in root.findall(paragraph_query):
            text = element.text
            if text is None:
                # The paragraph has no leading text; nothing to tokenize.
                continue
            for sentence in tokenizer.tokenize(text):
                tokenized_sentences.append(sentence.split())

    return tokenized_sentences

def preprocess_text(sentences):
    """Normalise tokenized sentences for training.

    Every token is lower-cased and stripped of the characters in
    ``string.punctuation`` (ASCII punctuation only; this removal is a
    deliberate choice). Tokens that become empty are dropped, and only
    sentences with at least two remaining tokens are kept.

    Args:
        sentences: Iterable of sentences, each an iterable of string tokens.

    Returns:
        A list of cleaned sentences, each a list of strings.
    """
    strip_punctuation = str.maketrans("", "", string.punctuation)

    cleaned_sentences = []
    for sentence in sentences:
        normalised = (token.lower().translate(strip_punctuation) for token in sentence)
        words = [word for word in normalised if word != '']
        if len(words) > 1:
            cleaned_sentences.append(words)

    return cleaned_sentences

In [73]:
# Parse the corpus under 'nkjp-wikipedia' into whitespace-tokenized sentences;
# the bare len() displays how many raw sentences were extracted.
wikipedia_sentences = read_tokenized_sentences('nkjp-wikipedia')
len(wikipedia_sentences)

nkjp-wikipedia: 100%|██████████| 634/634 [05:08<00:00,  2.05it/s]


6310933

In [74]:
# Sub-corpus folders to read; each one is handed to read_tokenized_sentences.
scal_paths = [
    '5SCAL-free/4/IJP/_internet/senat/xml/k5',
    '5SCAL-free/4/IJP/_internet/senat/xml/k6',
    '5SCAL-free/4/IJP/_internet/senat/xml/k7',
    '5SCAL-free/KONWERTOWANE/IPI/KomisjaSledczaRywin',
    '5SCAL-free/KONWERTOWANE/IPI/KSIAZKI-popr',
    '5SCAL-free/KONWERTOWANE/IPI/law2-agr',
    '5SCAL-free/KONWERTOWANE/IPI/sejm',
    '5SCAL-free/KONWERTOWANE/IPI/senat',
    '5SCAL-free/KONWERTOWANE/Lodz/pelcra/pelcra_xml/misc-published/sejmowe',
    '5SCAL-free/KONWERTOWANE/Lodz/pelcra/pelcra_xml/misc-published/ustawy',
    '5SCAL-free/KONWERTOWANE/Lodz/pelcra/pelcra_xml/transcript'
]

# Accumulate the sentences of all sub-corpora, in the order listed above.
scal_sentences = []

for folder in scal_paths:
    scal_sentences.extend(read_tokenized_sentences(folder))

5SCAL-free/4/IJP/_internet/senat/xml/k5: 100%|██████████| 29/29 [00:00<00:00, 72.86it/s]
5SCAL-free/4/IJP/_internet/senat/xml/k6: 100%|██████████| 39/39 [00:00<00:00, 63.93it/s]
5SCAL-free/4/IJP/_internet/senat/xml/k7: 100%|██████████| 54/54 [00:00<00:00, 69.44it/s]
5SCAL-free/KONWERTOWANE/IPI/KomisjaSledczaRywin: 100%|██████████| 89/89 [00:01<00:00, 71.08it/s]
5SCAL-free/KONWERTOWANE/IPI/KSIAZKI-popr: 100%|██████████| 2/2 [00:00<00:00, 11.98it/s]
5SCAL-free/KONWERTOWANE/IPI/law2-agr: 100%|██████████| 50/50 [00:25<00:00,  1.99it/s]
5SCAL-free/KONWERTOWANE/IPI/sejm: 100%|██████████| 968/968 [00:28<00:00, 34.25it/s]
5SCAL-free/KONWERTOWANE/IPI/senat: 100%|██████████| 553/553 [00:11<00:00, 46.19it/s]
5SCAL-free/KONWERTOWANE/Lodz/pelcra/pelcra_xml/misc-published/sejmowe: 100%|██████████| 44/44 [00:00<00:00, 59.60it/s]
5SCAL-free/KONWERTOWANE/Lodz/pelcra/pelcra_xml/misc-published/ustawy: 100%|██████████| 11/11 [00:00<00:00, 12.02it/s]
5SCAL-free/KONWERTOWANE/Lodz/pelcra/pelcra_xml/transcrip

In [75]:
# Number of SCAL sentences collected from all folders in scal_paths.
len(scal_sentences)

573561

In [76]:
# Lower-case, strip ASCII punctuation and drop sentences with fewer than two tokens.
# The name is rebound, so the raw sentences are no longer available after this cell.
wikipedia_sentences = preprocess_text(wikipedia_sentences)

In [77]:
# Number of Wikipedia sentences currently held in wikipedia_sentences.
len(wikipedia_sentences)

5767033

In [78]:
# Lower-case, strip ASCII punctuation and drop sentences with fewer than two tokens.
# The name is rebound, so the raw sentences are no longer available after this cell.
scal_sentences = preprocess_text(scal_sentences)

In [79]:
# Number of SCAL sentences currently held in scal_sentences.
len(scal_sentences)

445267

In [80]:
# Spot-check: display the first SCAL sentence.
scal_sentences[0]

['14',
 'stycznia',
 '2003',
 'r',
 'komisja',
 'śledcza',
 'do',
 'zbadania',
 'ujawnionych',
 'w',
 'mediach',
 'zarzutów',
 'dotyczących',
 'przypadków',
 'korupcji',
 'podczas',
 'prac',
 'nad',
 'nowelizacją',
 'ustawy',
 'o',
 'radiofonii',
 'i',
 'telewizji',
 'obradująca',
 'pod',
 'przewodnictwem',
 'marszałka',
 'sejmu',
 'marka',
 'borowskiego',
 'dokonała']

In [81]:
# Spot-check: display the first Wikipedia sentence.
wikipedia_sentences[0]

['awk',
 'jest',
 'interpretowanym',
 'językiem',
 'programowania',
 'którego',
 'główną',
 'funkcją',
 'jest',
 'wyszukiwanie',
 'i',
 'przetwarzanie',
 'wzorców']

In [82]:
# Persist both corpora as pickles so they can be reloaded with
# load_tokenized_sentences instead of re-parsing the XML files.
save_tokenized_sentences(scal_sentences, 'pickles/scal_tokenized_sentences.pkl')
save_tokenized_sentences(wikipedia_sentences, 'pickles/wikipedia_tokenized_sentences.pkl')

In [3]:
# Reload the two pickled corpora from disk.
scal_sentences = load_tokenized_sentences('pickles/scal_tokenized_sentences.pkl')
wikipedia_sentences = load_tokenized_sentences('pickles/wikipedia_tokenized_sentences.pkl')

# Combined corpus: all SCAL sentences first, followed by all Wikipedia sentences.
tokenized_sentences = [*scal_sentences, *wikipedia_sentences]

MemoryError: 

In [5]:
# Size of the combined corpus (SCAL + Wikipedia sentences).
len(tokenized_sentences)

6212300

In [None]:
# Word-level generation and prediction
# Toy problem: trains on the corpus built above (`tokenized_sentences`); the commented-out three-sentence corpus below is a small stand-in
# Import necessary libraries
import numpy as np
from gensim.models import Word2Vec
from tensorflow.keras.layers import Dense, Input, LSTM, Masking
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the Word2Vec model and text corpus data
word2vec_model = Word2Vec.load("word2vec_large_model.model")
# corpus = ['This is the second sentence to train the model', 'This is the first sentence', 'This is the third sentence just because']
corpus = tokenized_sentences
# Read the embedding width from the loaded model instead of hard-coding it, so the
# arrays built from it always match the vectors returned by word2vec_model.wv.
embedding_dim = word2vec_model.wv.vector_size


def extract_target_words(corpus):
    """Return the last element of every sentence in ``corpus``.

    Args:
        corpus: Iterable of non-empty, indexable sentences (e.g. lists of words).

    Returns:
        A list with one target word per sentence, in corpus order.

    Raises:
        IndexError: If a sentence is empty.
    """
    return [sentence[-1] for sentence in corpus]

# The regression target of each sentence is its last word.
target_words = extract_target_words(corpus)

# Create target_vectors: one row per sentence, holding the embedding of its target word
target_vectors = np.zeros((len(target_words), embedding_dim))

for idx, word in enumerate(target_words):
    try:
        target_vectors[idx] = word2vec_model.wv[word]
    except KeyError:
        # Word is missing from the Word2Vec vocabulary: fall back to a fresh
        # uniform [0, 1) vector.
        # NOTE(review): no seed is set in the visible code, so the same unknown
        # word gets a different target on every occurrence and every run —
        # confirm this is intended.
        target_vectors[idx] = np.random.rand(embedding_dim)  # initialize randomly

# Transform the context words of each sentence into their vector representation.
# The last word of a sentence is the prediction target (see extract_target_words),
# so it is excluded from the inputs; otherwise the network is shown the very word
# it is asked to predict.
corpus_vectors = []
for sentence in corpus:
    sentence_vectors = []
    for word in sentence[:-1]:
        try:
            sentence_vectors.append(word2vec_model.wv[word])
        except KeyError:
            # Out-of-vocabulary word: random vector with the embedding width.
            sentence_vectors.append(np.random.rand(embedding_dim))  # initialize randomly
    corpus_vectors.append(sentence_vectors)

# Pad each sentence in the corpus to a fixed length, as required by the LSTM model
'''
Padding is performed by adding filler values (usually zeros) to the sequences until they reach the desired length.
Padding is necessary for LSTMs (Long Short-Term Memory) that process input sequences of a fixed length.


There are two common padding strategies:
Pre-padding: Filler values are added to the beginning of the sequence.
Post-padding: Filler values are added to the end of the sequence.

The max_len variable is set to the length of the longest sequence in the corpus.
By padding all sequences to this length, we ensure that the input to the LSTM model is
standardized and can be processed correctly.

It's important to note that padding can affect model performance.
Very short sequences may be dominated by the padding values, which can introduce
noise and make it harder for the model to learn meaningful patterns.
On the other hand, excessively long padding can lead to increased computational
requirements and memory consumption.
Choosing an appropriate padding length based on the dataset and problem is crucial for achieving good performance.
'''

# Post-padding: every sentence is copied to the front of its row and the rest of
# the row stays zero.
max_len = max(len(sentence) for sentence in corpus_vectors)
# NOTE(review): this dense array holds len(corpus_vectors) * max_len * embedding_dim
# float32 values; for a multi-million-sentence corpus that may not fit in memory —
# consider capping max_len or feeding the model in batches.
corpus_vectors_padded = np.zeros((len(corpus_vectors), max_len, embedding_dim), dtype='float32')

for i, sentence_vectors in enumerate(corpus_vectors):
    if sentence_vectors:
        # One block assignment per sentence instead of one Python-level copy per word.
        corpus_vectors_padded[i, :len(sentence_vectors), :] = np.asarray(sentence_vectors, dtype='float32')

# Build an LSTM model in Keras with appropriate input and output layers.
# embedding_dim is reused from the cell's configuration section rather than
# being redefined here, so the model always matches the prepared arrays.
model = Sequential()
model.add(Input(shape=(max_len, embedding_dim)))
# Skip the all-zero padding timesteps so the LSTM's final state comes from the
# last real word of the sentence, not from the trailing padding.
model.add(Masking(mask_value=0.0))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# Linear output with the embedding width: the network regresses the target word's vector.
model.add(Dense(embedding_dim))
model.compile(loss='mse', optimizer='adam')

# Train the LSTM model on the prepared data
# NOTE(review): Keras takes validation_split from the tail of the arrays, before
# shuffling. Where tokenized_sentences is built as SCAL followed by Wikipedia, the
# validation set would consist of Wikipedia sentences only — confirm this is intended.
model.fit(corpus_vectors_padded, target_vectors, epochs=1, batch_size=32, validation_split=0.1)