In [None]:
import re

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [None]:
import os
import csv

# Split the people_wiki CSV into one plain-text file per data row
# (doc1.txt, doc2.txt, ...), each holding that row's `text` column.
folder_path = 'D:\\MachineCourse\\NLP_Course\\Tasks\\CBOW\\dataset'
os.makedirs(folder_path, exist_ok=True)

# Both files are opened as UTF-8 explicitly: the loader cell reads the generated
# documents back with encoding='utf-8', so writing them with the platform
# default encoding could produce files it cannot decode. newline='' is what the
# csv module asks for so newlines inside quoted fields are parsed correctly.
with open('D:\\MachineCourse\\NLP_Course\\Tasks\\CBOW\\people_wiki.csv', 'r',
          encoding='utf-8', newline='') as file:
    reader = csv.reader(file)
    next(reader)  # skip the header row
    for count, row in enumerate(reader, start=1):
        URI, name, text = row  # raises ValueError if a row does not have exactly three columns
        with open(os.path.join(folder_path, f'doc{count}.txt'), 'w', encoding='utf-8') as text_file:
            text_file.write(text)

In [None]:
def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span replaced by a single space."""
    # Non-greedy, so each tag is matched on its own rather than everything
    # between the first '<' and the last '>'. '.' does not cross newlines.
    return re.sub(r'<.*?>', ' ', text)


In [None]:
def convert_to_lower(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

In [None]:
def remove_urls(text):
    """Replace ``http(s)://`` links and ``www.`` addresses in ``text`` with a space."""
    # A URL is taken to run up to the next whitespace character.
    url_regex = re.compile(r'https?://\S+|www\.\S+')
    return url_regex.sub(' ', text)

In [None]:
def remove_numbers(text):
    """Replace each run of digits in ``text`` with a single space."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub(" ", text)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize

nltk.download("stopwords")
def remove_stopwords(text):
    """Return ``text`` with English stop words removed.

    The text is tokenised with NLTK's ``word_tokenize``; tokens that are not in
    NLTK's English stop-word list are joined back together with single spaces.
    Membership is tested on the token exactly as produced (no case folding).

    Args:
        text: String to filter.

    Returns:
        The surviving tokens joined by ``" "``.
    """
    # NOTE(review): word_tokenize relies on NLTK tokenizer data that this cell
    # does not download (only "stopwords" is fetched above) -- confirm it is
    # installed before enabling this step.
    # A set gives constant-time membership tests for each token.
    stop_words = set(stopwords.words("english"))
    return " ".join(token for token in word_tokenize(text) if token not in stop_words)

In [None]:
def remove_extra_white_spaces(text):
    """Drop stand-alone single letters that are surrounded by whitespace.

    Every run of one or more whitespace-delimited single ASCII letters,
    together with the whitespace around it, is replaced by one space. A single
    letter at the very start or end of ``text`` is left alone because it has no
    whitespace on both sides. Other whitespace is not modified.

    Args:
        text: String to filter.

    Returns:
        ``text`` with the isolated single letters removed.
    """
    # The group repeats so that consecutive single letters ("x a b c y") are
    # removed in one match. Matching one letter at a time would consume the
    # whitespace the next letter needs in front of it and skip every second one.
    single_char_pattern = r'(?:\s+[a-zA-Z])+\s+'
    return re.sub(pattern=single_char_pattern, repl=" ", string=text)

In [None]:
def clean(text):
    """Run ``text`` through the notebook's cleaning steps and return the result.

    Steps, in order: remove HTML tags, lower-case, remove URLs, remove digit
    runs, remove isolated single letters.
    """
    # remove_stopwords is currently not part of the pipeline (it was commented
    # out between remove_numbers and remove_extra_white_spaces).
    pipeline = (
        remove_html_tags,
        convert_to_lower,
        remove_urls,
        remove_numbers,
        remove_extra_white_spaces,
    )
    for step in pipeline:
        text = step(text)
    return text
    

In [None]:
import os
folder_path = 'D:\\MachineCourse\\NLP_Course\\Tasks\\CBOW\\dataset'
def read_text_files(folder_path):
    """Load, clean and tokenise every ``.txt`` file directly inside ``folder_path``.

    Each file is read as UTF-8, passed through ``clean`` and split on
    whitespace.

    Args:
        folder_path: Directory to scan (sub-directories are not descended into).

    Returns:
        A list with one entry per file, in sorted filename order; each entry is
        that file's list of words.
    """
    all_words = []
    # os.listdir returns names in arbitrary order; sorting keeps the document
    # order (and everything derived from it) the same from run to run.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            text = file.read()
        all_words.append(clean(text).split())
    return all_words
words = read_text_files(folder_path)

In [None]:
# Build CBOW training pairs: for every word that has two neighbours on each
# side, the four neighbours form the context and the word itself is the target.
# `words` holds one word list per document, so the window has to slide inside
# each document; indexing `words` directly would pair whole documents instead
# of words.
data = []
for doc_words in words:
    # Documents shorter than five words produce no pairs (the range is empty).
    for i in range(2, len(doc_words) - 2):
        context = [doc_words[i - 2], doc_words[i - 1], doc_words[i + 1], doc_words[i + 2]]
        target = doc_words[i]
        data.append((context, target))


Split data into contexts (X) and targets (y)

In [None]:
# Each entry of `data` is a (context, target) pair: contexts go to X, targets to y.
X = [pair[0] for pair in data]
y = [pair[1] for pair in data]

In [None]:
# Fit the vocabulary on every context entry followed by every target.
vocab_texts = []
for sentence in X:
    vocab_texts.extend(sentence)
vocab_texts = vocab_texts + y

tokenizer = Tokenizer()
tokenizer.fit_on_texts(vocab_texts)

Encoding

In [None]:
# Each context is a list; join its (stringified) entries into one
# space-separated string before handing it to the tokenizer.
X_fixed = [' '.join(str(item) for item in context) for context in X]

# Map contexts and targets to sequences of integer word indices.
y_encoded = tokenizer.texts_to_sequences(y)
X_encoded = tokenizer.texts_to_sequences(X_fixed)



Padding

In [None]:
# Force fixed widths: 4 indices per context and 1 per target, with padding
# added at the end ('post') of sequences that are too short.
y_padded = pad_sequences(y_encoded, padding='post', maxlen=1)
X_padded = pad_sequences(X_encoded, padding='post', maxlen=4)

Split data into train and test

In [None]:
# Hold out 20% of the samples for validation; the fixed seed makes the split repeatable.
(X_train, X_test, y_train, y_test) = train_test_split(
    X_padded,
    y_padded,
    random_state=42,
    test_size=0.2,
)

Build Model

In [None]:
# Next-word model: embed the 4 context indices, summarise them with an LSTM and
# score every vocabulary entry.
vocab_size = len(tokenizer.word_index) + 1  # +1 because word indices start at 1; 0 is the padding value

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=64, input_length=4))
model.add(LSTM(64))
# The target is a word index, i.e. one class out of `vocab_size`, so the head is
# a softmax over the whole vocabulary trained with sparse categorical
# cross-entropy (integer labels, no one-hot encoding needed). A single sigmoid
# unit with binary cross-entropy cannot represent that target, and
# predict_next_word ranks a per-word score vector.
model.add(Dense(vocab_size, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
history = model.fit(X_train, y_train, epochs=1, batch_size=32, validation_data=(X_test, y_test))


Save Model

In [None]:
!pip install h5py

In [None]:
import h5py
# Persist the trained model to an HDF5 checkpoint in the working directory.
checkpoint_file = 'checkpoint.h5'
model.save(checkpoint_file)

Evaluate Model

In [None]:
# Score the trained model on the held-out split; the result is the cell output.
model.evaluate(x=X_test, y=y_test)

In [None]:
import matplotlib.pyplot as plt
# Training vs. validation loss, one point per epoch.
history_dict = history.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
# `acc` and `epochs` stay module-level names so later cells can reuse them.
acc = history_dict['accuracy']
epochs = range(1, len(acc) + 1)  # 1-based epoch numbers for the x axis

for series, fmt, label in (
    (loss_values, 'bo', 'Training loss'),
    (val_loss_values, 'b', 'Validation loss'),
):
    plt.plot(epochs, series, fmt, label=label)

plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Training vs. validation accuracy, one point per epoch.
history_dict = history.history
plt.clf()
acc_values = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
# Computed here so this cell only depends on `history`, not on names left
# behind by another plotting cell.
epochs = range(1, len(acc_values) + 1)
plt.plot(epochs, acc_values, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')  # this figure shows accuracy, not loss
plt.legend()
plt.show()

In [None]:
def predict_next_word(model, tokenizer, context_str, top_n=3):
    """Return the highest-scoring words for a context string.

    The context is cleaned with ``clean``, encoded with ``tokenizer`` and padded
    or truncated to 4 indices before being passed to ``model.predict``. The
    first row of the prediction is treated as one score per word index.

    Args:
        model: Object with a Keras-style ``predict(inputs, verbose=...)`` method.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        context_str: Raw context text.
        top_n: Maximum number of words to return.

    Returns:
        Up to ``top_n`` words ordered from highest to lowest score. The list is
        empty when ``top_n`` is not positive, and shorter than ``top_n`` when
        fewer scored indices correspond to a word.
    """
    if top_n <= 0:
        # Slicing with [-0:] would select every index instead of none.
        return []

    cleaned_context = clean(context_str)
    sequence = tokenizer.texts_to_sequences([cleaned_context])
    padded_sequence = pad_sequences(sequence, maxlen=4, padding='post')
    preds = model.predict(padded_sequence, verbose=0)[0]

    index_word = {v: k for k, v in tokenizer.word_index.items()}
    top_words = []
    # Walk the indices from best to worst score and skip any that have no word
    # (such as the padding index 0) so they do not use up one of the slots.
    for idx in preds.argsort()[::-1]:
        word = index_word.get(int(idx))
        if word is None:
            continue
        top_words.append(word)
        if len(top_words) == top_n:
            break
    return top_words


In [None]:
# Reload the saved checkpoint and run one sample prediction with it.
# load_model comes from tensorflow.keras.models (imported in the first cell).
loaded_model = load_model('checkpoint.h5')
context_example_loaded = "he go to "
predicted_words_loaded = predict_next_word(loaded_model, tokenizer, context_example_loaded)
print("Predicted words with loaded model:", predicted_words_loaded)