In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
import numpy as np
import re
import os
import string
import matplotlib.pyplot as plt
from nltk.corpus import stopwords


In [3]:
folder_path = r"C:\Backup\Desktop\Xitsonga-Text Generation-LSTM\dataset"

files = os.listdir(folder_path)

data = []
for file_name in files:
    file_path = os.path.join(folder_path, file_name)
    if os.path.isfile(file_path):  
        with open(file_path, 'r', encoding='utf-8') as file:
            data.extend(file.readlines())

print(data[:10])


['Hlaya xiletelo xa xikombelo eka pheji 7 hi vukheta, tani hilaha xi nga na mahungu ya nkoka lama nga ta ku pfuna eka ku tata fomo ya xikombelo kahle.\n', 'tata fomo ya xikombelo leyi nga laha ndzhaku no vona leswaku u nghenisile tidokhumente hinkwato leti lavekaka na nsayino wa wena laha wu lavekaka.\n', "Rhumela fomo na tidokhumente tin'wana to engetela eka gEMs hi yin'wana ya tindlela leti landzelaka:\n", 'Fekisi: 0861 00 4367\n', "Tisenthara ta xifundza to yisa hi voko: Languta xiletelo xa xikombelo eka pheji 7 ku kuma vuxokoxoko byin'wana.\n", 'Loko xikombelo xa wena xi amukeriwile, GEMS yi ta ku rhumela phasela ro ku amukela eka masiku ya 7 endzhaku ka ku amukela xikombelo xa wena.\n', 'Xikombelo xa wena xi ta hlwela ku kambisisiwa loko u nga nyiki GEMS tidokhumente hinkwato leti lavekaka.\n', 'Loko xikombelo xa wena xi nga kambisisiwangi, GEMS yi ta tihlanganisa na wena eka masiku ya 15\n', 'ya ku amukela xikombelo xa wena.\n', 'Tihlanganise na senthara ya hina ya tiqingho eka 0

## Data Cleaning

In [None]:
import itertools

# Existing stopwords
xitsonga_stopwords = [
    "ku", "a", "i", "e", "o", "le", "ti", "to", 
    "na", "ni", "ka", "va", "hi", "lo", "ya", "ma"
]

# Generate all consonant-vowel and vowel-consonant combinations
vowels = ['a', 'e', 'i', 'o', 'u']
consonants = [chr(c) for c in range(ord('a'), ord('z') + 1) if chr(c) not in vowels]

# Add consonant-vowel and vowel-consonant combinations
combinations = set(
    ["".join(pair) for pair in itertools.product(consonants, vowels)] +
    ["".join(pair) for pair in itertools.product(vowels, consonants)]
)

# Combine with existing stopwords
xitsonga_stopwords.extend(combinations)

# Ensure no duplicates
xitsonga_stopwords = list(set(xitsonga_stopwords))

# Filter stopwords to only include words with less than 3 characters
xitsonga_stopwords = [word for word in xitsonga_stopwords if len(word) < 3]

# Sort for easier debugging
xitsonga_stopwords.sort()

print(xitsonga_stopwords[:50])  # Display a subset to check


In [6]:
# Define the function to remove Xitsonga stopwords
def remove_stopwords(text, stopwords):
    # Convert text to lowercase
    text = text.lower()
    # Split text into words
    words = text.split()
    # Remove words that are in the stopwords list
    filtered_words = [word for word in words if word not in stopwords]
    # Join back into a string
    return " ".join(filtered_words)

# Apply the stopword removal to the dataset
cleaned_data_without_stopwords = [remove_stopwords(line, xitsonga_stopwords) for line in data]

# Preview the cleaned dataset
print("Original Line:", data[0])  # Original line for reference
print("Cleaned Line:", cleaned_data_without_stopwords[0])  # Line after stopword removal


Original Line: Hlaya xiletelo xa xikombelo eka pheji 7 hi vukheta, tani hilaha xi nga na mahungu ya nkoka lama nga ta ku pfuna eka ku tata fomo ya xikombelo kahle.

Cleaned Line: hlaya xiletelo xikombelo eka pheji 7 vukheta, tani hilaha nga mahungu nkoka lama nga pfuna eka tata fomo xikombelo kahle.


In [7]:
# defining a function that will remove the wtitespace, convert into  lowercase
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  
    text = re.sub(r'\s+', ' ', text)
    # Convert to lowercase
    text = text.lower()
    return text

# Apply the cleaning function to the stopword-filtered data
fully_cleaned_data = [clean_text(line) for line in cleaned_data_without_stopwords]

# Preview the results
print( cleaned_data_without_stopwords[0])
print( fully_cleaned_data[0])  


hlaya xiletelo xikombelo eka pheji 7 vukheta, tani hilaha nga mahungu nkoka lama nga pfuna eka tata fomo xikombelo kahle.
hlaya xiletelo xikombelo eka pheji vukheta tani hilaha nga mahungu nkoka lama nga pfuna eka tata fomo xikombelo kahle


## Preprocessing

In [None]:
tokenizer = Tokenizer()

# Fit the tokenizer on the cleaned data to build a word index
tokenizer.fit_on_texts(fully_cleaned_data)

# Convert the cleaned text into sequences of tokens (integers)
sequences = tokenizer.texts_to_sequences(fully_cleaned_data)

# Determine the maximum sequence length (e.g. choose 30 for this case)
max_sequence_length = 30  

# Padding  the sequences to ensure they all have the same length
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')


print("Padded Sequences:")
print(padded_sequences[:10])  # Display first 10 sequences of the paaded sequence

# Check the tokenizer word index and size of the vocabulary
print("\nTokenizer Word Index:")
print(tokenizer.word_index)
print("\nVocabulary Size:", len(tokenizer.word_index) + 1) 


In [9]:
X = padded_sequences[:, :-1]
y = padded_sequences[:, -1]   # The last word (target)


X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Print the shapes of the datasets
print(f"Training Data: X_train shape = {X_train.shape}, y_train shape = {y_train.shape}")
print(f"Validation Data: X_val shape = {X_val.shape}, y_val shape = {y_val.shape}")
print(f"Test Data: X_test shape = {X_test.shape}, y_test shape = {y_test.shape}")


Training Data: X_train shape = (1452, 29), y_train shape = (1452,)
Validation Data: X_val shape = (182, 29), y_val shape = (182,)
Test Data: X_test shape = (182, 29), y_test shape = (182,)


In [None]:
# Define the LSTM model architecture
model = Sequential()

# Embedding layer
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, 
                    output_dim=150, 
                    input_length=X_train.shape[1]))

# LSTM layer
model.add(LSTM(units=256, return_sequences=False, dropout=0.3, recurrent_dropout=0.1))

# Dense layer for output (softmax activation for multi-class classification)
model.add(Dense(len(tokenizer.word_index) + 1, activation='softmax'))

# Compile the model optimized adam, learning rate, and metric parameters
model.compile(optimizer=Adam(learning_rate=0.0001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()


In [11]:
# Defining EarlyStopping callback
early_stopping = EarlyStopping(monitor='val_loss', # monitoring the valdidation loss
                               patience=10,  # Number of epochs with no improvement before stopping
                               verbose=1,
                               restore_best_weights=True) # when training staert to overfit

# Train the model with early stopping
history = model.fit(X_train, y_train, 
                    epochs=100,  
                    batch_size=128,
                    validation_data=(X_val, y_val),
                    callbacks=[early_stopping])


Epoch 1/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 176ms/step - accuracy: 0.3124 - loss: 8.2486 - val_accuracy: 0.7198 - val_loss: 8.1846
Epoch 2/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 130ms/step - accuracy: 0.7995 - loss: 8.1330 - val_accuracy: 0.7253 - val_loss: 7.9215
Epoch 3/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 133ms/step - accuracy: 0.7902 - loss: 7.6894 - val_accuracy: 0.7253 - val_loss: 6.7732
Epoch 4/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 134ms/step - accuracy: 0.7788 - loss: 6.2974 - val_accuracy: 0.7253 - val_loss: 5.2663
Epoch 5/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 136ms/step - accuracy: 0.8034 - loss: 4.6488 - val_accuracy: 0.7253 - val_loss: 4.1728
Epoch 6/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 135ms/step - accuracy: 0.7797 - loss: 3.5966 - val_accuracy: 0.7253 - val_loss: 3.3164
Epoch 7/100
[1m12/12

In [None]:
# Plot accuracy
plt.plot(history.history['accuracy'], label='accuracy')
plt.plot(history.history['val_accuracy'], label = 'val_accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')
plt.show()

# Plot loss
plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['val_loss'], label = 'val_loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()

## Text generation

In [14]:
import numpy as np
import tensorflow as tf

def generate_text(model, tokenizer, seed_text, max_sequence_len, num_words_to_generate=50, temperature=1.0):
    
    # Step 1: Preprocess the seed text (same preprocessing done during training)
    seed_text = seed_text.lower()  # Convert to lowercase (if that was part of your preprocessing)
    
    # Tokenize the seed text
    seed_tokens = tokenizer.texts_to_sequences([seed_text])
    
    
    seed_tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(seed_tokens, maxlen=max_sequence_len, padding='pre')

    generated_text = seed_text
    
    # Step 2: Generate text by predicting the next token iteratively
    for _ in range(num_words_to_generate):
        # Predict the next token probabilities
        predictions = model.predict(seed_tokens_padded, verbose=0)

     
        predictions = predictions[0, :]  
        predictions = predictions / temperature  
        predictions = np.exp(predictions) / np.sum(np.exp(predictions))  # Softmax function for probabilities

        
        next_token = np.random.choice(len(predictions), p=predictions)

        # Decoding the token to a word
        next_word = tokenizer.index_word.get(next_token, '')
        generated_text += ' ' + next_word
        
       
        seed_tokens_padded = np.roll(seed_tokens_padded, shift=-1, axis=1)  
        seed_tokens_padded[0, -1] = next_token  

    return generated_text

def print_in_lines(text, words_per_line=15):
    words = text.split()
    for i in range(0, len(words), words_per_line):
        print(" ".join(words[i:i + words_per_line]))



In [15]:
# seed text
seed_text = "vukheta"  
max_sequence_len = 30  # per tokenize sequence length


generated_text = generate_text(model, tokenizer, seed_text, max_sequence_len, num_words_to_generate=30, temperature=1.0)

# Printing the generated text in lines with 10 words per line
print("Generated Text:")
print_in_lines(generated_text, words_per_line=10)


Generated Text:
vukheta tiejente tihuvo minharhu lavakulu tk lowunsthwa veke cancel thlelo
ngopfu tivonaka kwetlembetana tintshwa kamberile kahlekahle khorwisaka lokou tikumela xiyenganax
rhekodiweke dyondziwa siveleka timbhoni kayetiwa khalikhuletiwa vukhongeri tlhandlekela wenae afrikadzonga
vula
