In [10]:
import tensorflow as tf
import numpy as np
import zipfile
import os
import re
import matplotlib.pyplot as plt
from keras_preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.optimizers import Adam

In [11]:
# Step 1: Unzip and read the text dataset
with zipfile.ZipFile('synthetic_text_100MB.zip', 'r') as zip_ref:
    zip_ref.extractall('dataset')  # Extract to 'dataset' folder

# Read the extracted files in sorted order: os.listdir returns entries in
# arbitrary order, which would make the combined corpus differ between runs.
file_texts = []
for file_name in sorted(os.listdir('dataset')):
    file_path = os.path.join('dataset', file_name)
    if not os.path.isfile(file_path):
        continue  # Skip sub-directories; open() cannot read them
    with open(file_path, 'r', encoding='utf-8') as file:
        file_texts.append(file.read() + ' ')  # Trailing space separates files

# Join once at the end instead of repeatedly growing one large string.
text_data = ''.join(file_texts)

In [12]:
# Step 2: Clean the text
text_data = text_data.lower()  # Convert to lowercase
# Strip unwanted characters first, then collapse whitespace: removing a
# character that sits between two spaces would otherwise leave a double space.
text_data = re.sub(r'[^\w\s.,!?]', '', text_data)  # Keep word characters, whitespace and . , ! ?
text_data = re.sub(r'\s+', ' ', text_data)  # Replace whitespace runs with a single space
# Split on every sentence terminator that survives cleaning ('.', '!', '?'),
# not just '.', and drop fragments that are empty after trimming.
sentences = [
    fragment.strip()
    for fragment in re.split(r'[.!?]+', text_data)
    if fragment.strip()
]

In [16]:
# Step 3: Tokenize the text
sequence_length = 20  # Length each training sequence is padded/truncated to
vocab_size = 4000  # Upper bound on word indices kept by the tokenizer

# Words outside the kept vocabulary are mapped to the '<OOV>' token.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=vocab_size)
tokenizer.fit_on_texts(sentences)  # Learn the vocabulary from the corpus

word_index = tokenizer.word_index  # Word -> integer index mapping
word_sequences = tokenizer.texts_to_sequences(sentences)  # One list of indices per sentence


In [17]:
# Step 4: Create training data
# Every prefix of a sentence with at least two tokens becomes one example:
# the last token is the prediction target, the tokens before it are the input.
ngram_sequences = []
for sequence in word_sequences:
    for end in range(2, len(sequence) + 1):
        # Keep only the trailing `sequence_length` tokens of the prefix.
        # pad_sequences would discard the older tokens anyway (its default
        # truncating='pre' keeps the end), so the resulting arrays are the
        # same, but memory no longer grows quadratically with sentence length.
        start = max(0, end - sequence_length)
        ngram_sequences.append(sequence[start:end])

input_data = pad_sequences(ngram_sequences, maxlen=sequence_length, padding='pre')  # Left-pad with zeros
X_train = input_data[:, :-1]  # Input is all but last word
y_train = input_data[:, -1]  # Target is last word


In [19]:
# Step 5: Build the model
# Embedding -> LSTM -> softmax over the vocabulary (next-word classifier).
model = Sequential([
    Embedding(vocab_size, 100),               # Word index -> 100-dimensional vector
    LSTM(150),                                # Summarise the sequence in 150 units
    Dense(vocab_size, activation='softmax'),  # Probability for each word index
])
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',  # Targets are integer word indices
    metrics=['accuracy'],
)

In [20]:
# Step 6: Train the model
epochs = 5
batch_size = 128

# validation_split=0.2 holds out 20% of the samples for validation.
training_history = model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    verbose=1,
)

Epoch 1/5
 5823/88020 ━━━━━━━━━━━━━━━━━━━━ 1:11:25 52ms/step - accuracy: 0.6983 - loss: 1.3680

KeyboardInterrupt: 

In [24]:
# Generate text: extend the prompt one word at a time with the trained model.
prompt = "Artificial Intelligence will"
num_words_to_generate = 10  # words per sentence
num_sentences = 3  # how many separate continuations to produce
sentence_terminators = {'.', '!', '?'}
# Sample the next word instead of always taking the argmax: greedy decoding is
# deterministic, so every pass would produce the exact same sentence.
# The generator is seeded so re-running the cell reproduces the same output.
sampling_rng = np.random.default_rng(42)
generated_sentences = []

for _ in range(num_sentences):
    generated_text = prompt.lower()
    for _step in range(num_words_to_generate):
        token_list = tokenizer.texts_to_sequences([generated_text])[0]
        token_list = pad_sequences([token_list], maxlen=sequence_length - 1, padding='pre')
        # float64 copy so the distribution can be edited and renormalised
        # precisely enough for Generator.choice.
        predicted_probs = model.predict(token_list, verbose=0)[0].astype('float64')
        predicted_probs[0] = 0.0  # index 0 is the padding value, not a word
        total_prob = predicted_probs.sum()
        if total_prob <= 0:
            break  # nothing left to sample from
        predicted_word_index = int(
            sampling_rng.choice(len(predicted_probs), p=predicted_probs / total_prob)
        )
        predicted_word = tokenizer.index_word.get(predicted_word_index, '')
        if not predicted_word:
            break  # index has no word; avoid appending a stray space
        generated_text += ' ' + predicted_word
        # Set membership, not `in '.!?'`: the substring test is also true for ''.
        if predicted_word in sentence_terminators:  # stop when sentence ends
            break
    generated_sentences.append(generated_text.strip())

In [25]:
# Write a plain-text summary of the model, the training settings and the
# generated sentences. The encoding is explicit so the file is written the
# same way on every platform and non-ASCII generated words cannot fail.
with open('training_report.txt', 'w', encoding='utf-8') as report_file:
    report_file.write('Training Report\n\n')
    report_file.write('Model Details:\n')
    # Vocabulary size comes from `vocab_size` so the report cannot drift from
    # the model; 100 / 150 / 0.001 mirror the literals used in Step 5.
    report_file.write(f'- Word Embedding: {vocab_size:,} words, 100 dimensions\n')
    report_file.write('- LSTM Layer: 150 units\n')
    report_file.write(f'- Output Layer: {vocab_size:,} units with softmax\n\n')
    report_file.write('Training Settings:\n')
    # NOTE(review): `epochs` is the configured value, not necessarily the
    # number of epochs that actually completed - confirm before sharing.
    report_file.write(f'- Epochs: {epochs}\n')
    report_file.write(f'- Batch Size: {batch_size}\n')
    report_file.write('- Optimizer: Adam (learning rate 0.001)\n')
    # Extra newline keeps a blank line between sections, like the ones above.
    report_file.write('- Loss Function: Sparse Categorical Crossentropy\n\n')
    report_file.write(f'Generated Text (Prompt: "{prompt}"):\n')
    for i, sentence in enumerate(generated_sentences, 1):
        report_file.write(f'Sentence {i}: {sentence}.\n')