In [2]:
# Download the NLTK resources 'punkt', 'stopwords' and 'wordnet' into the local nltk_data directory.
# NOTE(review): none of the visible cells use nltk — confirm a later cell
# actually needs these downloads before keeping them.
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
!pip install wikipedia


Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... done
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11680 sha256=2118f50d72e9f881c6a8c0f09b3ddc0908df69f296511b919b5de3bef6a3720b
  Stored in directory: /root/.cache/pip/wheels/5e/b6/c5/93f3dec388ae76edc830cb42901bb0232504dfc0df02fc50de
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [3]:
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.models import Sequential
import wikipedia

# preprocess text at the character level
def preprocess_text_char(text):
    """Normalise raw text for character-level modelling.

    The text is lower-cased, URLs are stripped, punctuation is removed, and
    finally every character that is neither alphabetic nor whitespace
    (digits, underscores, ...) is dropped.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string containing only letters and whitespace.
    """
    lowered = text.lower()
    without_urls = re.sub(r'http[s]?://\S+', '', lowered)
    without_punctuation = re.sub(r'[^\w\s]', '', without_urls)

    # Letters and whitespace survive; everything else is discarded.
    kept = filter(lambda ch: ch.isalpha() or ch.isspace(), without_punctuation)
    return ''.join(kept)

topic = "Visual Arts"
wikipedia.set_lang("en")

# Fetch the article. Lookup failures are raised as ValueError instead of
# calling exit(): inside a notebook exit() asks the kernel to shut down rather
# than aborting the cell with a readable error.
try:
    wikipedia_page = wikipedia.page(topic)
except wikipedia.exceptions.PageError as err:
    raise ValueError(
        f"Page not found for topic {topic!r}. Please try another topic."
    ) from err
except wikipedia.exceptions.DisambiguationError as err:
    # The title matches several articles; surface a few candidates to choose from.
    raise ValueError(
        f"Topic {topic!r} is ambiguous. Candidates include: {err.options[:10]}"
    ) from err

wikipedia_text = wikipedia_page.content

# Reduce the article to lower-case letters and whitespace only.
preprocessed_text = preprocess_text_char(wikipedia_text)

seq_length = 100  # Length of input sequences

# At least one (window, next character) pair is required for training.
if len(preprocessed_text) <= seq_length:
    raise ValueError(
        f"Text has {len(preprocessed_text)} characters; need more than "
        f"seq_length={seq_length} to build training sequences."
    )

# Fit the character vocabulary once on the full text. Fitting on every
# overlapping window repeats the same work ~seq_length times and never sees
# the final character of the text, which only ever appears as a target.
char_tokenizer = Tokenizer(char_level=True)
char_tokenizer.fit_on_texts([preprocessed_text])

# Encode the whole text a single time, then slice windows out of the id array.
encoded_text = np.array(
    char_tokenizer.texts_to_sequences([preprocessed_text])[0], dtype=np.int32
)
assert len(encoded_text) == len(preprocessed_text), "every character must map to exactly one id"

num_samples = len(encoded_text) - seq_length

# sequences[i] holds the ids of characters i .. i+seq_length-1;
# next_chars[i] is the character that immediately follows that window.
sequences = np.array([encoded_text[i:i + seq_length] for i in range(num_samples)])
next_chars = list(preprocessed_text[seq_length:])

# Total vocabulary size (number of unique characters, +1 for the reserved padding id 0)
vocab_size = len(char_tokenizer.word_index) + 1

# Next-character classifier: embedding -> simple recurrent layer -> softmax
# over the character vocabulary.
model_char = Sequential()
model_char.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=seq_length))
model_char.add(SimpleRNN(units=150))
model_char.add(Dense(units=vocab_size, activation='softmax'))

# Targets are integer class ids, hence the sparse variant of the loss.
model_char.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model_char.summary()

# Token ids index rows of the Embedding layer, so they are kept as integers
# (no float cast). A new name is used so re-running this cell is idempotent.
train_inputs = np.asarray(sequences, dtype=np.int32)

# Map every target character to its id with one dictionary lookup each instead
# of a tokenizer call per character. A KeyError here means a target character
# was never seen when the tokenizer was fitted.
char_to_index = char_tokenizer.word_index
next_sequences = np.array([char_to_index[char] for char in next_chars], dtype=np.int32)

model_char.fit(train_inputs, next_sequences, epochs=30, verbose=1)



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          4500      
                                                                 
 simple_rnn (SimpleRNN)      (None, 150)               37650     
                                                                 
 dense (Dense)               (None, 45)                6795      
                                                                 
Total params: 48945 (191.19 KB)
Trainable params: 48945 (191.19 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epo

<keras.src.callbacks.History at 0x7c51e4dd1cf0>

In [11]:
def generate_text(seed_text, max_length=500):
    """Greedily extend ``seed_text`` one character at a time with ``model_char``.

    At every step the last ``seq_length`` character ids are fed to the model and
    the most probable next character (argmax) is appended.

    Args:
        seed_text: Starting text; it is returned unchanged as the prefix of the result.
        max_length: Maximum number of characters to generate.

    Returns:
        ``seed_text`` followed by the generated characters. Generation stops
        early after a newline is produced, or when the model predicts an id
        with no character attached (e.g. the padding id 0).
    """
    # Encode the seed once; afterwards only the predicted id is appended, so
    # the growing text is not re-tokenised on every step.
    token_ids = char_tokenizer.texts_to_sequences([seed_text])[0]
    generated_chars = []

    for _ in range(max_length):
        # Only the most recent seq_length ids matter; shorter input is left-padded with 0.
        window = pad_sequences([token_ids[-seq_length:]], maxlen=seq_length, padding='pre')

        # verbose=0 keeps Keras from printing a progress bar for every character.
        probabilities = model_char.predict(window, verbose=0)[0]
        predicted_idx = int(np.argmax(probabilities))

        predicted_char = char_tokenizer.index_word.get(predicted_idx, '')
        if not predicted_char:
            # No character for this id: the input would not change, so every
            # remaining step would repeat the same prediction. Stop here.
            break

        token_ids.append(predicted_idx)
        generated_chars.append(predicted_char)

        # Break if the predicted character is a newline
        if predicted_char == '\n':
            break

    return seed_text + ''.join(generated_chars)

# Smoke-test character-level generation: extend the seed "pain" by up to 100 characters.
# NOTE(review): the printed label says "Next character prediction", but the
# value is the whole generated string (seed included), not a single character.
test_char = "pain"
predicted_next_char = generate_text(test_char, max_length=100)
print("Next character prediction:", predicted_next_char)

Next character prediction: painter or in artists and plastic arts and conceptual and painters the earloch considered by materials a
