In [1]:
# Install required packages
!pip install wikipedia-api 

Collecting wikipedia-api
  Downloading wikipedia_api-0.8.1.tar.gz (19 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: wikipedia-api
  Building wheel for wikipedia-api (setup.py) ... done
  Created wheel for wikipedia-api: filename=Wikipedia_API-0.8.1-py3-none-any.whl size=15384 sha256=10182e4d3c8d68ffaf18f583b76c297da12c18182a7114ca073b273a30a367f7
  Stored in directory: /root/.cache/pip/wheels/1d/f8/07/0508c38722dcd82ee355e9d85e33c9e9471d4bec0f8ae72de0
Successfully built wikipedia-api
Installing collected packages: wikipedia-api
Successfully installed wikipedia-api-0.8.1


In [4]:
import numpy as np
import tensorflow as tf
import wikipediaapi
import nltk
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, GRU, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Download NLTK resources
# NOTE(review): nothing in the visible part of this notebook calls an NLTK
# tokenizer (the pipeline below works on individual characters), so this
# download looks unnecessary — confirm no later cell relies on 'punkt'
# before removing it.
nltk.download('punkt')

# Function to fetch real-time text from Wikipedia
def fetch_wikipedia_text(topic="Artificial Intelligence", max_chars=5000):
    """Return the leading part of an English Wikipedia article's plain text.

    Args:
        topic: Title of the page to look up.
        max_chars: Maximum number of characters to keep from the start of the
            article. Defaults to 5000, the limit that used to be hard-coded;
            pass ``None`` to keep the whole article.

    Returns:
        The first ``max_chars`` characters of the page text, or the literal
        string "No content found." when the page does not exist.
    """
    wiki = wikipediaapi.Wikipedia(language='en', user_agent='My_Wikipedia_App')
    page = wiki.page(topic)
    if not page.exists():
        # Placeholder kept for backward compatibility; callers should check
        # for it rather than treat it as article text.
        return "No content found."
    return page.text[:max_chars]

# Preprocessing text
def preprocess_text(text):
    """Lower-case ``text`` and drop every non-alphanumeric, non-whitespace character.

    Each character is tested with ``str.isalnum`` / ``str.isspace``; the ones
    that pass are lower-cased individually and concatenated in their original
    order. Whitespace (including newlines and tabs) is preserved as-is.
    """
    kept = []
    for symbol in text:
        if not (symbol.isalnum() or symbol.isspace()):
            continue  # punctuation and other symbols are discarded
        kept.append(symbol.lower())
    return "".join(kept)

# Fetch and clean Wikipedia text
topic = "Machine Learning"
text_data = fetch_wikipedia_text(topic)
if text_data == "No content found.":
    # fetch_wikipedia_text returns this placeholder for a missing page; stop
    # here instead of silently training the models on the placeholder itself.
    raise ValueError(f"Wikipedia page not found for topic: {topic!r}")
cleaned_text = preprocess_text(text_data)

# Create character-level vocabulary: every distinct character gets an integer
# index (in sorted order, starting at 0) plus the reverse lookup.
chars = sorted(set(cleaned_text))  # Unique characters
char_to_index = {char: i for i, char in enumerate(chars)}
index_to_char = {i: char for char, i in char_to_index.items()}
total_chars = len(chars)

# Prepare input-output sequences (sliding window approach):
# each window of seq_length characters is paired with the character after it.
seq_length = 4  # Number of preceding characters used to predict the next one
if len(cleaned_text) <= seq_length:
    # The sliding window below would otherwise produce no training samples.
    raise ValueError(
        f"Cleaned text has {len(cleaned_text)} characters; "
        f"need more than seq_length={seq_length}"
    )
input_sequences, output_chars = [], []

for i in range(len(cleaned_text) - seq_length):
    input_seq = cleaned_text[i:i + seq_length]
    output_char = cleaned_text[i + seq_length]
    input_sequences.append([char_to_index[c] for c in input_seq])
    output_chars.append(char_to_index[output_char])

# Convert to NumPy arrays: X holds integer windows, y the one-hot targets
X = np.array(input_sequences)
y = tf.keras.utils.to_categorical(output_chars, num_classes=total_chars)

# Shared builder used by the three model factories below
def _build_recurrent_model(recurrent_layer, embedding_dim=50, units=(128, 64)):
    """Build and compile a two-layer recurrent next-character classifier.

    Args:
        recurrent_layer: Keras recurrent layer class to stack
            (SimpleRNN, LSTM or GRU).
        embedding_dim: Size of the character embedding vectors.
        units: Hidden sizes of the first and second recurrent layers.

    Returns:
        A compiled Sequential model: Embedding -> recurrent layer (returning
        the full sequence) -> recurrent layer -> softmax Dense with
        ``total_chars`` outputs. ``total_chars`` is read from module scope at
        call time.
    """
    first_units, second_units = units
    with tf.device('/GPU:0'):
        model = Sequential([
            Embedding(total_chars, embedding_dim),
            # The first recurrent layer must emit every timestep so the
            # second recurrent layer receives a sequence as input.
            recurrent_layer(first_units, return_sequences=True),
            recurrent_layer(second_units),
            Dense(total_chars, activation='softmax')
        ])
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# Define RNN Model
def build_rnn_model():
    """Return a compiled next-character model built from SimpleRNN layers."""
    return _build_recurrent_model(SimpleRNN)


# Define LSTM Model
def build_lstm_model():
    """Return a compiled next-character model built from LSTM layers."""
    return _build_recurrent_model(LSTM)


# Define GRU Model
def build_gru_model():
    """Return a compiled next-character model built from GRU layers."""
    return _build_recurrent_model(GRU)

# Seed Python, NumPy and TensorFlow before any weights are created so that
# initialisation and batch shuffling start from the same state on every
# "Restart & Run All". (Some GPU kernels may still be non-deterministic.)
tf.keras.utils.set_random_seed(42)

# Build the three architectures that will be compared
rnn_model = build_rnn_model()
lstm_model = build_lstm_model()
gru_model = build_gru_model()

# Train every model for 10 epochs on the same (X, y) data
rnn_model.fit(X, y, epochs=10, verbose=1)
lstm_model.fit(X, y, epochs=10, verbose=1)
gru_model.fit(X, y, epochs=10, verbose=1)

# Next-character prediction function
def predict_next_chars(model, seed_text, num_chars=3):
    """Greedily extend ``seed_text`` one predicted character at a time.

    Args:
        model: Trained model whose ``predict`` returns one row of class
            scores per input window.
        seed_text: Starting text. Characters missing from ``char_to_index``
            are skipped when building the model input but remain in the
            returned string.
        num_chars: Number of characters to append.

    Returns:
        ``seed_text`` followed by ``num_chars`` predicted characters. An index
        that has no entry in ``index_to_char`` is rendered as "?".
    """
    for _ in range(num_chars):
        token_list = [char_to_index[c] for c in seed_text if c in char_to_index]
        # pad_sequences keeps the last seq_length tokens of a longer input and
        # left-pads a shorter one with 0.
        # NOTE(review): 0 is also a real index in char_to_index, so padded
        # positions are indistinguishable from that character for the model.
        token_list = pad_sequences([token_list], maxlen=seq_length, padding='pre')

        # verbose=0: otherwise Keras prints a progress bar for every single
        # generated character.
        scores = model.predict(token_list, verbose=0)
        predicted_index = int(np.argmax(scores, axis=-1)[0])

        seed_text += index_to_char.get(predicted_index, "?")  # Handle unknown indices
    return seed_text

# Demo: let each trained model append three characters to the same seed text
word = "go"

# The models are queried in RNN -> LSTM -> GRU order; each result is bound to
# its own name.
predicted_rnn, predicted_lstm, predicted_gru = [
    predict_next_chars(trained_model, word, 3)
    for trained_model in (rnn_model, lstm_model, gru_model)
]

# Report the completed strings side by side
print("\nRNN Prediction:", predicted_rnn)
print("LSTM Prediction:", predicted_lstm)
print("GRU Prediction:", predicted_gru)


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Epoch 1/10
153/153 ━━━━━━━━━━━━━━━━━━━━ 5s 9ms/step - accuracy: 0.1404 - loss: 3.1509
Epoch 2/10
153/153 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.2369 - loss: 2.6678
Epoch 3/10
153/153 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.3190 - loss: 2.3847
Epoch 4/10
153/153 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.3521 - loss: 2.2216
Epoch 5/10
153/153 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.3875 - loss: 2.1400
Epoch 6/10
153/153 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.4033 - loss: 2.0590
Epoch 7/10
153/153 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.4146 - loss: 2.0154
Epoch 8/10
[1m153/153[0m [32m━━━━━━━━━━━━