In [1]:
!pip install wikipedia-api


Collecting wikipedia-api
  Downloading wikipedia_api-0.8.1.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia-api
  Building wheel for wikipedia-api (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia-api: filename=Wikipedia_API-0.8.1-py3-none-any.whl size=15384 sha256=534b8e759b98cbcf84a6ab3c899e030e35533e2f9c85e57b223b2bf090dea01c
  Stored in directory: /root/.cache/pip/wheels/1d/f8/07/0508c38722dcd82ee355e9d85e33c9e9471d4bec0f8ae72de0
Successfully built wikipedia-api
Installing collected packages: wikipedia-api
Successfully installed wikipedia-api-0.8.1


In [4]:
# Install required packages
!pip install wikipedia-api 

import wikipediaapi
import numpy as np
import nltk
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense
import random

# Download NLTK resources
nltk.download('punkt')

# Function to fetch real-time text from Wikipedia
def fetch_wikipedia_text(topic="Artificial Intelligence"):
    wiki = wikipediaapi.Wikipedia(language='en', user_agent='My_Wikipedia_App')  
    page = wiki.page(topic)
    if page.exists():
        return page.text[:5000]  # Fetch first 5000 characters
    return "No content found."

# Preprocessing text
def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())  # Tokenize and lowercase
    return " ".join(tokens)

# Get real-time text
topic = "Machine Learning"
text_data = fetch_wikipedia_text(topic)
cleaned_text = preprocess_text(text_data)

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts([cleaned_text])
total_words = len(tokenizer.word_index) + 1

# Create input-output sequences
input_sequences = []
words = cleaned_text.split()
for i in range(1, len(words)):
    n_gram_sequence = words[:i+1]
    encoded_seq = tokenizer.texts_to_sequences([" ".join(n_gram_sequence)])[0]
    input_sequences.append(encoded_seq)

# Pad sequences
max_length = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_length, padding='pre')

# Split input (X) and output (y)
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

# Define LSTM Model
def build_lstm_model():
    with tf.device('/GPU:0'):  # Ensure running on GPU
        model = Sequential([
            Embedding(total_words, 50),
            LSTM(128, return_sequences=True),
            LSTM(64),
            Dense(total_words, activation='softmax')
        ])
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Define GRU Model
def build_gru_model():
    with tf.device('/GPU:0'):  # Ensure running on GPU
        model = Sequential([
            Embedding(total_words, 50),
            GRU(128, return_sequences=True),
            GRU(64),
            Dense(total_words, activation='softmax')
        ])
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Train LSTM Model
lstm_model = build_lstm_model()
lstm_model.fit(X, y, epochs=6, verbose=1)

# Train GRU Model
gru_model = build_gru_model()
gru_model.fit(X, y, epochs=6, verbose=1)

# Improved next-word prediction function
def predict_next_words(model, seed_text, num_words=3):
    output_text = seed_text
    
    for _ in range(num_words):
        token_list = tokenizer.texts_to_sequences([output_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_length-1, padding='pre')
        
        predicted_probs = model.predict(token_list, verbose=0)  # Get probabilities
        predicted_index = np.argmax(predicted_probs, axis=-1)[0]  # Get most probable index
        
        output_word = None
        for word, index in tokenizer.word_index.items():
            if index == predicted_index:
                output_word = word
                break
        
        if output_word is None:  # If no valid word is found, stop
            break
        
        output_text += " " + output_word

    return output_text

# Test predictions
seed_sentence = "Machine learning is"
lstm_prediction = predict_next_words(lstm_model, seed_sentence, 3)
gru_prediction = predict_next_words(gru_model, seed_sentence, 3)

print("\nLSTM Prediction:", lstm_prediction)
print("GRU Prediction:", gru_prediction)


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Epoch 1/6
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 67ms/step - accuracy: 0.0115 - loss: 5.8943
Epoch 2/6
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 67ms/step - accuracy: 0.0367 - loss: 5.5776
Epoch 3/6
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 67ms/step - accuracy: 0.0400 - loss: 5.4278
Epoch 4/6
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 66ms/step - accuracy: 0.0247 - loss: 5.4651
Epoch 5/6
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 67ms/step - accuracy: 0.0406 - loss: 5.3549
Epoch 6/6
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 67ms/step - accuracy: 0.0487 - loss: 5.3266
Epoch 1/6
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 65ms/step - accuracy: 0.0140 - loss: 5.9061
Epoch 2/6
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m