In [2]:
!pip install wikipedia

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11680 sha256=b1c55f32845863f4e0b2c342eb7d9c886631115fcf1a95ea1a0d4824af82d3be
  Stored in directory: /root/.cache/pip/wheels/5e/b6/c5/93f3dec388ae76edc830cb42901bb0232504dfc0df02fc50de
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [3]:
import wikipedia
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
# Preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

In [6]:
# Function to fetch data from Wikipedia
def get_wikipedia_data(topic, num_sentences):
    wikipedia.set_lang("en")
    try:
        content = wikipedia.page(topic).content
        sentences = content.split(". ")
        data = []
        for i in range(len(sentences) - num_sentences):
            data.append(" ".join(sentences[i:i+num_sentences]))
        return data
    except wikipedia.exceptions.DisambiguationError as e:
        print(e.options)
        return None
    except wikipedia.exceptions.PageError as e:
        print(e)
        return None

In [7]:
# Define the topics and the number of sentences in each sample
topics = ["Natural_language_processing", "Image_Processing", "Artificial_intelligence"]
num_sentences = 2

data = []
for topic in topics:
    topic_data = get_wikipedia_data(topic, num_sentences)
    if topic_data:
        data.extend(topic_data)

In [8]:
# Preprocess the data
data = [preprocess_text(sentence) for sentence in data]
print("Number of samples:", len(data))

Number of samples: 582


In [15]:
data

['natural language processing nlp interdisciplinary subfield computer science information retrieval primarily concerned giving computer ability support manipulate human language',
 'primarily concerned giving computer ability support manipulate human language involves processing natural language datasets text corpus speech corpus using either rulebased probabilistic ie',
 'involves processing natural language datasets text corpus speech corpus using either rulebased probabilistic ie statistical recently neural networkbased machine learning approach',
 'statistical recently neural networkbased machine learning approach goal computer capable understanding content document including contextual nuance language within',
 'goal computer capable understanding content document including contextual nuance language within end natural language processing often borrows idea theoretical linguistics',
 'end natural language processing often borrows idea theoretical linguistics technology accurately 

In [9]:
# Tokenize the data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)
total_words = len(tokenizer.word_index) + 1

In [None]:
# Initialize input_sequences list
input_sequences = []

# Iterate over each line in the data
for line in data:
    # Convert the line to a list of tokens
    token_list = tokenizer.texts_to_sequences([line])[0]
    # Iterate over each token in the token list
    for i in range(1, len(token_list)):
        # Create n-gram sequences and append to input_sequences
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)


In [11]:
# Find the maximum sequence length
max_sequence_len = max([len(x) for x in input_sequences])

# Pad sequences to make them of equal length
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# Create predictors (X) and label (y)
X, y = input_sequences[:, :-1], input_sequences[:, -1]
# Convert y to one-hot encoded format
y = np.array(tf.keras.utils.to_categorical(y, num_classes=total_words))


In [18]:
print(X.shape,y.shape)

(23173, 215) (23173, 3574)


In [12]:
# Build the SimpleRNN model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(total_words, 100, input_length=max_sequence_len-1),
    tf.keras.layers.SimpleRNN(150, return_sequences=True),
    tf.keras.layers.SimpleRNN(150),
    tf.keras.layers.Dense(total_words, activation='softmax')
])

In [13]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [14]:
history = model.fit(X, y, epochs=50,batch_size=256, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [19]:
# Function to generate text
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = np.argmax(model.predict(token_list), axis=-1)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

In [20]:
# Generate text
seed_text = "natural language processing is"
generated_text = generate_text(seed_text, 20, model, max_sequence_len)
print(generated_text)

natural language processing is nlp interdisciplinary subfield computer science information retrieval primarily concerned giving computer ability support manipulate human language involves processing natural language


In [21]:
# Generate text
seed_text = "ai"
generated_text = generate_text(seed_text, 15, model, max_sequence_len)
print(generated_text)

ai research image processing may vastly classify british government university medical imaging definition defined number use


In [22]:
# Generate text
seed_text = "programming"
generated_text = generate_text(seed_text, 10, model, max_sequence_len)
print(generated_text)

programming machine success wa due steady cognitive data classify domain among


In [23]:
# Generate text
seed_text = "data"
generated_text = generate_text(seed_text, 50, model, max_sequence_len)
print(generated_text)

data first robot technology space storage medical eventually wa invested ai around 2022 u statement others noun rather table verb book noted 2016 exists white country technology framework care government framework treatment view technology service used conventional asimov law often brought lay discussion machine ethic almost artificial intelligence researcher familiar asimov


In [24]:
# Generate text
seed_text = "science"
generated_text = generate_text(seed_text, 25, model, max_sequence_len)
print(generated_text)

science lunar photo sent back e old rulebased may see important similar named entity language english wa uncertain first field digital image processing pixellate photography simulate


In [26]:
# Generate text
seed_text = "python"
generated_text = generate_text(seed_text, 15, model, max_sequence_len)
print(generated_text)

python may began expected ai however technique several way amount people gameplaying called chess applied electronic


In [27]:
# Generate text
seed_text = "processing"
generated_text = generate_text(seed_text, 15, model, max_sequence_len)
print(generated_text)

processing computer researcher develops number wa key role model began elaborate conversion shared ai wa blind
