In [None]:
!pip install wikipedia

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11679 sha256=021ece33cad5b62117496c97252a7c50bb77227e889f82141ac471910306550c
  Stored in directory: /root/.cache/pip/wheels/5e/b6/c5/93f3dec388ae76edc830cb42901bb0232504dfc0df02fc50de
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [None]:
import wikipedia
import tensorflow as tf
import numpy as np
import re

# Define topics
topics = [
    'Data Science',
    'Artificial Intelligence',
    'Neural Networks',
    'Natural Language Processing',
    'Deep Learning',
    'Reinforcement Learning',
    'Big Data',
    'Statistics',
    'Python Programming Language',
    'Politics',
    'Business',
    'Physics',
    'Mathematics',
    'Tesla Motors',
    'OpenAI',
    'C++ programming Language',
    'Military'
]

topic_contents = []
for topic in topics:
    try:
        content = wikipedia.page(topic).content

        if content:
            #Limit content to 200 words (it can be changed as per the need)
            limited_content = ' '.join(content.split()[:200])
            topic_contents.append(limited_content)
        else:
            print(f"Empty content for {topic}. Skipping.")
    except wikipedia.exceptions.PageError:
        print(f"Page for {topic} not found.")
    except wikipedia.exceptions.DisambiguationError:
        print(f"Ambiguous page for {topic}.")

#Combine and clean text data
combined_content = ' '.join(topic_contents)
cleaned_text = re.sub(r'[^a-zA-Z\s]', '', combined_content).lower()

# Tokenize
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts([cleaned_text])
tokens = tokenizer.texts_to_sequences([cleaned_text])[0]

#Create sequences of input and output data
seq_length = 30  #Limiting sequence length to 30 words
sequences = []
for i in range(0, len(tokens) - seq_length, 1):
    sequences.append(tokens[i:i + seq_length + 1])

sequences = np.array(sequences)

#Shuffle the sequences
np.random.shuffle(sequences)

#Split sequences into input and target
X = sequences[:, :-1]
y = sequences[:, -1]

print("Total number of tokens:", len(tokens))
print("Vocabulary size:", len(tokenizer.word_index))

#Check data validity
if sequences.any():
    print("Dataset shape:", sequences.shape)
else:
    print("The dataset is empty or contains invalid data.")




  lis = BeautifulSoup(html).find_all('li')


Ambiguous page for Physics.
Total number of tokens: 3136
Vocabulary size: 1140
Dataset shape: (3106, 31)


In [None]:
vocab_size = len(tokenizer.word_index) + 1  #Vocabulary size (+1 for padding)
embedding_dim = 256
rnn_units = 512

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=seq_length),
    tf.keras.layers.LSTM(256, return_sequences=True, activation='tanh'),
    tf.keras.layers.LSTM(512, return_sequences=True, activation='tanh'),
    tf.keras.layers.LSTM(512, activation='tanh'),
    tf.keras.layers.Dense(vocab_size, activation='softmax')])

#Compile the model
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss='sparse_categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

#Model Training
model.fit(X, y, batch_size=64, epochs=20, validation_split=0.2)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7be1404c9ed0>

In [None]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 30, 256)           292096    
                                                                 
 lstm_6 (LSTM)               (None, 30, 256)           525312    
                                                                 
 lstm_7 (LSTM)               (None, 30, 512)           1574912   
                                                                 
 lstm_8 (LSTM)               (None, 512)               2099200   
                                                                 
 dense_2 (Dense)             (None, 1141)              585333    
                                                                 
Total params: 5076853 (19.37 MB)
Trainable params: 5076853 (19.37 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
#Function to generate text using the trained model
def generate_text_with_temperature(topic, next_words, temperature=1.0):
    generated_text = topic.lower()
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([generated_text])[0]
        token_list = tf.keras.preprocessing.sequence.pad_sequences([token_list], maxlen=seq_length, padding='pre')

        predicted = model.predict(token_list, verbose=0)[-1]  # Last prediction
        predicted = np.log(predicted) / temperature
        exp_preds = np.exp(predicted)
        predicted = exp_preds / np.sum(exp_preds)

        predicted_index = np.random.choice(len(predicted), p=predicted)
        output_word = tokenizer.index_word.get(predicted_index, "")

        generated_text += " " + output_word
    return generated_text

#Prompt the model for information related to a topic with temperature control
topic = input('Ask me from the listed topics.....\n')
generated_info = generate_text_with_temperature(topic, next_words=50, temperature=1)
print(generated_info)


Ask me from the listed topics.....
Tesla Motors
tesla motors diverse internal it in the of greek underlying naturallanguage forms may intelligence identify was to cannot elon premise as a and concerns games led refers as business knowledge nature nonprofit or and andin fundamental such subfield goal naturallanguage data usually a not history the range swarm programs machine learning datadriven


In [None]:
from language_tool_python import LanguageTool

# Assuming `generated_text` contains the output generated by your model
generated_text = "Your generated text here..."

# Create a LanguageTool instance
tool = LanguageTool('en-US')

# Check and correct grammar errors in the generated text
matches = tool.check(generated_text)
corrected_text = tool.correct(generated_info)

# Print corrected text
print("Original Text:")
print(generated_info)
print("\nCorrected Text:")
print(corrected_text)

Original Text:
tesla motors diverse internal it in the of greek underlying naturallanguage forms may intelligence identify was to cannot elon premise as a and concerns games led refers as business knowledge nature nonprofit or and andin fundamental such subfield goal naturallanguage data usually a not history the range swarm programs machine learning datadriven

Corrected Text:
Tesla motors diverse internal it in the of Greek underlying natural language forms may intelligence identify was to cannot Elon premise as a and concerns games led refers as business knowledge nature nonprofit or and an din fundamental such subfield goal natural language data usually a not history the range swarm programs machine learning data driven


In [None]:
import pickle as pe
tokenizer_file = 'tokenizer_model.pkl'
with open(tokenizer_file, 'wb') as token_file:
    pe.dump(tokenizer, token_file)

In [None]:
#Save the model
model.save("Chat_pedia_model.h5")

#Load the model
#loaded_model = tf.keras.models.load_model("Chat_pedia_model.h5")


In [None]:
!pip install language-tool-python

Collecting language-tool-python
  Downloading language_tool_python-2.7.1-py3-none-any.whl (34 kB)
Installing collected packages: language-tool-python
Successfully installed language-tool-python-2.7.1


In [7]:
import pickle

with open('tokenizer_model.pkl', 'rb') as file:
    tokenizer1 = pickle.load(file)


In [12]:
import json

tokenizer_config = tokenizer1.to_json()

with open('tokenizer_config.json', 'w') as json_file:
    json.dump(tokenizer_config, json_file)

In [9]:
tokenizer1

<keras.src.preprocessing.text.Tokenizer at 0x7d574ef5a590>