# Data Import

In [1]:
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import numpy as np
import re



# Download the nltk data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:

# Define the URL to scrape
url = "https://en.wikipedia.org/wiki/Natural_language_processing"

# Fetch the content from the URL
response = requests.get(url)

# Parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

# Extract all paragraph tags
paragraphs = soup.find_all('p')

# Create a corpus list
corpus = []

# Extract text from each paragraph and clean it
for para in paragraphs:
    text = para.get_text()
    # Simple cleaning to remove references like [1][2]
    clean_text = ' '.join(text.replace('\n', ' ').split())
    corpus.append(clean_text)

# print corpus
for paragraph in corpus:
    print(paragraph)
    print("---------")
print(len(corpus))

Natural language processing (NLP) is an interdisciplinary subfield of computer science and artificial intelligence. It is primarily concerned with providing computers the ability to process data encoded in natural language and is thus closely related to information retrieval, knowledge representation and computational linguistics, a subfield of linguistics. Typically data is collected in text corpora, using either rule-based, statistical or neural-based approaches of machine learning and deep learning.
---------
Major tasks in natural language processing are speech recognition, text classification, natural-language understanding, and natural-language generation.
---------
Natural language processing has its roots in the 1940s.[1] Already in 1940, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence, though at the time that was not articulated as a problem separate from artificial 

In [3]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    # Tokenize and normalize case
    tokens = nltk.word_tokenize(text.lower())
    # Remove stop words and punctuations
    filtered_tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalnum() and word not in stop_words]
    return filtered_tokens

# Apply preprocessing to each document in the corpus
processed_corpus = [preprocess(text) for text in corpus]

# Print the processed corpus
for document in processed_corpus:
    print(document)
    print("-----")
print(str(len(processed_corpus)) + ' documents in corpus' )

['natural', 'language', 'processing', 'nlp', 'interdisciplinary', 'subfield', 'computer', 'science', 'artificial', 'intelligence', 'primarily', 'concerned', 'providing', 'computer', 'ability', 'process', 'data', 'encoded', 'natural', 'language', 'thus', 'closely', 'related', 'information', 'retrieval', 'knowledge', 'representation', 'computational', 'linguistics', 'subfield', 'linguistics', 'typically', 'data', 'collected', 'text', 'corpus', 'using', 'either', 'statistical', 'approach', 'machine', 'learning', 'deep', 'learning']
-----
['major', 'task', 'natural', 'language', 'processing', 'speech', 'recognition', 'text', 'classification', 'understanding', 'generation']
-----
['natural', 'language', 'processing', 'root', '1940s', '1', 'already', '1940', 'alan', 'turing', 'published', 'article', 'titled', 'computing', 'machinery', 'intelligence', 'proposed', 'called', 'turing', 'test', 'criterion', 'intelligence', 'though', 'time', 'articulated', 'problem', 'separate', 'artificial', 'int

In [4]:
from gensim.models import Word2Vec

# Initialize and train a Word2Vec model
embedded_words = Word2Vec(sentences=processed_corpus, vector_size=50, window=5, min_count=1, workers=4)
embedded_words.save("word2vec.model")
# Use the model to get the vector of a word
word_vector = embedded_words.wv['nlp']  # 'nlp' is the word for which you want the vector

# Output the vector
print(word_vector)

print(embedded_words.wv.most_similar('nlp'))

[-0.01711663  0.00727925  0.01039552  0.01158247  0.01476423 -0.01260179
  0.00225464  0.01256732 -0.0061258  -0.0124349  -0.00091152 -0.01682046
 -0.0110077   0.01431822  0.00634701  0.01457427  0.01393051  0.01519224
 -0.00788206 -0.00125845  0.00451408 -0.00897556  0.01715351 -0.01984543
  0.01385504  0.00608646 -0.00993883  0.00884507 -0.00406971  0.01363702
  0.02010466 -0.00862695 -0.00138743 -0.01128995  0.00740453  0.00602722
  0.01401703  0.01228605  0.01909166  0.01818954  0.01611374 -0.0140514
 -0.01852454 -0.00046264 -0.00545288  0.01591346  0.01197275 -0.00340572
  0.00306722  0.00361921]
[('speech', 0.3672628104686737), ('best', 0.3532448410987854), ('intermediate', 0.3450145125389099), ('moore', 0.3342324197292328), ('given', 0.3302149772644043), ('healthcare', 0.32867226004600525), ('subdivided', 0.3087236285209656), ('21', 0.3025331497192383), ('experiment', 0.30020180344581604), ('replaced', 0.29874563217163086)]


# RNN

Predicting the next word in a text with the straight forward aproach of assigning index to each word.

In [20]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SimpleRNN
from tensorflow.keras.utils import to_categorical
import numpy as np

# 'processed_corpus' is available from previous steps

# Tokenization and sequence creation
tokenizer = Tokenizer()
tokenizer.fit_on_texts(processed_corpus)
sequences = tokenizer.texts_to_sequences(processed_corpus)
print(sequences)


# Create input sequences and labels
input_sequences = []
for sequence in sequences:
    for i in range(1, len(sequence)):
        n_gram_sequence = sequence[:i+1]
        input_sequences.append(n_gram_sequence)
print(input_sequences)

# Pad sequences to ensure uniform input size
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
print(len(input_sequences))
print(len(input_sequences[0])) # all sequences in the same lenght

# Prepare predictors and labels
predictors, labels = input_sequences[:,:-1],input_sequences[:,-1]
labels = to_categorical(labels, num_classes=len(tokenizer.word_index) + 1)

print(predictors[0])
print(labels[0])


# Building the RNN Model
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=30, input_length=max_sequence_len - 1),
    SimpleRNN(150, return_sequences=True),
    Dropout(0.2),
    SimpleRNN(100),
    Dense(len(tokenizer.word_index) + 1, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

# Training the model
#model.fit(predictors, labels, epochs=100, verbose=1)

[[4, 1, 6, 2, 24, 45, 25, 26, 18, 10, 101, 102, 103, 25, 104, 27, 28, 105, 4, 1, 106, 46, 107, 108, 109, 29, 47, 19, 8, 45, 8, 110, 28, 111, 30, 48, 31, 112, 9, 3, 11, 12, 49, 12], [32, 7, 4, 1, 6, 113, 114, 30, 115, 33, 50], [4, 1, 6, 116, 117, 118, 119, 120, 121, 51, 122, 123, 124, 125, 126, 10, 52, 127, 51, 53, 128, 10, 54, 55, 129, 130, 131, 18, 10, 52, 53, 132, 7, 133, 134, 135, 50, 4, 1], [136, 20, 2, 137, 138, 56, 139, 140, 57, 141, 13, 56, 142, 143, 144, 145, 25, 146, 4, 1, 33, 2, 7, 147, 13, 28, 148], [34, 4, 1, 6, 35, 21, 149, 58, 13, 150, 59, 34, 60, 151, 4, 1, 6, 61, 11, 12, 36, 1, 6, 62, 152, 153, 19, 154, 63, 155, 156, 157, 158, 159, 160, 161, 8, 162, 16, 163, 164, 165, 166, 167, 48, 8, 168, 3, 1, 6, 169], [170, 14, 17, 55, 171, 9, 36, 172, 173, 64, 22, 37, 65, 174, 175, 14, 176, 66, 177, 14, 178, 179, 1, 67, 180, 181, 182], [183, 184, 185, 186, 187, 188, 68, 69, 70, 189, 190, 15, 23, 64, 22, 37, 1, 67, 191, 71, 192, 193, 72, 194, 195, 47, 12, 49, 15, 196, 38, 22, 37, 11,

In [6]:
def predict_next_word(model, tokenizer, seed_text, max_sequence_len):
    # Tokenize and pad the seed text to fit the model's input format
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

    # Predict the class probabilities for the next word
    probabilities = model.predict(token_list, verbose=0)[0]

    # Get the index of the most probable next word
    predicted_index = np.argmax(probabilities)

    # Map the index to the corresponding word
    predicted_word = None
    for word, index in tokenizer.word_index.items():
        if index == predicted_index:
            predicted_word = word
            break

    return predicted_word

seed_text = "Deep learning models"
for i in range(10):

    next_word = predict_next_word(model, tokenizer, seed_text, max_sequence_len)
    seed_text += " " + next_word
    print("text: ", seed_text)


text:  Deep learning models approach
text:  Deep learning models approach include
text:  Deep learning models approach include statistical
text:  Deep learning models approach include statistical neural
text:  Deep learning models approach include statistical neural network
text:  Deep learning models approach include statistical neural network hand
text:  Deep learning models approach include statistical neural network hand many
text:  Deep learning models approach include statistical neural network hand many advantage
text:  Deep learning models approach include statistical neural network hand many advantage symbolic
text:  Deep learning models approach include statistical neural network hand many advantage symbolic approach


# RNN - word2vec

In [21]:
# Assuming embedded_words is your Word2Vec model
word_vectors = embedded_words.wv  # Access the word vectors directly

# Prepare the embedding matrix
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, word_vectors.vector_size), dtype='float32')
for word, i in tokenizer.word_index.items():
    # Check if the word is in the word_vectors
    if word in word_vectors.key_to_index:
        # If word is in the model, retrieve the corresponding vector
        embedding_matrix[i] = word_vectors[word]


# Building the RNN Model with Word2Vec embeddings
model_word2vec_RNN = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=word_vectors.vector_size, weights=[embedding_matrix], input_length=max_sequence_len - 1, trainable=True),
    SimpleRNN(150, return_sequences=True),
    Dropout(0.2),
    SimpleRNN(100),
    Dense(len(tokenizer.word_index) + 1, activation='softmax')
])

model_word2vec_RNN.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_word2vec_RNN.summary())
# Training the model
model_word2vec_RNN.fit(predictors, labels, epochs=100, verbose=1)


Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 106, 50)           20750     
                                                                 
 simple_rnn_12 (SimpleRNN)   (None, 106, 150)          30150     
                                                                 
 dropout_7 (Dropout)         (None, 106, 150)          0         
                                                                 
 simple_rnn_13 (SimpleRNN)   (None, 100)               25100     
                                                                 
 dense_7 (Dense)             (None, 415)               41915     
                                                                 
Total params: 117915 (460.61 KB)
Trainable params: 117915 (460.61 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/1

<keras.src.callbacks.History at 0x7df0c9c5b310>

In [22]:
seed_text = "Deep learning models"
for i in range(10):

    next_word = predict_next_word(model_word2vec_RNN, tokenizer, seed_text, max_sequence_len)
    seed_text += " " + next_word
    print("text: ", seed_text)



text:  Deep learning models approach
text:  Deep learning models approach include
text:  Deep learning models approach include statistical
text:  Deep learning models approach include statistical neural
text:  Deep learning models approach include statistical neural network
text:  Deep learning models approach include statistical neural network hand
text:  Deep learning models approach include statistical neural network hand many
text:  Deep learning models approach include statistical neural network hand many advantage
text:  Deep learning models approach include statistical neural network hand many advantage symbolic
text:  Deep learning models approach include statistical neural network hand many advantage symbolic approach


now i will try to train with LSTM and word2vec embedding to see differences.

In [9]:
# Assuming embedded_words is your Word2Vec model
word_vectors = embedded_words.wv  # Access the word vectors directly

# Prepare the embedding matrix
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, word_vectors.vector_size), dtype='float32')
for word, i in tokenizer.word_index.items():
    # Check if the word is in the word_vectors
    if word in word_vectors.key_to_index:
        # If word is in the model, retrieve the corresponding vector
        embedding_matrix[i] = word_vectors[word]


# Building the RNN Model with Word2Vec embeddings
model_word2vec_LSTM = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=word_vectors.vector_size, weights=[embedding_matrix], input_length=max_sequence_len - 1, trainable=True),
    LSTM(150, return_sequences=True),
    Dropout(0.2),
    LSTM(100),
    Dense(len(tokenizer.word_index) + 1, activation='softmax')
])

model_word2vec_LSTM.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_word2vec_LSTM.summary())
# Training the model
model_word2vec_LSTM.fit(predictors, labels, epochs=100, verbose=1)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 106, 50)           20750     
                                                                 
 lstm (LSTM)                 (None, 106, 150)          120600    
                                                                 
 dropout_2 (Dropout)         (None, 106, 150)          0         
                                                                 
 lstm_1 (LSTM)               (None, 100)               100400    
                                                                 
 dense_2 (Dense)             (None, 415)               41915     
                                                                 
Total params: 283665 (1.08 MB)
Trainable params: 283665 (1.08 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/100
E

<keras.src.callbacks.History at 0x7df0d1bc89a0>

In [23]:
seed_text = "Deep learning models"
for i in range(10):

    next_word = predict_next_word(model_word2vec_LSTM, tokenizer, seed_text, max_sequence_len)
    seed_text += " " + next_word
    print("text: ", seed_text)

text:  Deep learning models approach
text:  Deep learning models approach based
text:  Deep learning models approach based transformation
text:  Deep learning models approach based transformation made
text:  Deep learning models approach based transformation made obsolete
text:  Deep learning models approach based transformation made obsolete room
text:  Deep learning models approach based transformation made obsolete room intermediate
text:  Deep learning models approach based transformation made obsolete room intermediate given
text:  Deep learning models approach based transformation made obsolete room intermediate given collection
text:  Deep learning models approach based transformation made obsolete room intermediate given collection rule


RNN or not, not a lot of difference, lets
#Evaluate

In [24]:
print(max_sequence_len)

107


In [40]:
from tensorflow.keras.losses import categorical_crossentropy
test_data = "machine learning is a field of study in artificial intelligence concerned with the development and"

# Tokenize the text
token_list = tokenizer.texts_to_sequences([test_data])[0]

# Ensure token list has enough tokens to form a sequence and a label
if len(token_list) > 1:
    # Prepare input sequences and labels
    input_sequences = []
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

    # Pad sequences to ensure uniform length
    max_sequence_len = max(len(x) for x in input_sequences)  # Find max length to pad
    padded_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

    # Predictors and actual next words
    predictors = padded_sequences[:, :-1]
    labels = padded_sequences[:, -1]

    # Predict the probability distribution for each input sequence
    predicted_probs_LSTM = model_word2vec_LSTM.predict(predictors, verbose=0)
    predicted_probs_RNN = model_word2vec_RNN.predict(predictors, verbose=0)

    # Convert labels to categorical for all words in each sequence
    actual_labels = to_categorical(labels, num_classes=len(tokenizer.word_index) + 1)

    # Calculate cross-entropy loss for each prediction against its actual next word
    loss_LSTM = tf.keras.losses.categorical_crossentropy(actual_labels, predicted_probs_LSTM)
    loss_RNN = tf.keras.losses.categorical_crossentropy(actual_labels, predicted_probs_RNN)
    perplexity_LSTM = tf.exp(tf.reduce_mean(loss_LSTM))
    perplexity_RNN = tf.exp(tf.reduce_mean(loss_RNN))
    print("Perplexity_LSTM:", perplexity_LSTM.numpy())
    print("Perplexity_RNN:", perplexity_RNN.numpy())
else:
    print("Not enough data to predict the next word.")


Perplexity_LSTM: 1753.0775
Perplexity_RNN: 4364.5254


On test data that is not part of the corpus, the LSTM makes better predictions.