<a href="https://colab.research.google.com/github/RajMV05102004/DeepLearning/blob/MiniProjects/WikiWordPredictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
import pandas as pd

# Read the file assuming it's space-separated or tab-separated
df = pd.read_csv(r"/content/wiki.train.tokens", sep="\t", header=None, names=["Text"])



In [31]:
valid_df=pd.read_csv(r"/content/wiki.valid.tokens", sep="\t", header=None, names=["Text"])

In [5]:
df.size

591040

In [19]:
df=df.iloc[:500,:]

In [7]:
valid_df.size

2461

In [8]:
valid_df.sample(5)

Unnamed: 0,Text
2181,"Starr has performed "" Back Off Boogaloo "" in ..."
1792,"= = = New Zealand Wars , 1861 – 64 = = ="
933,In the 5th century the power of the city reac...
1351,"During Hu 's lifetime , the Ming dynasty , wh..."
1032,Stela 43 is paired with Altar 35 . It is a pl...


In [9]:
def preprocess(df,col):
  #Replacing unk with empty string
  df[col] = df[col].str.replace("<unk>", "")
  # Remove all special characters using regex
  df[col] = df[col].str.replace(r"[^a-zA-Z\s]", "", regex=True)
  #Convert everything to lowercase
  df[col]=df[col].str.lower()


In [10]:
preprocess(df,'Text')

In [11]:
preprocess(valid_df,'Text')

In [12]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [13]:
tokenizer=Tokenizer()

In [14]:
def fit_tokenizer(obj,tokenizer,col):
  tokenizer.fit_on_texts(obj[col])


In [15]:
fit_tokenizer(df,tokenizer,'Text')

In [16]:
fit_tokenizer(valid_df,tokenizer,'Text')

In [17]:
tokenizer.word_index

{'the': 1,
 'of': 2,
 'and': 3,
 'in': 4,
 'to': 5,
 'a': 6,
 'was': 7,
 's': 8,
 'on': 9,
 'as': 10,
 'that': 11,
 'for': 12,
 'with': 13,
 'by': 14,
 'at': 15,
 'were': 16,
 'from': 17,
 'is': 18,
 'it': 19,
 'he': 20,
 'his': 21,
 'had': 22,
 'an': 23,
 'which': 24,
 'their': 25,
 'this': 26,
 'but': 27,
 'not': 28,
 'be': 29,
 'one': 30,
 'two': 31,
 'after': 32,
 'they': 33,
 'first': 34,
 'also': 35,
 'its': 36,
 'during': 37,
 'been': 38,
 'are': 39,
 'have': 40,
 'new': 41,
 'her': 42,
 'has': 43,
 'who': 44,
 'th': 45,
 'into': 46,
 'she': 47,
 'us': 48,
 'or': 49,
 'time': 50,
 'other': 51,
 'city': 52,
 'all': 53,
 'when': 54,
 'over': 55,
 'war': 56,
 'while': 57,
 'south': 58,
 'team': 59,
 'm': 60,
 'would': 61,
 'more': 62,
 'i': 63,
 'three': 64,
 'between': 65,
 'there': 66,
 'later': 67,
 'route': 68,
 'against': 69,
 'state': 70,
 'north': 71,
 'australian': 72,
 'song': 73,
 'some': 74,
 'about': 75,
 'may': 76,
 'only': 77,
 'part': 78,
 'out': 79,
 'where': 80,
 '

In [None]:
tokenizer.word_counts

In [19]:
len(tokenizer.word_index)

17373

In [20]:
Train_data=df.copy()# Storing the original DataFrame
Valid_data=df.copy()# Storing the Validity DataFrame

In [21]:
#We are creating a dataset where a sequence of words are stored in an non-decreasing manner
def createDataset(df,tokenizer):
  input_sequence=[]

  for sentence in df["Text"]:
    tokennized_sentence=tokenizer.texts_to_sequences([sentence])[0]

    for i in range(1,len(tokennized_sentence)):
      n_gram=tokennized_sentence[:i+1]
      input_sequence.append(n_gram)
  return input_sequence

In [22]:
input_sequence=createDataset(df,tokenizer)

In [23]:
valid_input_sequence=createDataset(valid_df,tokenizer)

In [None]:
valid_input_sequence[:]

In [25]:
len(valid_input_sequence)

172638

In [26]:
#We need the maximum length in the input sequence
valid_maxlen=max(len(x) for x in valid_input_sequence)

In [27]:
#We need the maximum length in the input sequence
maxlen=max(len(x) for x in input_sequence)

In [28]:
maxlen# This is the  maximum size of the input sequence

351

In [29]:
valid_maxlen

346

In [30]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [31]:
def padding(sequence,maxlen):
  #now we will pad the input sequences to the maxlen
  return pad_sequences(sequence,maxlen=maxlen,padding='pre')

In [32]:
input_padded_sequence=padding(input_sequence,maxlen)
valid_padded_sequence=padding(valid_input_sequence,valid_maxlen)

In [33]:
import pickle

#Saving the processed padded sequence in a pickle file

In [34]:
with open('input_padded_sequence.pkl', 'wb') as f:
    pickle.dump((input_padded_sequence, tokenizer), f)

In [35]:
with open('valid_padded_sequence.pkl', 'wb') as f:
    pickle.dump((valid_padded_sequence, tokenizer), f)

# Preprocessing is done till here
Now loading the Test and Validation sequences

In [1]:
import pickle

In [2]:
with open('/content/input_padded_sequence.pkl', 'rb') as f:
    input_padded_sequence, tokenizer = pickle.load(f)


In [3]:
# with open('/content/valid_padded_sequence.pkl', 'rb') as f:
#     valid_padded_sequence, tokenizer = pickle.load(f)


In [4]:
X_train=input_padded_sequence[:,:-1]
y_train=input_padded_sequence[:,-1]
# #y=np.expand_dims(y,axis=1)
# X_val=valid_padded_sequence[:,:-1]
# y_val=valid_padded_sequence[:,-1]

In [5]:
from tensorflow.keras.utils import to_categorical

# Input:

1. y: The labels (target values) for your dataset. These are typically integer-encoded class labels (e.g., [0, 1, 2, ...]).

2. tokenizer: A tokenizer object (e.g., from Keras' Tokenizer class) that has been fitted on the text data. It contains the vocabulary and word-to-index mappings.

# Purpose:

The function converts the integer-encoded labels (y) into a one-hot encoded format, which is required for multiclass classification problems when using a softmax activation in the output layer.

# to_categorical:

1. This is a utility function from Keras (keras.utils.to_categorical) that converts a class vector (integers) into a binary class matrix (one-hot encoding).

2. For example, if y = [0, 1, 2] and num_classes=3, the output will be:
  [[1., 0., 0.],
  [0., 1., 0.],
  [0., 0., 1.]]
  num_classes=len(tokenizer.word_index)+1:

  len(tokenizer.word_index) gives the size of the vocabulary (number of unique words).

3. +1 is added to account for padding or unknown tokens (if any).

4. This ensures that the one-hot encoded vectors have the correct dimensionality, matching the number of classes (words in the vocabulary).

In [6]:
def preprocess_labels(y,tokenizer):
  #Applying categorical transformation to make it a multiclass classification problem
  return to_categorical(y,num_classes=len(tokenizer.word_index)+1)

In [7]:
y_train=preprocess_labels(y_train,tokenizer)
# y_val=preprocess_labels(y_val,tokenizer)

Model1 Structure:
1. Embedding Layer
2. Bidirectional LSTM layer
3. Bidirectional LSTM layer
4. Dense Layer

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Bidirectional,LSTM,Dense

In [9]:
model1=Sequential()
model1.add(Embedding(input_dim=len(tokenizer.word_index)+1,output_dim=200))
'''
output_dim: This is the size of the word vectors (embeddings). You're setting it to 200, meaning each word will be represented by a 200-dimensional vector.
This layer converts each word index (from the tokenizer) into a dense embedding vector
'''
model1.add(Bidirectional(LSTM(256,return_sequences=True)))
model1.add(Bidirectional(LSTM(256)))
model1.add(Dense(len(tokenizer.word_index)+1,activation='softmax'))

Summary of the Model
Input: Integer-encoded sequences of words (from the tokenizer).

Embedding Layer: Converts words into dense 200-dimensional vectors.

Bidirectional LSTMs: Two layers of bidirectional LSTMs process the sequence to capture contextual information.

Output Layer: A dense layer with softmax activation predicts the next word (or class) based on the processed sequence.

In [10]:
model1.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

In [53]:
model1.summary()

In [11]:
model1.fit(X_train,y_train,batch_size=128,epochs=80)

Epoch 1/80
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 265ms/step - accuracy: 0.0796 - loss: 7.7720
Epoch 2/80
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 285ms/step - accuracy: 0.0896 - loss: 6.5993
Epoch 3/80
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 288ms/step - accuracy: 0.1111 - loss: 6.3353
Epoch 4/80
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 286ms/step - accuracy: 0.1261 - loss: 6.1232
Epoch 5/80
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 287ms/step - accuracy: 0.1406 - loss: 5.9643
Epoch 6/80
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 287ms/step - accuracy: 0.1529 - loss: 5.8222
Epoch 7/80
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 287ms/step - accuracy: 0.1562 - loss: 5.7065
Epoch 8/80
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 287ms/step - accuracy: 0.1656 - loss: 5.5650
Epoch 9/80
[1m2

<keras.src.callbacks.history.History at 0x78550c4eb590>

In [12]:
model1.save('model.h5')  # Saves architecture, optimizer state, and weights




In [13]:
len(tokenizer.word_index)

17373

In [29]:
# prompt: import pad sequences,numpy

import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [26]:
#General Method for testing
def Predict(text,model):
  for i in range(5):
    #tokenize
    token_text=tokenizer.texts_to_sequences([text])
    #token_text[0]
    #Padding
    padded_token_text=pad_sequences(token_text,maxlen=len(tokenizer.word_index)+1,padding='pre')
    #padded_token_text
    #Predict
    arg=np.argmax(model.predict(padded_token_text)[0])
    for word,index in tokenizer.word_index.items():
      if index==arg:
        text=text+' '+word
        print(text)
        break

In [24]:
df.sample(40)

Unnamed: 0,Text
358,"As of September 2015 , Fernandez has several ..."
68,Lt. Col. Dunnington continued to build up his...
47,The Little Rock Arsenal was classified in 186...
475,"In official writings , pharaohs are said to b..."
410,"At the outbreak of World War I , Erzherzog Fe..."
156,"The Children ’ s Book of Hymns ; Blackie , 19..."
271,= = = Goaltenders = = =
279,= Saves ; Sv % =
55,<unk> <unk> 2
224,The plain maskray generally hunts at the surf...


In [49]:
valid_df.sample(20)

Unnamed: 0,Text
1053,= Mount Elbert =
1768,The scene where SpongeBob and Patrick playing...
1493,= = Legacy = =
2410,Joaquin Martinez as an elderly cigar factory ...
1408,The official unveiling by Linford Christie to...
428,Steven Klein – Photography
449,"Reasons for upgrading the interchange , in ad..."
464,= = Racing career = =
1682,<unk> Price - background vocals
1635,The Butterfly World Tour was the third concer...


In [50]:
Words=['The game takes ','The Tower Building of','The game began','Humans had free will to','In the early','How to']

In [51]:
Valid_Words=['The mountains','The United','The response','The film','Eventually the British ']

In [52]:
for word in Valid_Words:
  Predict(word,model1)
  print('-----------------------------------------------------------')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 773ms/step
The mountains and
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 652ms/step
The mountains and the
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 661ms/step
The mountains and the pharaoh
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 761ms/step
The mountains and the pharaoh s
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 961ms/step
The mountains and the pharaoh s book
-----------------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 654ms/step
The United states
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 672ms/step
The United states and
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 676ms/step
The United states and specialized
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 659ms/step
The United states and specialized four
[1m1/1[0m [32m━━━━━━━━━━━━━━

In [30]:
for word in Words:
  Predict(word,model1)
  print('-----------------------------------------------------------')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
The game takes  place
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 666ms/step
The game takes  place during
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 657ms/step
The game takes  place during the
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 670ms/step
The game takes  place during the request
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 659ms/step
The game takes  place during the request of
-----------------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 657ms/step
The Tower Building of the
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 663ms/step
The Tower Building of the divine
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 932ms/step
The Tower Building of the divine the
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 685ms/step
The Tower Building of