In [1]:
import os
import re
import sys
import time
import random
import pickle
import datetime
import pandas as pd
from pprint import pprint

%matplotlib inline

In [2]:
import PyPDF2
from tqdm import tqdm

In [3]:
with open('data/ramayan_pages_as_list.pkl', 'rb') as file:
    data = pickle.load(file)

In [4]:
data = re.sub(pattern='\s{2,}', repl=' ', string=data)

In [5]:
import spacy

# Load the English language model
nlp = spacy.load("en_core_web_sm")

# Process the long paragraph with spaCy
doc = nlp(data)

# Extract sentences from the processed document
sentences = [sent.text for sent in doc.sents]

# The 'sentences' list now contains each sentence as a separate element
final_data = [[sent] for sent in sentences]

In [6]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Lambda
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

2024-03-06 21:08:39.395758: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
corpus = [sent[0] for sent in final_data]

In [8]:
len(corpus)

3781

In [9]:
corpus[30:36]

['Please purchase only authorized electronic editions, and do not participate in or encourage electronic piracy of copyrighted materials.',
 'Your support of the author’s rights is appreciated.',
 'http://us.penguingroup.com',
 'Introduction',
 'In the summer of 1988 sanitation workers across North India went on strike.',
 'Their demand was simple: they wanted the federal government to sponsor more episodes of a television serial based on the Indian epic Ramayana (Romance of Rama).']

In [10]:
# Convert the corpus to a sequence of integers
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)

In [11]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'to': 3,
 'of': 4,
 'a': 5,
 'in': 6,
 'his': 7,
 'he': 8,
 '”': 9,
 'you': 10,
 'that': 11,
 'i': 12,
 'was': 13,
 'rama': 14,
 'with': 15,
 'it': 16,
 'is': 17,
 'her': 18,
 'this': 19,
 'for': 20,
 'on': 21,
 'him': 22,
 'be': 23,
 'not': 24,
 'at': 25,
 'your': 26,
 'have': 27,
 'as': 28,
 'all': 29,
 'had': 30,
 'my': 31,
 'but': 32,
 'she': 33,
 'when': 34,
 'me': 35,
 'will': 36,
 'by': 37,
 'said': 38,
 'are': 39,
 'who': 40,
 'one': 41,
 'from': 42,
 'an': 43,
 'they': 44,
 'their': 45,
 'now': 46,
 'ravana': 47,
 'has': 48,
 'sita': 49,
 'or': 50,
 'no': 51,
 'out': 52,
 'if': 53,
 'were': 54,
 'do': 55,
 'what': 56,
 'so': 57,
 'we': 58,
 'which': 59,
 'there': 60,
 'them': 61,
 'rama’s': 62,
 'could': 63,
 'up': 64,
 'into': 65,
 'go': 66,
 'time': 67,
 'back': 68,
 'lakshmana': 69,
 'king': 70,
 'our': 71,
 'after': 72,
 'can': 73,
 'would': 74,
 'sugreeva': 75,
 'himself': 76,
 'how': 77,
 'should': 78,
 'only': 79,
 'come': 80,
 'before': 81,
 'any

In [12]:
len(tokenizer.word_index)

7583

In [18]:
# Define the parameters
vocab_size = len(tokenizer.word_index) + 1
embedding_size = 50
window_size = 4

In [19]:
# Generate the context-target pairs
contexts = []
targets = []
for sequence in sequences:
    for i in range(window_size, len(sequence) - window_size):
        context = sequence[i - window_size:i] + sequence[i + 1:i + window_size + 1]
        target = sequence[i]
        contexts.append(context)
        targets.append(target)

In [20]:
contexts[0], targets[0]

([2545, 4, 3794, 120, 2546, 2547, 2548, 1952], 1)

In [21]:
# Convert the contexts and targets to numpy arrays
X = np.array(contexts)
y = np.array(targets)

In [26]:
# Define the CBOW model
model = Sequential()
model.add(Embedding(
    input_dim=vocab_size,
    output_dim=embedding_size,
    input_length=2*window_size))
model.add(Lambda(lambda x: tf.reduce_mean(x, axis=1)))
model.add(Dense(units=vocab_size, activation='softmax'))
# model.compile(optimizer, loss)

In [29]:
# model.fit(
#     x=X,
#     y=y,
#     batch_size=10,
#     epochs=10,
#     verbose=['auto', 0, 1, 2][-1],
#     callbacks=None,
#     validation_split=0.2,
#     validation_data=None,
#     shuffle=False,
#     class_weight=None,
#     sample_weight=None,
#     initial_epoch=0,
#     steps_per_epoch=None,
#     validation_steps=None,
#     validation_batch_size=None,
#     validation_freq=1,
#     max_queue_size=10,
#     workers=-1,
#     use_multiprocessing=True,
# )

RuntimeError: You must compile your model before training/testing. Use `model.compile(optimizer, loss)`.

In [None]:
# from tensorflow.keras.utils import plot_model

# plot_model(model, to_file='cbow_model.png', show_shapes=True)  # Optional: save to file and show shapes

In [None]:
model.save_weights('ramayan_cbow_weights.h5')

In [None]:
# Load the pre-trained weights
model.load_weights('ramayan_cbow_weights.h5')

In [None]:
 # Get the word embeddings
embeddings = model.get_weights()[0]

In [None]:
# Perform PCA to reduce the dimensionality of the embeddings
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(embeddings)

In [None]:
# # Visualize the embeddings
# plt.figure(figsize=(300, 300))
# # for i, word in enumerate(tokenizer.word_index.keys()):
# for i, word in enumerate(list(tokenizer.word_index.keys())):
#     x, y = reduced_embeddings[i]
#     plt.scatter(x, y)
#     plt.annotate(word, xy=(x, y), xytext=(5, 2),
#                  textcoords='offset points',
#                  ha='right', va='bottom')

# # plt.show()
# plt.savefig('ramayan_word_embedding_plot.png')

In [None]:
model.get_weights()[0].shape