In [37]:
pip install tensorflow-gpu




In [38]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import TextConverter
from io import StringIO
from pdfminer.pdfpage import PDFPage

In [39]:
def get_pdf_file_content(path_to_pdf):
    '''
    Extract the text of every page of a PDF file with pdfminer.

    path_to_pdf: path of the PDF file whose content we want to extract.

    Returns a single string containing the text written by the TextConverter
    for all pages, in the order PDFPage.get_pages yields them.

    Exceptions raised while opening or parsing the file propagate to the
    caller; the file handle, the converter and the text buffer are released
    in every case.
    '''
    # PDFResourceManager stores shared resources such as fonts or images
    # that we might encounter in the file.
    resource_manager = PDFResourceManager(caching=True)

    # LAParams holds the layout parameters; the default values are used.
    la_params = LAParams()

    # The StringIO buffer receives the text representation of the PDF.
    # Using it as a context manager guarantees it is closed on every path.
    with StringIO() as out_text:
        # The converter writes the text of each processed page into out_text.
        text_converter = TextConverter(resource_manager, out_text, laparams=la_params)
        try:
            # The interpreter feeds page content to the converter.
            interpreter = PDFPageInterpreter(resource_manager, text_converter)

            # Open the PDF in binary mode; the `with` block closes the handle
            # even if a page fails to parse.
            with open(path_to_pdf, 'rb') as fp:
                for page in PDFPage.get_pages(fp, pagenos=set(), maxpages=0, password="",
                                              caching=True, check_extractable=True):
                    interpreter.process_page(page)

            # The buffer content must be read before the buffer is closed.
            return out_text.getvalue()
        finally:
            text_converter.close()

In [40]:
# Location of the PDF to process, relative to the notebook's working directory.
path_to_pdf = "./1220.pdf"

In [41]:
 #print(get_pdf_file_content(path_to_pdf))

In [42]:
import tensorflow as tf
from tensorflow.keras import layers

In [43]:
import io
import re
import string
import tqdm

import numpy as np

import tensorflow as tf
from tensorflow.keras import layers

In [44]:
# Pull the raw text out of the PDF, then lowercase it and split on whitespace.
# Punctuation stays attached to the words (no further cleaning is done here).
sentence = get_pdf_file_content(path_to_pdf)
tokens = sentence.lower().split()
print(len(tokens))

3804


In [45]:
# '<pad>' owns index 0; every other token gets the next free index (starting
# at 1) the first time it is seen, so indices follow order of first appearance.
vocab = {'<pad>': 0}
for token in tokens:
  # len(vocab) is always the next unused index; setdefault leaves known tokens alone.
  vocab.setdefault(token, len(vocab))
index = len(vocab)  # next unused index
vocab_size = len(vocab)
print(vocab)

{'<pad>': 0, 'http://judis.nic.in': 1, 'supreme': 2, 'court': 3, 'of': 4, 'india': 5, 'page': 6, '1': 7, '7': 8, 'petitioner:': 9, 'commissioner': 10, 'income-tax,west': 11, 'bengal': 12, 'vs.': 13, 'respondent:': 14, 'calcutta': 15, 'agency': 16, 'ltd.': 17, 'date': 18, 'judgment:': 19, '21/12/1950': 20, 'bench:': 21, 'kania,': 22, 'hiralal': 23, 'j.': 24, '(cj)': 25, 'sastri,': 26, 'm.': 27, 'patanjali': 28, 'das,': 29, 'sudhi': 30, 'ranjan': 31, 'citation:': 32, '1951': 33, 'air': 34, '108': 35, '1950': 36, 'scr': 37, '1008': 38, 'citator': 39, 'info': 40, ':': 41, 'd': 42, '1965': 43, 'sc1905': 44, '(6)': 45, 'r': 46, '1976': 47, 'sc': 48, '772': 49, 'act:': 50, 'lndian': 51, 'income-tax': 52, 'act': 53, '(xi': 54, '1922),': 55, 'ss.': 56, '10': 57, '(2)': 58, '(xv),': 59, '66--reference--jurisdiction': 60, 'high': 61, 'court--duty': 62, 'to': 63, 'decide': 64, 'case': 65, 'on': 66, 'facts': 67, 'stated': 68, 'by': 69, 'tribunal--accepting': 70, 'arguments': 71, 'counsel': 72, 'as'

In [46]:
# Reverse lookup table: integer index -> token.
inverse_vocab = dict(zip(vocab.values(), vocab.keys()))
print(inverse_vocab)

{0: '<pad>', 1: 'http://judis.nic.in', 2: 'supreme', 3: 'court', 4: 'of', 5: 'india', 6: 'page', 7: '1', 8: '7', 9: 'petitioner:', 10: 'commissioner', 11: 'income-tax,west', 12: 'bengal', 13: 'vs.', 14: 'respondent:', 15: 'calcutta', 16: 'agency', 17: 'ltd.', 18: 'date', 19: 'judgment:', 20: '21/12/1950', 21: 'bench:', 22: 'kania,', 23: 'hiralal', 24: 'j.', 25: '(cj)', 26: 'sastri,', 27: 'm.', 28: 'patanjali', 29: 'das,', 30: 'sudhi', 31: 'ranjan', 32: 'citation:', 33: '1951', 34: 'air', 35: '108', 36: '1950', 37: 'scr', 38: '1008', 39: 'citator', 40: 'info', 41: ':', 42: 'd', 43: '1965', 44: 'sc1905', 45: '(6)', 46: 'r', 47: '1976', 48: 'sc', 49: '772', 50: 'act:', 51: 'lndian', 52: 'income-tax', 53: 'act', 54: '(xi', 55: '1922),', 56: 'ss.', 57: '10', 58: '(2)', 59: '(xv),', 60: '66--reference--jurisdiction', 61: 'high', 62: 'court--duty', 63: 'to', 64: 'decide', 65: 'case', 66: 'on', 67: 'facts', 68: 'stated', 69: 'by', 70: 'tribunal--accepting', 71: 'arguments', 72: 'counsel', 73: 

In [47]:
# Encode the token stream as the corresponding list of vocabulary indices.
example_sequence = list(map(vocab.__getitem__, tokens))
print(example_sequence)

[1, 2, 3, 4, 5, 6, 7, 4, 8, 9, 10, 4, 11, 12, 13, 14, 15, 16, 17, 18, 4, 19, 20, 21, 22, 23, 24, 25, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 45, 50, 51, 52, 53, 54, 4, 55, 56, 57, 58, 59, 60, 4, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 4, 72, 73, 74, 67, 75, 76, 77, 66, 78, 79, 80, 81, 82, 63, 83, 84, 4, 85, 4, 86, 4, 87, 88, 89, 90, 4, 89, 61, 3, 91, 89, 92, 4, 93, 94, 95, 96, 97, 90, 75, 98, 89, 99, 53, 89, 77, 4, 89, 100, 101, 66, 67, 95, 102, 103, 104, 105, 106, 107, 108, 66, 89, 109, 110, 111, 112, 113, 114, 115, 116, 89, 117, 66, 67, 118, 69, 89, 119, 104, 95, 120, 89, 121, 4, 89, 61, 3, 63, 122, 69, 123, 124, 89, 67, 125, 69, 89, 101, 75, 126, 89, 127, 4, 128, 66, 110, 129, 104, 95, 130, 131, 63, 132, 133, 134, 135, 4, 128, 73, 104, 136, 137, 89, 61, 3, 138, 139, 140, 141, 142, 143, 104, 95, 144, 98, 89, 97, 145, 73, 89, 146, 4, 147, 65, 148, 69, 89, 100, 101, 91, 149, 150, 89, 151, 152, 98, 89, 153, 

In [48]:
window_size = 2  # choosing window size as 2
SEED = 42  # fixed seed so the shuffled skip-gram order is identical on every run

# Generate only positive (target, context) pairs: negative_samples=0 disables
# the built-in negative sampling, so the returned labels are discarded.
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
      example_sequence,
      vocabulary_size=vocab_size,
      window_size=window_size,
      negative_samples=0,
      seed=SEED)
print(len(positive_skip_grams))

15210


In [49]:
# Show the first five positive skip-grams, both as index pairs and as the
# words those indices map back to.
for target, context in positive_skip_grams[:5]:
  print(f"({target}, {context}): ({inverse_vocab[target]}, {inverse_vocab[context]})")

(72, 74): (counsel, poved)
(698, 172): (appellant, agreed)
(96, 266): (an, laid)
(682, 150): (view, with)
(137, 128): (convert, law)


In [50]:
# Number of negative samples drawn for each positive context word.
num_ns = 4

# Use the first positive skip-gram as the worked example.
target_word, context_word = positive_skip_grams[0]

# The positive context index as an int64 tensor of shape (1, 1).
context_class = tf.constant([[context_word]], dtype="int64")

# Draw `num_ns` distinct candidate indices; only the sampled candidates are
# kept, the two expected-count outputs are discarded.
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class,  # the class treated as 'positive'
    num_true=1,  # one positive context class per skip-gram
    num_sampled=num_ns,  # how many negative context words to draw
    unique=True,  # no duplicates among the negatives
    range_max=vocab_size,  # indices are drawn from [0, vocab_size)
    # seed=SEED,  # enable (with SEED defined) for reproducible draws
    name="negative_sampling"  # name of this operation
)
print(negative_sampling_candidates)
print([inverse_vocab[idx] for idx in negative_sampling_candidates.numpy()])

tf.Tensor([  5  42 216   0], shape=(4,), dtype=int64)
['india', 'd', 'deduct', '<pad>']


In [51]:
# Give the negatives a trailing axis so they can be concatenated with the
# (1, 1) positive context. NOTE: this rebinds the same name, so the cell is
# not idempotent: re-running it adds yet another axis.
negative_sampling_candidates = tf.expand_dims(negative_sampling_candidates, axis=1)

# Positive context first, negatives after it, flattened to shape `(num_ns+1,)`.
context = tf.squeeze(tf.concat([context_class, negative_sampling_candidates], axis=0))

# One `1` for the positive context followed by `num_ns` `0`s for the negatives.
label = tf.squeeze(tf.constant([1] + [0] * num_ns, dtype="int64"))

# The target is a single index, so it becomes a scalar tensor.
target = tf.squeeze(target_word)

In [52]:
# Summarise the training example: the target index and its word, the context
# indices (positive first, then the negatives) with their words, and the labels.
print(f"target_index    : {target}")
print(f"target_word     : {inverse_vocab[target_word]}")
print(f"context_indices : {context}")
print(f"context_words   : {[inverse_vocab[c.numpy()] for c in context]}")
print(f"label           : {label}")

target_index    : 72
target_word     : counsel
context_indices : [ 74   5  42 216   0]
context_words   : ['poved', 'india', 'd', 'deduct', '<pad>']
label           : [1 0 0 0 0]


In [53]:
# Print the raw tensors to inspect their shapes and dtypes.
print("target  :", target)
print("context :", context)
print("label   :", label)

target  : tf.Tensor(72, shape=(), dtype=int32)
context : tf.Tensor([ 74   5  42 216   0], shape=(5,), dtype=int64)
label   : tf.Tensor([1 0 0 0 0], shape=(5,), dtype=int64)


In [54]:
# Build a sampling table with 10 entries and display it.
# NOTE(review): presumably entry i is the probability of sampling the i-th
# most frequent word, as described in the Keras docs. Confirm before use.
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(size=10)
print(sampling_table)

[0.00315225 0.00315225 0.00547597 0.00741556 0.00912817 0.01068435
 0.01212381 0.01347162 0.01474487 0.0159558 ]
