# Importing Libraries

In [12]:
import boto3
import os
import time
from trp import Document
from botocore.config import Config
from PIL import Image, ImageDraw

import warnings
warnings.filterwarnings("ignore")


import calendar
import re
import nltk
import torch
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from transformers import AutoTokenizer,GPT2TokenizerFast

In [13]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cloudcraftz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/cloudcraftz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/cloudcraftz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/cloudcraftz/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Extract text from a image

In [5]:
def image_text_extractor(path:str)->str:
    '''
    This function extracts text from a given image 
    '''
    with open(path, 'rb') as image:
        img = bytearray(image.read())
        client = boto3.client('textract',region_name='ap-south-1')
        response = client.detect_document_text(
            Document={'Bytes': img}
        )
    text = ""
    for item in response["Blocks"]:
        if item["BlockType"] == "LINE":
            text = text + " "+item["Text"]

    return text

In [6]:
image_text_extractor('/home/cloudcraftz/Desktop/Test_3.jpg')

" Multi-Objective reward generalization: Improving performance of Deep Reinforcement Learning for selected applications in stock and cryptocurrency trading Federico Cornalba* Constantin Disselkamp**S Institute of Science and Technology Austria Davide Scassola** federico.cornalba@ist.ac.at Christopher Helf TRALITY GmbH constantin@trality.com davide@trality.com christopher@trality.com ABSTRACT the state variables and action (s,a determine the next state of the We investigate the potential of Multi-Objective, Deep Reinforce- environment that the algorithm will visit. The algorithm's ultimate ment Learning for stock and cryptocurrency trading. More specifi- goal is to maximise a cumulative reward (accounting for all actions cally, we build on the generalized setting à la Fontaine and Friedman taken in a given episode), which is in turn based on a pre-specified [6] (where the reward weighting mechanism is not specified a pri- state-action reward function r (assigning a numerical reward r(s,

# Extract text from a document

In [7]:
## Textract APIs used - "start_document_text_detection", "get_document_text_detection"
def InvokeTextDetectJob(s3BucketName, objectName):
    response = None
    client = boto3.client('textract',region_name='ap-south-1')
    
    
    response = client.start_document_text_detection(
            DocumentLocation={
                      'S3Object': {
                                    'Bucket': s3BucketName,
                                    'Name': objectName,
                                }
           })
    return response["JobId"]

def CheckJobComplete(jobId):
    time.sleep(5)
    client = boto3.client('textract',region_name='ap-south-1')
    response = client.get_document_text_detection(JobId=jobId)
    status = response["JobStatus"]
    print("Job status: {}".format(status))
    while(status == "IN_PROGRESS"):
        time.sleep(5)
        response = client.get_document_text_detection(JobId=jobId)
        status = response["JobStatus"]
        print("Job status: {}".format(status))
    return status

def JobResults(jobId):
    pages = []
    client = boto3.client('textract',region_name='ap-south-1')
    response = client.get_document_text_detection(JobId=jobId)
 
    pages.append(response)
    print("Resultset page recieved: {}".format(len(pages)))
    nextToken = None
    if('NextToken' in response):
        nextToken = response['NextToken']
        while(nextToken):
            response = client.get_document_text_detection(JobId=jobId, NextToken=nextToken)
            pages.append(response)
            print("Resultset page recieved: {}".format(len(pages)))
            nextToken = None
            if('NextToken' in response):
                nextToken = response['NextToken']
    return pages

In [8]:
def document_text_extractor(name:str)->list:
    ''' 
    This function takes the name of the document stored in AWS S3 and returns the actual text, the extracted text in a list.
    '''

    extracted_text_cleaned = []
    extracted_text = []

    # S3 Document Data
    s3BucketName = "fintech-sentiment-textract"
    documentName = name

    # Function invokes
    jobId = InvokeTextDetectJob(s3BucketName, documentName)
    print(f"Started job with id: {jobId} , name : {name}")

    if(CheckJobComplete(jobId)):
        response = JobResults(jobId)
        text_out = ''
        for resultPage in response:
            for item in resultPage["Blocks"]:
                if item["BlockType"] == "LINE":
                    text_out = text_out + item["Text"]
                    text_out = text_out.replace("\'s","")

    print(f"For {name}, Text Extracted")

    extracted_text.append(text_out)

    print(f'Text lemmatization and cleaning started.')

    text_out = re.sub('[^a-zA-Z]', ' ', text_out)
    text_out = text_out.lower()
    text_out = ' '.join(text_out.split())

    lemma = WordNetLemmatizer()
    new_corpus = []
    for i in text_out.split():
        words = nltk.word_tokenize(i)
        for word in words:
            if word not in set(stopwords.words('english')):
                new_corpus.append(lemma.lemmatize(word))

    text_out = ' '.join(new_corpus)
    print(f'Final text received')
    extracted_text_cleaned.append(text_out)

    return extracted_text, extracted_text_cleaned


In [9]:
extracted_text, extracted_text_cleaned = document_text_extractor('RL-1.pdf')

Started job with id: 59d790763227539a43b1eb31e6a4867b2bb1d17fb097ca60ad4fb7f2cb034d41 , name : RL-1.pdf
Job status: SUCCEEDED
Resultset page recieved: 1
For RL-1.pdf, Text Extracted
Text lemmatization and cleaning started.
Final text received


In [10]:
extracted_text

['Multi-Objective reward generalization: Improving performanceof Deep Reinforcement Learning for selected applications instock and cryptocurrency tradingFederico Cornalba*Constantin Disselkamp**SInstitute of Science and Technology AustriaDavide Scassola**federico.cornalba@ist.ac.atChristopher HelfTRALITY GmbHconstantin@trality.comdavide@trality.comchristopher@trality.comABSTRACTthe state variables and action (s, determine the next state of theWe investigate the potential of Multi-Objective, Deep Reinforce-environment that the algorithm will visit. The algorithm ultimatement Learning for stock and cryptocurrency trading. More specifi-goal is to maximise a cumulative reward (accounting for all actionscally, we build on the generalized setting à la Fontaine and Friedmantaken in a given episode), which is in turn based on a pre-specified[6] (where the reward weighting mechanism is not specified a pri-state-action reward function r (assigning a numerical reward r(s, a)ori, but embedded in t

In [11]:
extracted_text_cleaned

['multi objective reward generalization improving performanceof deep reinforcement learning selected application instock cryptocurrency tradingfederico cornalba constantin disselkamp sinstitute science technology austriadavide scassola federico cornalba ist ac atchristopher helftrality gmbhconstantin trality comdavide trality comchristopher trality comabstractthe state variable action determine next state thewe investigate potential multi objective deep reinforce environment algorithm visit algorithm ultimatement learning stock cryptocurrency trading specifi goal maximise cumulative reward accounting actionscally build generalized setting la fontaine friedmantaken given episode turn based pre specified reward weighting mechanism specified pri state action reward function r assigning numerical reward r ori embedded learning process complementing itto every pair given state action undertaken algo computational speed ups adding cumulative rewardrithm us several episode accounting explorat