## Chapter 9 -- Working with Unstructured Data:

In [50]:
import boto3
import pandas as pd

In [1]:
AWS_ACCESS_KEY_ID = "add-access-key-here"
AWS_SECRET_ACCESS_KEY = "add-secret-access-key-here"
AWS_REGION = "us-east-2"

#### Upload to S3

In [9]:
def upload_file(file_name, bucket, object_name=None):
    """Upload a file to an S3 bucket

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then file_name is used
    :return: True if file was uploaded, else False
    """

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = os.path.basename(file_name)

    # Upload the file
    s3_client = boto3.client('s3')
    try:
        response = s3_client.upload_file(file_name, bucket, object_name)
    except ClientError as e:
        logging.error(e)
        return False
    return True

In [17]:
s3_client = boto3.client('s3', aws_access_key_id=AWS_ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY, region_name=AWS_REGION)

In [18]:
with open("Monoclonal Production Article.pdf", "rb") as f:
    s3_client.upload_fileobj(f, "biotech-machine-learning", "pdfs/Monoclonal Production Article.pdf")

#### Textract:

In [19]:
textract_client = boto3.client('textract', aws_access_key_id=AWS_ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY, region_name=AWS_REGION)

In [21]:
response = textract_client.start_document_text_detection(
                   DocumentLocation={'S3Object': {'Bucket': "biotech-machine-learning", 'Name': "pdfs/Monoclonal Production Article.pdf"} })

In [23]:
response["JobId"]

'2565768351e2c0d0cb7de81f19053889d1ae45c1d611ae2684b8744be75ad660'

In [27]:
results = textract_client.get_document_text_detection(JobId=response["JobId"])

In [28]:
for i in results:
    print(i)

DocumentMetadata
JobStatus
NextToken
Blocks
DetectDocumentTextModelVersion
ResponseMetadata


In [33]:
results["DocumentMetadata"]

{'Pages': 6}

In [35]:
results["Blocks"][0]

{'BlockType': 'PAGE',
 'Geometry': {'BoundingBox': {'Width': 1.0,
   'Height': 1.0,
   'Left': 0.0,
   'Top': 0.0},
  'Polygon': [{'X': 1.5849614334573464e-16, 'Y': 0.0},
   {'X': 1.0, 'Y': 9.462437987838284e-17},
   {'X': 1.0, 'Y': 1.0},
   {'X': 0.0, 'Y': 1.0}]},
 'Id': '172e7b67-8c6b-4e37-b2af-d5f79c19534f',
 'Relationships': [{'Type': 'CHILD',
   'Ids': ['e086999e-d1de-46b0-96ee-e575c1b44f47',
    '36687f55-0b02-4249-adc2-fec24e49f199',
    '1e36b342-ca48-4262-a19b-c87029f48a57',
    '624b9509-403c-4d72-8e3d-0e6dbf3b8bae',
    '37075da2-2d1d-4642-b3ea-c132795c84b5',
    'e6a89790-bc3f-41a3-958f-59ccb6a325c3',
    'db5353d3-0f3a-4b06-92ac-11f4e485934e',
    'c89dd024-94ca-4df9-873f-b191d98bf566',
    '63ff8019-fc1c-4924-afb0-d41474d0fd5d',
    'ecb3f655-785a-4612-8bfb-2429225fa745',
    '6031d96e-3312-4987-9c36-fc25054e2d65',
    'a68c0549-ad3d-4e74-a9ef-1cc010ce5b5d',
    'a8aba533-1da4-440b-ac2b-0227ccf97902',
    'c2810da2-2522-4ca6-bc2b-5670cb3c6f76',
    '79d5f700-e27d-4049-a59

In [31]:
documentText = ""

for item in results["Blocks"]:
    if item["BlockType"] == "LINE":
        documentText = documentText + item["Text"]

In [32]:
documentText

'Brief Definitive ReportsMONOCLONAL PRODUCTION OF BOTH IgM AND IgG1ANTIHAPTEN ANTIBODY*By JOAN L. PRESS AXD NORMAN R. KLINMAN;(From the Department of Pathology, University of Pennsylonia Medical School,Philadelphia, 19 174)(Received for publication 1 May 1973)Investigations on the clonal nature of antibody formation have demonstrated theexpression of allelie exclusion by lymphoid cells (1), as well as the production of amonospecific, restricted population of antibody molecules by the clonal progenyof a single antibody-forming precursor cell (B cell) (2, 3). Recent studies suggest,however, that a single clone may synthesize antibody molecules of the same speci-ficity which differ in heavy chain class. Thus, idiotypic determinants, which are con-sidered to be a function of the antibody combining site and therefore a variable re-gion marker (4), have been shown to be shared among the IgM and IgG anti-Sal-monella antibodies produced by individual rabbits (5). Furthermore, IgG and IgMmyelom

### AWS Comprehend

In [37]:
comprehend_client = boto3.client('comprehend', aws_access_key_id=AWS_ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY, region_name=AWS_REGION)

In [46]:
response = comprehend_client.detect_entities(
    Text=documentText[:5000],
    LanguageCode='en',
)

In [1]:
response["Entities"]

NameError: name 'response' is not defined

In [53]:
pd.DataFrame(response["Entities"]).sort_values(by='Score', ascending=False).head()

Unnamed: 0,Score,Type,Text,BeginOffset,EndOffset
8,0.997775,DATE,1 May 1973,248,258
17,0.991282,ORGANIZATION,National Institute of Allergy and Infectious,2798,2842
15,0.981901,ORGANIZATION,U.S.Public Health Service,2678,2703
19,0.966133,PERSON,N. R. Klinman,2870,2883
2,0.957784,PERSON,NORMAN R. KLINMAN,107,124


In [55]:
response = comprehend_client.detect_key_phrases(
    Text=documentText[:5000],
    LanguageCode='en',
)

In [57]:
response["KeyPhrases"][0]

{'Score': 0.94112229347229,
 'Text': 'Brief Definitive ReportsMONOCLONAL PRODUCTION',
 'BeginOffset': 0,
 'EndOffset': 45}

In [58]:
response = comprehend_client.detect_sentiment(
    Text=documentText[:5000],
    LanguageCode='en',
)

In [61]:
response

{'Sentiment': 'NEUTRAL',
 'SentimentScore': {'Positive': 0.0010422870982438326,
  'Negative': 0.00028130708960816264,
  'Neutral': 0.9986714124679565,
  'Mixed': 4.950157290295465e-06},
 'ResponseMetadata': {'RequestId': 'ede1155b-dbf4-41f5-a9c8-33d498b61c04',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'ede1155b-dbf4-41f5-a9c8-33d498b61c04',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '166',
   'date': 'Tue, 24 Aug 2021 01:54:35 GMT'},
  'RetryAttempts': 0}}

In [63]:
response = comprehend_client.detect_dominant_language(
    Text=documentText[:5000],
)

In [64]:
response

{'Languages': [{'LanguageCode': 'en', 'Score': 0.9832875728607178}],
 'ResponseMetadata': {'RequestId': 'f8f874a8-7631-4e4e-88d6-345348f716fd',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'f8f874a8-7631-4e4e-88d6-345348f716fd',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '64',
   'date': 'Tue, 24 Aug 2021 01:55:23 GMT'},
  'RetryAttempts': 0}}