In [None]:
%stop_session

In [None]:
%idle_timeout 2880
%glue_version 3.0
%worker_type G.1X
%number_of_workers 2
%additional_python_modules  datasketch, kshingle, beautifulsoup4,htmldate,wordninja, torch,keybert, transformers, python_docx, docx,spacy, pikepdf, PyPDF2, openpyxl, PyMuPDF, s3://aws-glue-assets-455762151948-eu-west-2/notebooks/job_wheels/pdfminer.six-20221105-py3-none-any.whl, s3://aws-glue-assets-455762151948-eu-west-2/notebooks/job_wheels/word_forms-2.1.0-py3-none-any.whl, s3://aws-glue-assets-455762151948-eu-west-2/notebooks/job_wheels/en_core_web_sm-3.5.0-py3-none-any.whl, s3://aws-glue-assets-455762151948-eu-west-2/notebooks/job_wheels/en_core_web_lg-3.5.0-py3-none-any.whl
%extra_py_files s3://aws-glue-assets-455762151948-eu-west-2/notebooks/job_wheels/date_generation.zip, s3://aws-glue-assets-455762151948-eu-west-2/notebooks/job_wheels/text_hashing.zip,    s3://aws-glue-assets-455762151948-eu-west-2/notebooks/job_wheels/document_type_identification.zip,    s3://aws-glue-assets-455762151948-eu-west-2/notebooks/job_wheels/docx_to_text.zip,    s3://aws-glue-assets-455762151948-eu-west-2/notebooks/job_wheels/html_to_text.zip,    s3://aws-glue-assets-455762151948-eu-west-2/notebooks/job_wheels/keyword_extraction.zip,    s3://aws-glue-assets-455762151948-eu-west-2/notebooks/job_wheels/legislative_origin.zip,    s3://aws-glue-assets-455762151948-eu-west-2/notebooks/job_wheels/odf_to_text.zip,    s3://aws-glue-assets-455762151948-eu-west-2/notebooks/job_wheels/pdf_to_text.zip,    s3://aws-glue-assets-455762151948-eu-west-2/notebooks/job_wheels/title_generation.zip

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Reuse the SparkContext of the running session if there is one, otherwise create it.
sc = SparkContext.getOrCreate()
# Glue wrapper around the SparkContext; the Spark session and logger below come from it.
glueContext = GlueContext(sc)
spark = glueContext.spark_session
# NOTE(review): job is constructed but never initialised or committed in the visible cells — confirm it is needed.
job = Job(glueContext)
logger = glueContext.get_logger()

In [None]:
# Key prefix (inside DATA_SOURCE_BUCKET_NAME) under which the documents to process are listed.
DATA_SOURCE_PREFIX = 'temp'
# Bucket holding both the source documents and the document-type rulebook.
DATA_SOURCE_BUCKET_NAME = 'beis-orp-dev-datalake'
# Object key of the JSON Lines rulebook used for document-type identification.
DTI_RULEBOOK = 'dti/doc_type_rules_v.2.jsonl'
# S3 URI (not a bare bucket name) of the resources directory shipped to the Spark workers.
RESOURCES_BUCKET = 's3://aws-glue-assets-455762151948-eu-west-2/notebooks/resources/'
# S3 URI (not a bare bucket name) the final parquet output is written to.
PROCESSED_METADATA_BUCKET = f's3://{DATA_SOURCE_BUCKET_NAME}/glue_processed_metadata/'

import nltk
# Ship the whole resources directory to every worker; extract_data later resolves
# 'resources/nltk_data' through SparkFiles.get().
sc.addFile(RESOURCES_BUCKET, recursive=True)

import pandas as pd
import spacy
import json
from pdf_to_text.pdf_to_text import pdf_converter
from odf_to_text.odf_to_text import odf_converter
from docx_to_text.docx_to_text import docx_converter
from html_to_text.html_to_text import html_converter
from text_hashing.hashing import create_hash
from date_generation.date_generation import date_generation
from legislative_origin.lo_extraction import lo_extraction
from title_generation.title_generation import title_generator
from document_type_identification.rule_based_dti import dti
from keyword_extraction.keyword_extraction import  keyword_extraction
import boto3
import io
from pyspark.sql.types import StructType,StructField, StringType,ArrayType
from pyspark import SparkFiles
from datetime import datetime
import pyspark.sql.functions as F
from uuid import uuid4

# Driver-side S3 handles: the resource API is used to list the bucket, the client
# API to download individual objects.
s3_rsc = boto3.resource('s3')
s3_cli = boto3.client('s3')

# Maps a document_format value to the converter that extract_data calls with the
# downloaded bytes (or with the URI itself for 'html'). Each converter is unpacked
# there into (text, title, date_published).
doc_format_map = {
    'pdf': pdf_converter,
    'odf': odf_converter,
    # 'odt' is accepted by the ext_type filter applied when the bucket is listed, so it
    # needs a converter here as well; without this entry every .odt document raised
    # KeyError inside extract_data and ended up as a discarded null row.
    # NOTE(review): assumes odf_converter accepts OpenDocument Text files — confirm.
    'odt': odf_converter,
    'docx': docx_converter,
    # 'doc': docx_converter,
    'html': html_converter
}
# Spark schema of one extracted-metadata row. The field order must match the tuple
# returned by extract_data (and the null_ret placeholder): nine nullable string
# columns, then the legislative-origin structs, then the keyword list.
md_schema = StructType(
    [
        StructField(column, StringType(), True)
        for column in (
            "raw_uri",
            "uri",
            "title",
            "date_published",
            "document_uid",
            "regulator_id",
            "summary",
            "document_type",
            "hash_text",
        )
    ]
    + [
        StructField(
            "legislative_origins",
            ArrayType(
                StructType(
                    [
                        StructField(attribute, StringType(), True)
                        for attribute in ("title", "ref", "href", "number", "division", "type")
                    ]
                )
            ),
        ),
        StructField("keywords", ArrayType(StringType()), True),
    ]
)

# Placeholder row returned by extract_data when a document fails: None for the nine
# string columns of md_schema, empty lists for legislative_origins and keywords.
null_ret = [*([None] * 9), [], []]

In [None]:
# Download the document-type rulebook (one JSON object per line) and decode it.
rulebook_obj = s3_cli.get_object(Bucket=DATA_SOURCE_BUCKET_NAME, Key=DTI_RULEBOOK)
rule_json = rulebook_obj['Body'].read().decode('utf-8')

# Blank lines in the rulebook are skipped; every other line is parsed as one pattern.
dti_patterns = [
    json.loads(rule_line)
    for rule_line in rule_json.split('\n')
    if rule_line.strip()
]

# Load the small English model without its stock entity components, then add an
# entity ruler that matches the rulebook patterns on the lower-cased token text.
nlp = spacy.load("en_core_web_sm", exclude=['entity_ruler',  'ner'])
dti_ruler = nlp.add_pipe("entity_ruler", config={'phrase_matcher_attr':'LOWER'})
dti_ruler.add_patterns(dti_patterns)

def download_text(s3_client, object_key, source_bucket):
    '''Fetch one S3 object and return its full contents as an in-memory stream.

    Args:
        s3_client: object exposing get_object(Bucket=..., Key=...), e.g. a boto3 S3 client.
        object_key: key of the object to download.
        source_bucket: name of the bucket holding the object.

    Returns:
        io.BytesIO positioned at the start of the downloaded bytes.
    '''
    response = s3_client.get_object(Bucket=source_bucket, Key=object_key)
    payload = response['Body'].read()
    return io.BytesIO(payload)

def get_reg_id(uri, doc_format):
    '''Derive the regulator id for a document.

    For 'html' documents the id is 'hse' when the URI contains 'hse.gov.uk' and
    'ea' otherwise. For every other format it is the second '/'-separated
    segment of the URI, lower-cased.
    '''
    if doc_format != 'html':
        path_segments = uri.split('/')
        return path_segments[1].lower()
    if 'hse.gov.uk' in uri:
        return 'hse'
    return 'ea'

def extract_data(uri, doc_format, nlp):
    '''Convert one document to text and build its metadata row.

    Args:
        uri: S3 object key of the document, or the URI handed straight to the
            converter when doc_format is 'html'.
        doc_format: key into doc_format_map selecting the converter.
        nlp: spaCy pipeline passed to dti() for document-type identification.

    Returns:
        An 11-item tuple ordered like md_schema: raw uri, published uri, title,
        date published, document uid, regulator id, summary (always None),
        document type, text hash, legislative origins, keywords.
        If any step raises, the error is printed and the module-level null_ret
        placeholder row is returned instead.
    '''
    try:
        # Register the shipped NLTK data directory only once: this function runs for
        # every record, and appending unconditionally keeps growing the search path
        # for as long as the Python worker is reused.
        nltk_data_dir = SparkFiles.get('resources/nltk_data')
        if nltk_data_dir not in nltk.data.path:
            nltk.data.path.append(nltk_data_dir)

        if doc_format == 'html':
            # HTML rows carry a URI, not an S3 key: the converter receives it as-is
            # and no S3 client is needed.
            btext = uri
            nuri = uri
        else:
            btext = download_text(boto3.client('s3'), uri, DATA_SOURCE_BUCKET_NAME)
            # Published location: the file name alone, under the 'bulk/' prefix.
            nuri = f'bulk/{uri.split("/")[-1]}'

        text, title, date_published = doc_format_map[doc_format](btext)
        document_uid = uuid4().hex
        reg_id = get_reg_id(uri, doc_format)
        summary = None
        ntitle = title_generator(text, title)
        ndp = date_generation(text, date_published)
        los = lo_extraction(text)
        keywords = keyword_extraction(text, title)
        document_type = dti(text, ntitle, nlp)
        hash_text = create_hash(text)
        return uri, nuri, ntitle, ndp, document_uid, reg_id, summary, document_type, hash_text, los, keywords
    except Exception as e:
        # Best effort by design: one bad document must not fail the whole Spark job.
        print(f'ERROR: {uri} \t{doc_format}')
        print(f'ERR.BODY:\n{e}')
        return null_ret

    

In [None]:
# Import data from S3 into pyspark dataframe
flist=[obj.key for obj in s3_rsc.Bucket(DATA_SOURCE_BUCKET_NAME).objects.all() if obj.key.startswith(DATA_SOURCE_PREFIX)]

df= pd.DataFrame(flist, columns=['raw_uri'])
ext_type=('pdf','docx','odt','odf', 'html')
df['document_format'] =  df.raw_uri.apply(lambda x: 'dir' if x.endswith('/') else x.split('.')[-1])

links = df[df.document_format=='xlsx'].raw_uri
dff = pd.DataFrame()
for lk in links:
    dff=pd.concat([dff, pd.read_excel(download_text(s3_cli, lk, DATA_SOURCE_BUCKET_NAME))])
dff.columns=['regulatory_topic','raw_uri']
dff['document_format']='html'
df = pd.concat([df,dff.drop('regulatory_topic', axis=1)])
df = df[df.document_format.isin(ext_type)].reset_index(drop=True)

In [None]:
# Upper bound on the number of listed documents sent to Spark in this run.
# 5 reproduces the previous hard-coded df.head(); set to None to process every
# document in the listing.
MAX_DOCUMENTS = 5
docs_to_process = df if MAX_DOCUMENTS is None else df.head(MAX_DOCUMENTS)
DF = spark.createDataFrame(docs_to_process)

# Metadata that is identical for every document of the run. The timestamp is
# taken once on the driver, so all rows share the same date_uploaded value.
static_md = [
    ('date_uploaded', datetime.now().isoformat()),
    ('status', 'published'),
    ('user_id', 'bulk_uploader'),
    ('version', 1)
]
for k, v in static_md:
    DF = DF.withColumn(k, F.lit(v))

DF.show()


In [None]:
def _extract_row(row):
    '''Run extract_data on one row of DF, using the notebook-level nlp pipeline.'''
    return extract_data(row['raw_uri'], row['document_format'], nlp)


# One metadata row per document, typed with md_schema.
out = DF.rdd.map(_extract_row)
df2 = out.toDF(schema=md_schema)

# Attach the extracted metadata to the static columns. Failed documents come back
# as null_ret, whose raw_uri is None, so the inner join leaves them out.
dff = DF.join(df2, on='raw_uri', how='inner')
# dff.show()

In [None]:
# Persist the joined metadata as parquet, replacing whatever is already stored
# under PROCESSED_METADATA_BUCKET.
(
    dff.write
    .mode('overwrite')
    .parquet(PROCESSED_METADATA_BUCKET)
)