In [None]:
# S3 bucket that every object in this notebook is read from.
DATA_SOURCE_BUCKET_NAME = 'beis-orp-dev-datalake'
# Only object keys starting with this prefix are listed for processing.
DATA_SOURCE_PREFIX = 'temp'
# Object key of the legislation CSV read by the legislative-origin step.
LEGISLATION_DATA_PATH = 'legislative-origin/legislation_data_2023_03_12.csv'
# Object key of the JSONL rulebook whose lines are loaded as entity-ruler patterns.
DTI_RULEBOOK = 'dti/doc_type_rules_v.2.jsonl'

In [None]:
import boto3
# NOTE: this binds `s3` to the high-level *resource* API, used here only to list keys.
# Later cells rebind `s3` to a low-level client before any download helper runs.
s3 = boto3.resource('s3')
# Let S3 apply the prefix filter server-side instead of paging through every object
# in the bucket and discarding the non-matching keys locally.
flist = [
    obj.key
    for obj in s3.Bucket(DATA_SOURCE_BUCKET_NAME).objects.filter(Prefix=DATA_SOURCE_PREFIX)
]

In [None]:
import pandas as pd
import spacy
import json
from pdf_to_text.pdf_to_text import pdf_converter
from odf_to_text.odf_to_text import odf_converter
from docx_to_text.docx_to_text import docx_converter
from html_to_text.html_to_text import html_converter
from date_generation.date_generation import date_generation
from legislative_origin.lo_extraction import lo_extraction
from title_generation.title_generation import title_generator
from document_type_identification.rule_based_dti import dti
from keyword_extraction.keyword_extraction import  keyword_extraction

import io
# Dispatch table: document format (file extension) -> converter callable.
# The caller unpacks each converter's result as (text, title, date_published).
doc_format_map = {
    'pdf': pdf_converter,
    'odf': odf_converter,
    # 'odt' is accepted by the extension filter applied later in this notebook, so it
    # needs an entry here as well; without one the lookup raises KeyError.
    # NOTE(review): assumes odf_converter handles OpenDocument Text files - confirm.
    'odt': odf_converter,
    'docx': docx_converter,
    'doc': docx_converter,
    'html': html_converter
}
def download_text(s3_client, object_key, source_bucket):
    '''
    Fetch one object from S3 and hand it back as an in-memory binary stream.

    Args:
        s3_client: object exposing ``get_object(Bucket=..., Key=...)``
        object_key: key of the object to fetch
        source_bucket: name of the bucket that holds the object

    Returns:
        io.BytesIO wrapping the complete body of the object
    '''
    response = s3_client.get_object(Bucket=source_bucket, Key=object_key)
    payload = response['Body'].read()
    return io.BytesIO(payload)


# Holds the filtered legislation table after the first download so that repeated
# calls to lo_det() do not fetch and parse the same CSV from S3 again.
_LEGISLATION_CACHE = {}


def _load_legislation_titles():
    '''Return the Primary/Secondary legislation rows, downloading the CSV only once.'''
    if 'titles' not in _LEGISLATION_CACHE:
        # Relies on the notebook-level `s3` being a client that exposes get_object.
        raw_titles = pd.read_csv(download_text(s3, LEGISLATION_DATA_PATH, DATA_SOURCE_BUCKET_NAME))
        _LEGISLATION_CACHE['titles'] = raw_titles[raw_titles.legType.isin(['Primary', 'Secondary'])]
    return _LEGISLATION_CACHE['titles']


def lo_det(text):
    '''
    Run legislative-origin extraction for a document's text.

    The legislation CSV is read from S3 on the first call and reused afterwards;
    only rows whose legType is 'Primary' or 'Secondary' are kept.

    Args:
        text: document text to search

    Returns:
        whatever lo_extraction returns for (text, legislation table)
    '''
    # Pass a copy so the cached table cannot be altered by the extractor.
    return lo_extraction(text, _load_legislation_titles().copy())

def extract_data_stage1(s3, uri, doc_format):
    '''
    Convert one source document into raw text plus basic metadata.

    For 'html' the uri itself is handed to the converter; every other format is
    first downloaded from the data-source bucket as a byte stream.

    Args:
        s3: client passed through to download_text
        uri: object key, or the link itself for 'html'
        doc_format: key into doc_format_map selecting the converter

    Returns:
        tuple of (text, title, date_published) as produced by the converter
    '''
    if doc_format == 'html':
        source = uri
    else:
        source = download_text(s3, uri, DATA_SOURCE_BUCKET_NAME)
    converter = doc_format_map[doc_format]
    text, title, date_published = converter(source)
    return text, title, date_published

def extract_data_stage2(text, title, dp, nlp):
    '''
    Derive the enriched metadata for a document from its text.

    Args:
        text: document text
        title: title passed to the title generator and the keyword extractor
        dp: date passed to the date generator
        nlp: pipeline object handed to the document-type identifier

    Returns:
        tuple of (title, date, legislative origins, keywords, document type,
        summary); summary is always None for now
    '''
    generated_title = title_generator(text, title)
    generated_date = date_generation(text, dp)
    legislative_origins = lo_det(text)
    # Keywords are computed against the incoming title, not the generated one.
    doc_keywords = keyword_extraction(text, title)
    doc_type = dti(text, generated_title, nlp)
    # TODO: summarisation is not implemented yet.
    doc_summary = None

    return generated_title, generated_date, legislative_origins, doc_keywords, doc_type, doc_summary
    

In [None]:

# One row per listed S3 key.
df = pd.DataFrame(flist, columns=['uri'])
# Document formats that are kept for processing.
ext_type = ('pdf', 'doc', 'docx', 'odt', 'odf', 'html')
# Keys ending in '/' are tagged 'dir'; otherwise the format is the text after the
# last '.', lower-cased so that e.g. '.PDF' is not dropped by the format filter.
df['document_format'] = df.uri.apply(
    lambda key: 'dir' if key.endswith('/') else key.rsplit('.', 1)[-1].lower()
)


In [None]:
# Low-level client: download_text needs get_object.
s3 = boto3.client('s3')
# Every spreadsheet found in the listing is read and its rows become 'html' documents.
links = df[df.document_format == 'xlsx'].uri
# Read all spreadsheets first and concatenate once, rather than growing a frame
# inside the loop (which copies all accumulated rows on every iteration).
link_frames = [pd.read_excel(download_text(s3, lk, DATA_SOURCE_BUCKET_NAME)) for lk in links]
if link_frames:
    dff = pd.concat(link_frames)
    dff.columns = ['regulatory_topic', 'uri']
    dff['document_format'] = 'html'
else:
    # No spreadsheets (e.g. when this cell is re-run after the xlsx rows were
    # filtered out): renaming the columns of an empty frame would raise, so build
    # an empty frame that already has the expected columns.
    dff = pd.DataFrame(columns=['regulatory_topic', 'uri', 'document_format'])
df = pd.concat([df, dff])
# Keep only the supported formats; this also drops 'dir' and 'xlsx' rows.
df = df[df.document_format.isin(ext_type)].reset_index(drop=True)


In [None]:
# Fetch the rulebook object and decode its bytes as UTF-8 text.
rule_json = (
    s3.get_object(Bucket=DATA_SOURCE_BUCKET_NAME, Key=DTI_RULEBOOK)['Body']
    .read()
    .decode('utf-8')
)
# JSONL: every non-blank line is parsed as one pattern.
dti_patterns = [
    json.loads(line)
    for line in rule_json.split('\n')
    if line.strip()
]
# Load the model without its 'ner' / 'entity_ruler' components, then add an entity
# ruler whose phrase patterns are matched on the LOWER token attribute.
nlp = spacy.load("en_core_web_sm", exclude=['entity_ruler',  'ner'])
nlp.add_pipe(
    "entity_ruler",
    config={'phrase_matcher_attr':'LOWER'},
).add_patterns(dti_patterns)

In [None]:
# Client used by the extraction calls in the following cells.
s3 = boto3.client('s3')
# Draw one random row to spot-check the pipeline; the row is shown as the cell output.
x = df.sample(n=1).iloc[0]
x

In [None]:
# Stage 1: convert the sampled document into text, title and published date.
text, title, dp = extract_data_stage1(s3=s3, uri=x.uri, doc_format=x.document_format)

In [None]:
# Stage 2: derive the enriched metadata; the returned tuple is the cell output.
extract_data_stage2(text=text, title=title, dp=dp, nlp=nlp)