# Hospital Patient Claims
## Raw to Staging
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


In [None]:
%idle_timeout 10
%glue_version 5.0
%worker_type G.1X
%number_of_workers 2

import sys
import json 
import boto3
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import pyspark.sql.functions as F
from pyspark.sql.types import * 
from awsglue.dynamicframe import DynamicFrame
import random 
from datetime import datetime
from pyspark.sql import SparkSession

# Bootstrap the Glue session: a (possibly reused) SparkContext wrapped by a GlueContext.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
# The Job handle and the Spark session are both derived from the same GlueContext.
job = Job(glueContext)
spark = glueContext.spark_session

In [None]:
def get_secret() -> str:
    """Fetch the Redshift connection secret from AWS Secrets Manager.

    Returns:
        The raw ``SecretString`` payload of the secret, unparsed.

    Raises:
        botocore.exceptions.ClientError: propagated unchanged from
            ``get_secret_value`` when the lookup fails.
    """
    secret_name = "dev/hospital_patients_claims/redshift_connection"
    region_name = "us-east-1"
    client = boto3.client(
        service_name='secretsmanager',
        region_name=region_name
    )
    # No try/except here: the previous handler caught ``ClientError``, a name
    # that is not imported anywhere in the visible notebook, so any failure
    # surfaced as a NameError instead of the real AWS error. The handler only
    # re-raised, so letting the exception propagate keeps the intended
    # behaviour and gives callers the original ClientError.
    get_secret_value_response = client.get_secret_value(SecretId=secret_name)
    return get_secret_value_response['SecretString']

# Parse the secret payload and map its dev_* entries onto the option names
# used for the Redshift connection.
db_config = json.loads(get_secret())
my_conn_options = dict(
    url=db_config['dev_url'],
    user=db_config['dev_username'],
    password=db_config['dev_password'],
    redshiftTmpDir=db_config['dev_redshift_temp_directory'],
)

In [None]:
# Run date as a YYYY-MM-DD string (local clock of the session).
todays_date = datetime.today().date().isoformat()

In [None]:
# Raw-zone location of the load_date partition for the run date.
s3_path = (
    's3://hospital-patients-claims-bucket/raw_zone/raw_hospital_patients_claims/'
    f'load_date={todays_date}'
)

In [None]:
# Read the raw Parquet data under s3_path and cache the resulting DataFrame.
raw_read_options = {"paths": [s3_path]}
df = glueContext.create_data_frame.from_options(
    connection_type='s3',
    connection_options=raw_read_options,
    format='parquet',
).cache()

In [None]:
# Cast every listed column to string and trim surrounding whitespace.
# Each column is rebuilt from itself. The earlier version derived
# 'borough_level' from 'borough', which overwrote it with the borough name.
# NOTE(review): assumes 'borough_level' exists in the raw data like the
# neighbouring borough_* columns - confirm against the source schema.
string_columns = [
    'partition_key',
    'patient_id',
    'name_prefix',
    'first_name',
    'last_name',
    'date_of_birth',
    'phone_number',
    'email_id',
    'policy_id',
    'policy_start_date',
    'policy_end_date',
    'preimum_amount',
    'coverage_limit',
    'address_id',
    'addressline',
    'borough',
    'borough_level',
    'borough_latitude',
    'borough_longitude',
    'city',
    'state',
    'claim_id',
    'claim_initialized_date',
    'claim_request_amount',
    'claim_rejected_reason',
    'source_file_path',
    'source_load_path',
    'load_timestamp',
]
# One withColumns call replaces all listed columns in place; columns that are
# not listed are left untouched.
df = df.withColumns({
    column_name: F.trim(F.col(column_name).cast(StringType()))
    for column_name in string_columns
})

In [None]:
# Remove exact duplicate rows, then keep only rows where every required
# identifier column is populated (non-null).
required_id_columns = ('patient_id', 'policy_id', 'address_id', 'claim_id')
df = df.dropDuplicates()
for id_column in required_id_columns:
    df = df.filter(F.col(id_column).isNotNull())


In [None]:
# Remove the '$' currency symbol from each amount column so that only the
# numeric text remains.
for amount_column in ('claim_request_amount', 'preimum_amount', 'coverage_limit'):
    df = df.withColumn(amount_column, F.regexp_replace(F.col(amount_column), r'\$', ''))


In [None]:
# Keep only rows whose three amount columns are strictly greater than 0.
# The amounts are cast to double explicitly for the comparison: comparing a
# string column with the integer literal 0 relies on implicit coercion, which
# can truncate fractional amounts (e.g. '0.50') to 0 and drop valid rows.
# Only the filter predicate is cast; the stored column values are unchanged.
for amount_column in ('claim_request_amount', 'preimum_amount', 'coverage_limit'):
    df = df.where(F.col(amount_column).cast(DoubleType()) > 0)

In [None]:
# Flag a claim as high risk ('Y') when the requested amount is more than 80%
# of the coverage limit; every other row gets 'N'.
claim_to_coverage_pct = (F.col('claim_request_amount') / F.col('coverage_limit')) * 100
df = df.withColumn(
    'high_risk_claim_flag',
    F.when(claim_to_coverage_pct > 80, 'Y').otherwise('N'),
)


In [None]:
# Drop rejected claims that carry no rejection reason.
is_rejected = F.col('claim_status') == 'Rejected'
reason_missing = F.col('claim_rejected_reason').isNull()
df = df.filter(~(is_rejected & reason_missing))

In [None]:
# Enrich the claims with borough reference data read from a CSV lookup file.
df_borough_details = spark.read.csv(
    '../lookup_files/lookup_data_london_borough_details.csv',
    header=True,
)
# Left join on the borough name keeps every claim row; the lookup side is
# marked for broadcast.
borough_match = df.borough == df_borough_details.borough
combined_df = df.join(F.broadcast(df_borough_details), on=borough_match, how='left')

# Keep all claim columns and add only the borough abbreviation and code.
enriched_columns = [df['*'], 'borough_abbrev', 'borough_code']
df = combined_df.select(*enriched_columns)


In [None]:
# Convert the string date columns to a date type, each with its own pattern.
# NOTE(review): the patterns differ per column (dd-MM-yyyy, dd/MM/yyyy,
# yyyy/MM/dd) - confirm they match the formats in the raw data.
df = df.withColumn('date_of_birth', F.to_date(df.date_of_birth, 'dd-MM-yyyy'))
df = df.withColumn('policy_start_date', F.to_date(df.policy_start_date, 'dd/MM/yyyy'))
df = df.withColumn('policy_end_date', F.to_date(df.policy_end_date, 'yyyy/MM/dd'))
# Parse claim_initialized_date from its own column. It was previously parsed
# from policy_end_date, which discarded the real claim date.
df = df.withColumn('claim_initialized_date', F.to_date(df.claim_initialized_date, 'dd/MM/yyyy'))


# Keep only policies whose end date is after the start date.
df = df.where(df.policy_end_date > df.policy_start_date)


In [None]:
# Extract the numeric part of the borough latitude and longitude.
# The dot is escaped so it matches only a literal decimal point (an unescaped
# '.' matches any character), and an optional leading '-' is kept so that
# negative coordinates do not lose their sign.
coordinate_pattern = r'-?[0-9]{1,3}\.[0-9]{1,4}'
df = df.withColumn('borough_latitude', F.regexp_extract('borough_latitude', coordinate_pattern, 0))
df = df.withColumn('borough_longitude', F.regexp_extract('borough_longitude', coordinate_pattern, 0))


In [None]:
# Keep only ASCII letters in the name columns; every other character is
# replaced with an empty string.
non_letter_pattern = '[^a-zA-Z]'
df = df.withColumn('first_name', F.regexp_replace('first_name', non_letter_pattern, ''))
df = df.withColumn('last_name', F.regexp_replace('last_name', non_letter_pattern, ''))

In [None]:
# Apply initial capitals to both name fields.
for name_field in ('first_name', 'last_name'):
    df = df.withColumn(name_field, F.initcap(name_field))

In [None]:
# Build the patient's full name: prefix, first name and last name joined by
# a single space.
full_name_parts = [F.col('name_prefix'), F.col('first_name'), F.col('last_name')]
df = df.withColumn('patient_full_name', F.concat_ws(' ', *full_name_parts))

In [None]:
# Keep only rows with a well-formed email address; rows that do not match
# (possibly test records) or have a null email are dropped.
# The pattern accepts upper-case letters, digits and hyphens in the domain,
# and multi-label domains such as 'example.co.uk', all of which the earlier
# lower-case, single-dot pattern rejected.
email_pattern = r'^[A-Za-z0-9_.+-]+@[A-Za-z0-9-]+(\.[A-Za-z0-9-]+)+$'
# regexp_like already yields a boolean column, so no '== True' comparison is needed.
df = df.filter(F.regexp_like(F.col('email_id'), F.lit(email_pattern)))

In [None]:
# Keep only rows whose phone number is '+44' followed by exactly ten digits;
# anything else is treated as invalid (possibly a test record).
uk_phone_pattern = F.lit(r"^[\+]44[0-9]{10}$")
is_valid_phone = F.regexp_like('phone_number', uk_phone_pattern)
df = df.filter(is_valid_phone)


In [None]:
# Stamp every row with the run date in a load_date column.
load_date_value = F.lit(todays_date)
df = df.withColumn('load_date', load_date_value)

In [None]:
# Write the processed claims to the Redshift staging table.
# The target table name is set on the shared connection options before the write.
my_conn_options['dbtable'] = "staging_claims_incremental_load"
dyf = DynamicFrame.fromDF(df, glueContext, 'redshift_write_staging_claims')
glueContext.write_dynamic_frame.from_options(
    frame=dyf,
    connection_type="redshift",
    connection_options=my_conn_options,
)