In [1]:
import datetime
import itertools
import random

import names
import numpy as np
import pandas as pd
import sqlalchemy as sa
from sqlalchemy import create_engine


# NOTE(review): the database URL is hard-coded to a local development database;
# consider reading it from an environment variable before sharing this notebook.
# The dialect is spelled "postgresql" because SQLAlchemy 1.4+ no longer accepts
# the legacy "postgres://" alias.
connection_string = 'postgresql://localhost:5432/VincentLa'
engine = create_engine(connection_string)

SCHEMA_NAME = 'tutorial_data_ingest'
# Run the DDL on an explicit connection inside a transaction that commits on
# exit. `Engine.execute()` with a raw string was removed in SQLAlchemy 2.0,
# whereas this form works on old and new releases alike.
with engine.begin() as connection:
    connection.execute(sa.text('CREATE SCHEMA IF NOT EXISTS ' + SCHEMA_NAME))

<sqlalchemy.engine.result.ResultProxy at 0x10e0ca2b0>

# Generate Data for Anomaly Detection Example

This section generates the data for a case study that we will present. In particular, we will generate some claims data to look at. Next, we need to alter how we generate random procedure codes so that some doctors really are more likely to upcode.

In [2]:
def draw_random_int(mean=100, std=60, minimum=1):
    """Draw a random integer from a normal distribution, floored at ``minimum``.

    Parameters
    ----------
    mean : float, optional
        Mean of the normal distribution. Defaults to 100.
    std : float, optional
        Standard deviation of the normal distribution. Defaults to 60.
    minimum : int, optional
        Smallest value that may be returned. Draws that fall below it are
        replaced by it. Defaults to 1.

    Returns
    -------
    int
        The draw rounded to the nearest integer, or ``minimum`` when the draw
        is smaller than ``minimum``.
    """
    number = np.random.normal(mean, std)
    # The left tail of the distribution can be below the floor (even negative),
    # so clamp it instead of returning an invalid count.
    if number < minimum:
        return minimum
    return int(round(number))

In [6]:
num_doctors = 1500

# Seed both generators so the synthetic data set is the same on every
# Restart & Run All: `random` drives the person ids, `np.random` drives the
# encounter counts and the procedure codes.
# NOTE(review): `names` presumably draws from the stdlib `random` module too,
# which would make the doctor names reproducible as well -- confirm.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Probability of billing procedure codes 1..5 (in that order) for each kind of coder.
upcoders = [0.05, 0.1, 0.3, 0.3, 0.25]
typicalcoders = [0.025, 0.075, 0.4, 0.4, 0.1]
highupcoders = [0.075, 0.075, 0.075, 0.075, 0.7]

# One generated name per doctor.
drs = ['Dr. ' + names.get_full_name() for _ in range(num_doctors)]
# Number of encounters for each doctor, in the same order as `drs`.
num_of_encounters = [draw_random_int() for _ in range(num_doctors)]
# One randomly drawn personid for every encounter across all doctors.
personid = [random.randint(1, 25000) for _ in range(sum(num_of_encounters))]

# Human-readable description of each procedure code.
procedure_map = {
    1: 'E/M, Lowest Intensity',
    2: 'E/M, Second Lowest Intensity',
    3: 'E/M, Medium Intensity',
    4: 'E/M, High Intensity',
    5: 'E/M, Highest Intensity',
}

# Amount paid for each procedure code.
paid_map = {
    1: 30,
    2: 70,
    3: 100,
    4: 150,
    5: 190,
}

# Repeat every doctor's name once per encounter so the list lines up
# element-for-element with `personid`.
doctors = list(itertools.chain.from_iterable(
    [name] * count for name, count in zip(drs, num_of_encounters)
))
assert len(doctors) == len(personid), 'expected one doctor entry per encounter'

# NOTE: the 'servicing_provider_npi' column holds the generated doctor name,
# not a numeric NPI.
d = {
    'servicing_provider_npi': doctors,
    'personid': personid,
}
df = pd.DataFrame(d)

# Position of every doctor name in `drs`. For a name that occurs more than once
# keep the first position, which is exactly what `drs.index(name)` returns.
# Building the lookup once avoids a linear scan of `drs` for every claim line.
doctor_position = {}
for position, name in enumerate(drs):
    doctor_position.setdefault(name, position)

# Bucket (0-99) of the doctor behind each claim line; the bucket decides which
# coding profile that doctor follows.
bucket = df['servicing_provider_npi'].map(doctor_position).values % 100

codes = np.arange(1, 6)
coder_groups = [
    (bucket < 20, upcoders),        # positions 0-19 of every 100: upcoders
    (bucket == 20, highupcoders),   # position 20 of every 100: heavy upcoders
    (bucket > 20, typicalcoders),   # everyone else: typical coders
]

# Sample all the codes of one group in a single vectorized call instead of one
# `np.random.choice` call per row.
procedure_codes = np.empty(len(df), dtype=codes.dtype)
for mask, probabilities in coder_groups:
    procedure_codes[mask] = np.random.choice(codes, size=mask.sum(), p=probabilities)

# Attach the sampled code to every claim line, then derive its description and
# paid amount from the two lookup tables.
df['procedure_code'] = procedure_codes
for column, lookup in (('procedure_name', procedure_map), ('paid_amount', paid_map)):
    df[column] = df['procedure_code'].map(lookup)

In [7]:
# Preview the first five generated claim lines.
df.head(n=5)

Unnamed: 0,personid,servicing_provider_npi,procedure_code,procedure_name,paid_amount
0,14443,Dr. Robert Banks,5,"E/M, Highest Intensity",190
1,10983,Dr. Michael Ambrose,3,"E/M, Medium Intensity",100
2,10361,Dr. Michael Ambrose,1,"E/M, Lowest Intensity",30
3,8139,Dr. Michael Ambrose,5,"E/M, Highest Intensity",190
4,23252,Dr. Michael Ambrose,5,"E/M, Highest Intensity",190


In [5]:
# Write the claim lines to <SCHEMA_NAME>.claim_lines, replacing any existing
# table of that name; the DataFrame index is not stored.
df.to_sql(
    name='claim_lines',
    con=engine,
    schema=SCHEMA_NAME,
    if_exists='replace',
    index=False,
)