In [1]:
import datetime
import itertools
import random

import names
import numpy as np
import pandas as pd
import sqlalchemy as sa
from sqlalchemy import create_engine
from pandas.io.sql import SQLTable


def _execute_insert(self, conn, keys, data_iter):
    """Optional, but useful: helps Pandas write tables against Postgres much faster.
    See https://github.com/pydata/pandas/issues/8953 for more info
    """
    print("Using monkey-patched _execute_insert")
    data = [dict((k, v) for k, v in zip(keys, row)) for row in data_iter]
    conn.execute(self.insert_statement().values(data))

# Swap pandas' row insert routine for the multi-row version defined above.
SQLTable._execute_insert = _execute_insert


# Use the canonical 'postgresql://' scheme: the 'postgres://' alias is
# deprecated and newer SQLAlchemy releases refuse to load it.
connection_string = 'postgresql://localhost:5432/VincentLa'
engine = create_engine(connection_string)

SCHEMA_NAME = 'tutorial_data_ingest'
# Run the DDL on an explicit transactional connection (committed when the
# block exits) instead of Engine.execute(), which no longer exists in
# SQLAlchemy 2.0.
with engine.begin() as schema_conn:
    schema_conn.execute(sa.text('CREATE SCHEMA IF NOT EXISTS ' + SCHEMA_NAME))

<sqlalchemy.engine.result.ResultProxy at 0x108cd17f0>

# Generate Data for Anomaly Detection Example

This section generates the data for the case study that we will present. In particular, we will generate some claims data to look at. Next, we need to alter how the random procedure codes are generated so that some doctors really are more likely to upcode.

In [2]:
def draw_random_int():
    """Draw a positive integer from a two-component normal mixture.

    A uniform draw picks the component: when it is <= 0.1 the value comes from
    N(10, 5), otherwise from N(200, 60). The sample is rounded to the nearest
    integer, and anything below 1 is returned as 1.
    """
    mean, spread = (10, 5) if np.random.uniform() <= 0.1 else (200, 60)
    sample = np.random.normal(mean, spread)
    return 1 if sample < 1 else int(round(sample))
    

In [3]:
def draw_distribution_codes(spec, lvl1, lvl2, lvl3, lvl4, lvl5):
    """Return a randomly perturbed, renormalised copy of an E/M code distribution.

    Parameters
    ----------
    spec : str
        Provider specialty. 'Cardiologist', 'Infectious Disease' and
        'Other Specialty' add 0.2, 0.1 and 0.05 respectively to ``lvl5``
        before perturbation; any other value leaves it unchanged.
    lvl1, lvl2, lvl3, lvl4, lvl5 : float
        Base weights for the five E/M levels.

    Returns
    -------
    list of float
        Five weights that sum to 1. Each base weight ``p`` is replaced by a
        draw from N(p, p / 4) clipped to [0, 1], then the list is normalised.
        If every draw clips to 0 while the base weights are not all zero, the
        normalised base weights are returned instead.

    Raises
    ------
    ZeroDivisionError
        If the (specialty-adjusted) base weights are all zero.
    """
    if spec == 'Cardiologist':
        lvl5 += 0.2
    elif spec == 'Infectious Disease':
        lvl5 += 0.1
    elif spec == 'Other Specialty':
        lvl5 += 0.05

    base = [lvl1, lvl2, lvl3, lvl4, lvl5]

    # One draw per level, in level order, clipped into [0, 1].
    perturbed = [min(max(np.random.normal(p, p / 4), 0), 1) for p in base]

    # Compute the normaliser once instead of once per element.
    total = sum(perturbed)
    if total == 0 and sum(base) > 0:
        # Every draw was clipped to zero: fall back to the unperturbed weights
        # rather than dividing by zero.
        perturbed, total = base, sum(base)

    return [weight / total for weight in perturbed]

In [4]:
def draw_specialty():
    """Randomly pick a provider specialty.

    A single uniform draw is compared against cumulative cut-offs:
    <= 0.6 'Primary Care', <= 0.75 'Cardiologist', <= 0.85 'Infectious Disease',
    anything larger 'Other Specialty'.
    """
    cutoffs = (
        (0.6, 'Primary Care'),
        (0.75, 'Cardiologist'),
        (0.85, 'Infectious Disease'),
    )
    roll = np.random.uniform()
    for upper_bound, label in cutoffs:
        if roll <= upper_bound:
            return label
    return 'Other Specialty'

In [5]:
num_doctors = 1500

# Seed both random sources so a fresh "Restart & Run All" regenerates the same
# data. `names` presumably draws from the stdlib `random` module, in which case
# random.seed() also covers the generated doctor names — TODO confirm.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Base E/M level weights (lvl1 .. lvl5) for each coding behaviour. Each dict is
# unpacked as keyword arguments into draw_distribution_codes().
upcoders = {
    'lvl1': 0.025,
    'lvl2': 0.025,
    'lvl3': 0.20,
    'lvl4': 0.45,
    'lvl5': 0.30
}

typicalcoders = {
    'lvl1': 0.025,
    'lvl2': 0.075,
    'lvl3': 0.4,
    'lvl4': 0.4,
    'lvl5': 0.1
}
highupcoders = {
    'lvl1': 0.025,
    'lvl2': 0.025,
    'lvl3': 0.025,
    'lvl4': 0.025,
    'lvl5': 0.9
}

# Create list of doctor names
# Create number of encounters for each doctor
# Create personid for each encounter
drs = ['Dr. ' + names.get_full_name() for _ in range(num_doctors)]
specialties = [draw_specialty() for _ in range(num_doctors)]
num_of_encounters = [draw_random_int() for _ in range(num_doctors)]
personid = [random.randint(1, 25000) for _ in range(sum(num_of_encounters))]
procedure_map = {
    1: 'E/M, Lowest Intensity',
    2: 'E/M, Second Lowest Intensity',
    3: 'E/M, Medium Intensity',
    4: 'E/M, High Intensity',
    5: 'E/M, Highest Intensity',
}

paid_map = {
    1: 30,
    2: 50,
    3: 80,
    4: 110,
    5: 230,
}

# Position of the servicing doctor (index into `drs`) for every encounter row.
# Carrying the index instead of looking the name up again with drs.index()
# keeps doctors with identical generated names distinct and avoids a linear
# search per encounter.
encounter_doctor_idx = [
    doctor_idx
    for doctor_idx, n_encounters in enumerate(num_of_encounters)
    for _ in range(n_encounters)
]
doctors = [drs[doctor_idx] for doctor_idx in encounter_doctor_idx]
dr_specialties = [specialties[doctor_idx] for doctor_idx in encounter_doctor_idx]

# Note: 'servicing_provider_npi' is filled with the generated doctor names.
d = {
    'servicing_provider_npi': doctors,
    'specialty': dr_specialties,
    'personid': personid,
}
df = pd.DataFrame(d)


def _coder_profile(doctor_idx):
    """Return the base E/M weights for the doctor at position ``doctor_idx``.

    Within every run of 100 doctors, positions 0-19 are upcoders, 20-24 are
    high upcoders and the remainder are typical coders.
    """
    bucket = doctor_idx % 100
    if bucket < 20:
        return upcoders
    if bucket <= 24:
        return highupcoders
    return typicalcoders


# One procedure code per encounter: perturb the doctor's base distribution for
# the row's specialty, then sample a level from 1 to 5 with those weights.
code_levels = np.arange(1, 6)
procedure_codes = np.asarray([
    np.random.choice(
        code_levels,
        p=draw_distribution_codes(spec, **_coder_profile(doctor_idx)),
    )
    for doctor_idx, spec in zip(encounter_doctor_idx, dr_specialties)
])

df['procedure_code'] = procedure_codes
df['procedure_name'] = df['procedure_code'].map(procedure_map)
df['paid_amount'] = df['procedure_code'].map(paid_map)

In [6]:
# Preview the first five generated claim lines.
df.head(n=5)

Unnamed: 0,personid,servicing_provider_npi,specialty,procedure_code,procedure_name,paid_amount
0,2673,Dr. Marlene Ahlstrom,Other Specialty,5,"E/M, Highest Intensity",230
1,7567,Dr. Marlene Ahlstrom,Other Specialty,3,"E/M, Medium Intensity",80
2,9510,Dr. Marlene Ahlstrom,Other Specialty,3,"E/M, Medium Intensity",80
3,17106,Dr. Marlene Ahlstrom,Other Specialty,3,"E/M, Medium Intensity",80
4,24499,Dr. Marlene Ahlstrom,Other Specialty,5,"E/M, Highest Intensity",230


In [7]:
# Write the generated claims to <SCHEMA_NAME>.claim_lines, replacing any
# existing table of that name; the DataFrame index is not written.
df.to_sql(
    name='claim_lines',
    con=engine,
    schema=SCHEMA_NAME,
    if_exists='replace',
    index=False,
)

Using monkey-patched _execute_insert
