# Import

In [1]:
import re
import sqlite3

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from jupyterthemes import jtplot
jtplot.style()

# Read in full_refs_hosp as refs

In [3]:
# Read the full_refs_hosp CSV into `refs`, then preview the first rows.
refs_csv = '../data/neo4j/full_refs_hosp.csv'
refs = pd.read_csv(refs_csv)
refs.head()

Unnamed: 0,from_npi,referral_id,entity_type_code_provider,name,taxonomy_code_provider,classification_provider,specialization_provider,organization_provider,address_line1_provider,address_line2_provider,...,address_line1_hospital,address_line2_hospital,city_hospital,state_hospital,zip_hospital,address_hospital,patient_count,transaction_count,average_day_wait,std_day_wait
0,1790730448,13418843,1.0,"STAFFORD, JAMES",2085R0202X,Radiology,Diagnostic Radiology,,210 25TH AVE N STE 1204,,...,313 N MAIN ST,,ASHLAND CITY,TN,37015,"313 N MAIN ST, ASHLAND CITY, TN 37015",65,67,24.552,50.253
1,1790762219,13418844,1.0,"GOODIN, ELLIS",2085R0202X,Radiology,Diagnostic Radiology,,210 25TH AVE N STE 1204,,...,313 N MAIN ST,,ASHLAND CITY,TN,37015,"313 N MAIN ST, ASHLAND CITY, TN 37015",61,63,17.238,35.569
2,1801017116,13418846,1.0,"STEWART, BRANDY",363LF0000X,Nurse Practitioner,Family,,313 N MAIN ST,,...,313 N MAIN ST,,ASHLAND CITY,TN,37015,"313 N MAIN ST, ASHLAND CITY, TN 37015",104,186,19.129,51.613
3,1811077712,17417315,1.0,"SHOEMAKER, BENJAMIN",207Q00000X,Family Medicine,,,302 N MAIN ST,,...,313 N MAIN ST,,ASHLAND CITY,TN,37015,"313 N MAIN ST, ASHLAND CITY, TN 37015",50,90,17.611,47.253
4,1821060526,19402345,1.0,"WATERS, RONALD",2085R0202X,Radiology,Diagnostic Radiology,,210 25TH AVE N STE 1204,,...,313 N MAIN ST,,ASHLAND CITY,TN,37015,"313 N MAIN ST, ASHLAND CITY, TN 37015",62,66,12.697,29.35


# Summarize by classification and specialization

## overview of top 20 taxonomies by referral patient count

In [51]:
# Total referred patients per taxonomy (code + classification + specialization),
# limited to the 20 largest.
#
# dropna=False: groupby discards every row whose key contains a missing value
# by default. The refs preview shows providers with a blank specialization
# (e.g. Family Medicine), so without this flag those taxonomies would be
# silently left out of the ranking.
#
# nlargest already returns its result in descending order, so no separate
# sort_values step is needed.
(refs.groupby(['taxonomy_code_provider', 'classification_provider', 'specialization_provider'],
              dropna=False)
     ['patient_count']
     .sum()
     .nlargest(20)
)

taxonomy_code_provider  classification_provider  specialization_provider                         
2085R0202X              Radiology                Diagnostic Radiology                                448885
207RC0000X              Internal Medicine        Cardiovascular Disease                              154245
363LF0000X              Nurse Practitioner       Family                                               61071
207ZP0102X              Pathology                Anatomic Pathology & Clinical Pathology              57185
207RP1001X              Internal Medicine        Pulmonary Disease                                    36691
207RN0300X              Internal Medicine        Nephrology                                           33908
207RG0100X              Internal Medicine        Gastroenterology                                     27214
207RI0011X              Internal Medicine        Interventional Cardiology                            25750
363LA2100X              Nurse Practiti

## make a function to prepare a dataframe showing highest number of potential patients: make_not_to_vandy
The output dataframe will also contain a useful column showing what percentage of patients are being referred somewhere other than Vanderbilt, which will help gauge growth potential.

In [67]:
def make_not_to_vandy(df, group, dropna=True):
    """Summarize, per group, how many patients are referred somewhere other than Vanderbilt.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``organization_hospital`` and
        ``patient_count`` plus every column named in ``group``.
        NOTE: a boolean ``to_vandy`` column is added to (or overwritten on)
        ``df`` in place. This side effect is kept on purpose so that any
        existing code reading ``df['to_vandy']`` afterwards keeps working.
    group : str or iterable of str
        Column name(s) to group by. A single column name is accepted as well
        as a list.
    dropna : bool, default True
        Passed through to ``groupby``. With the default, rows that have a
        missing value in any ``group`` column are left out (the standard
        pandas behavior). Pass ``False`` to keep them as their own group;
        this requires a pandas version whose ``groupby`` accepts ``dropna``.

    Returns
    -------
    pandas.DataFrame
        One row per group for the referrals that did NOT go to Vanderbilt,
        with columns ``group`` + ``to_vandy`` (always False),
        ``patient_count`` (patients referred elsewhere),
        ``patient_count_overall`` (all patients of the group) and
        ``patient_prop`` (``patient_count / patient_count_overall``),
        sorted by ``patient_count`` in descending order. Groups whose
        referrals all went to Vanderbilt do not appear.
    """
    group_cols = [group] if isinstance(group, str) else list(group)

    # Label each referral by whether the hospital organization name contains
    # "VANDERBILT". na=False makes a missing organization name count as
    # "not Vanderbilt"; without it the label would be NaN and the groupby
    # below would silently drop those referrals from every total.
    df['to_vandy'] = df['organization_hospital'].str.contains("VANDERBILT", na=False)

    # Only forward `dropna` when the caller opts in, so the default call is
    # the same plain groupby as before.
    groupby_opts = {} if dropna else {'dropna': False}

    # Patients per group, split by Vanderbilt / not Vanderbilt.
    to_vandy = (df.groupby(group_cols + ['to_vandy'], **groupby_opts)
         ['patient_count']
         .sum()
         .reset_index()
    )

    # Overall patients per group, repeated on both the True and False rows.
    # The string 'sum' is used instead of the builtin `sum`, which recent
    # pandas versions warn about when passed to transform.
    to_vandy['patient_count_overall'] = (to_vandy.groupby(group_cols, **groupby_opts)
                                     ['patient_count']
                                     .transform('sum')
                                    )
    to_vandy['patient_prop'] = (to_vandy['patient_count']/
                                     to_vandy['patient_count_overall']
                                    )

    # Keep only the rows describing referrals that did not go to Vanderbilt.
    not_to_vandy = to_vandy[~to_vandy['to_vandy']]

    return not_to_vandy.sort_values('patient_count', ascending = False)

## results by classification and specialization

Far and away the most potential is in Diagnostic Radiology, where about 81.5% of patients are being referred somewhere other than Vanderbilt, comprising 365,807 potential patients. 

Note that Internal Medicine specializations comprise 12 of the top 25; however, the proportions of patients not going to Vanderbilt vary greatly by specialization, so our recommendation would be to focus on specific specializations and not just the Internal Medicine classification as a whole.

The top 5 could reasonably be divided into four tiers; however, they still stand apart from the rest as specializations to target for volume.

    - Radiology: Diagnostic Radiology
    - Internal Medicine: Cardiovascular Disease
    - Nurse Practitioner: Family
    - Pathology: Anatomic Pathology & Clinical Pathology
    - Internal Medicine: Pulmonary Disease

In [73]:
# Columns that together identify one provider taxonomy (specialty).
specialty_keys = ['taxonomy_code_provider', 'classification_provider', 'specialization_provider']
spec_not_to_vandy = make_not_to_vandy(refs, specialty_keys)

# show the 25 specialties with the most patients referred somewhere other than Vanderbilt
spec_not_to_vandy.nlargest(25, 'patient_count')

Unnamed: 0,taxonomy_code_provider,classification_provider,specialization_provider,to_vandy,patient_count,patient_count_overall,patient_prop
152,2085R0202X,Radiology,Diagnostic Radiology,False,365807,448885,0.814924
39,207RC0000X,Internal Medicine,Cardiovascular Disease,False,93743,154245,0.607754
195,363LF0000X,Nurse Practitioner,Family,False,45689,61071,0.748129
107,207ZP0102X,Pathology,Anatomic Pathology & Clinical Pathology,False,45602,57185,0.797447
65,207RP1001X,Internal Medicine,Pulmonary Disease,False,30967,36691,0.843994
63,207RN0300X,Internal Medicine,Nephrology,False,23863,33908,0.703757
59,207RI0011X,Internal Medicine,Interventional Cardiology,False,23259,25750,0.903262
47,207RG0100X,Internal Medicine,Gastroenterology,False,19345,27214,0.710847
41,207RC0001X,Internal Medicine,Clinical Cardiac Electrophysiology,False,14947,18227,0.820047
129,2084N0400X,Psychiatry & Neurology,Neurology,False,13275,22908,0.579492


These results are not dramatically different, but this list is filtered to include only specializations where at least half of the patients are referred somewhere other than Vanderbilt.

One specialization of interest that creeps into the top 25 is Emergency Medical Services. Here Vanderbilt is the referral destination less than 5% of the time.

Vanderbilt is never referred to for Orthopedic Surgery of the Spine.

In [74]:
# Room-to-grow filter: keep only specialties where at least half of the patients
# are referred somewhere other than Vanderbilt, then show the 25 largest.
majority_elsewhere = spec_not_to_vandy['patient_prop'] >= 0.5
spec_not_to_vandy.loc[majority_elsewhere].nlargest(25, 'patient_count')

Unnamed: 0,taxonomy_code_provider,classification_provider,specialization_provider,to_vandy,patient_count,patient_count_overall,patient_prop
152,2085R0202X,Radiology,Diagnostic Radiology,False,365807,448885,0.814924
39,207RC0000X,Internal Medicine,Cardiovascular Disease,False,93743,154245,0.607754
195,363LF0000X,Nurse Practitioner,Family,False,45689,61071,0.748129
107,207ZP0102X,Pathology,Anatomic Pathology & Clinical Pathology,False,45602,57185,0.797447
65,207RP1001X,Internal Medicine,Pulmonary Disease,False,30967,36691,0.843994
63,207RN0300X,Internal Medicine,Nephrology,False,23863,33908,0.703757
59,207RI0011X,Internal Medicine,Interventional Cardiology,False,23259,25750,0.903262
47,207RG0100X,Internal Medicine,Gastroenterology,False,19345,27214,0.710847
41,207RC0001X,Internal Medicine,Clinical Cardiac Electrophysiology,False,14947,18227,0.820047
129,2084N0400X,Psychiatry & Neurology,Neurology,False,13275,22908,0.579492


Here is an ordered list of all specializations for which Vanderbilt is never the referral destination. 

Orthopaedic Surgery of the Spine is in a class of its own with regard to patient count.

In [76]:
# Specialties whose patients are all referred somewhere other than Vanderbilt
# (non-Vanderbilt share equal to 1), limited to the 25 largest by patient count.
never_vandy = spec_not_to_vandy['patient_prop'].eq(1)
spec_not_to_vandy.loc[never_vandy].nlargest(25, 'patient_count')

Unnamed: 0,taxonomy_code_provider,classification_provider,specialization_provider,to_vandy,patient_count,patient_count_overall,patient_prop
91,207XS0117X,Orthopaedic Surgery,Orthopaedic Surgery of the Spine,False,3139,3139,1.0
144,2085D0003X,Radiology,Diagnostic Neuroimaging,False,497,497,1.0
184,261QP2300X,Clinic/Center,Primary Care,False,311,311,1.0
98,207YX0007X,Otolaryngology,Plastic Surgery within the Head & Neck,False,294,294,1.0
154,2085R0203X,Radiology,Therapeutic Radiology,False,218,218,1.0
212,364SW0102X,Clinical Nurse Specialist,Women's Health,False,206,206,1.0
160,2086S0105X,Surgery,Surgery of the Hand,False,194,194,1.0
115,2080P0207X,Pediatrics,Pediatric Hematology-Oncology,False,139,139,1.0
85,207WX0200X,Ophthalmology,Ophthalmic Plastic and Reconstructive Surgery,False,131,131,1.0
24,207PP0204X,Emergency Medicine,Pediatric Emergency Medicine,False,107,107,1.0


## results by provider

Derek Welch, a pathologist who works for Tristar (HCA), is by far the provider with the most potential patients for Vanderbilt. 

With nearly a third of the volume, the runner-up is also an HCA doctor, John Riddick. 

Strikingly, the rest of the top 25 are Diagnostic Radiology doctors.

In [71]:
# Same summary, grouped per referring provider (NPI and name) together with
# that provider's classification and specialization.
provider_keys = ['from_npi', 'name', 'classification_provider', 'specialization_provider']
prov_not_to_vandy = make_not_to_vandy(refs, provider_keys)

# show the 25 providers with the most patients referred somewhere other than Vanderbilt
prov_not_to_vandy.nlargest(25, 'patient_count')

Unnamed: 0,from_npi,name,classification_provider,specialization_provider,to_vandy,patient_count,patient_count_overall,patient_prop
238,1093753303,"WELCH, DEREK",Pathology,Anatomic Pathology & Clinical Pathology,False,19165,20018,0.957388
1232,1417131715,"RIDDICK, JOHN",Internal Medicine,Interventional Cardiology,False,6360,6360,1.0
120,1043232879,"GORDON, JONATHAN",Radiology,Diagnostic Radiology,False,5305,5817,0.911982
2799,1912984758,"LASSITER, GREGORY",Radiology,Diagnostic Radiology,False,5216,5216,1.0
2654,1871548818,"KLEIN, WILLIAM",Radiology,Diagnostic Radiology,False,5184,5184,1.0
366,1134321235,"PARIKH, VIRAJ",Radiology,Diagnostic Radiology,False,4914,5063,0.970571
901,1306993282,"SHIPMAN, JASON",Radiology,Diagnostic Radiology,False,4737,4737,1.0
1097,1376756742,"GRIFFIN, BENJAMIN",Radiology,Diagnostic Radiology,False,4679,4679,1.0
2517,1821060526,"WATERS, RONALD",Radiology,Diagnostic Radiology,False,4215,4215,1.0
1328,1447234141,"CAPLAN, STUART",Radiology,Diagnostic Radiology,False,4033,4033,1.0
