In [1]:
import json
import numpy as np
import pandas as pd
import psycopg2

%load_ext jupyternotify

<IPython.core.display.Javascript object>

# How many doctors on Medicaid drug utilization boards have received payments from drug manufacturers?

We will use Open Payments data to determine the extent of payments from August 2013 through 2017 from drug manufacturers to doctors serving on Medicaid drug utilization boards between the start of 2016 and March of 2018.

## Create tables for analysis

First, connect to the database.

In [2]:
# Load the database connection settings from the local JSON config file so
# that no credentials are written into the notebook itself.
with open("config.json") as config_file:
    conf = json.load(config_file)

In [3]:
# Build the libpq connection string from the loaded config. make_dsn quotes
# and escapes each value, so a password (or any other field) containing
# spaces, quotes or backslashes still produces a valid string; for plain
# values the result is the same "host=... dbname=... user=... password=..."
# text that manual formatting would give.
conn_str = psycopg2.extensions.make_dsn(
    host=conf["host"],
    dbname=conf["database"],
    user=conf["user"],
    password=conf["password"],
)

In [4]:
# Open the database connection using the connection string built from config.json.
conn = psycopg2.connect(conn_str)
# The later cells send CREATE TABLE / GRANT statements through this connection,
# so each statement must be committed without an explicit conn.commit().
conn.autocommit = True # Allow the notebook to commit transactions (like creating a table) to the connected database.

Query the database to create a new table of all Medicaid drug utilization board members and the general purpose payments they received from drug manufacturers.

In [5]:
%%notify
# Build (if it does not already exist) and load the `payments_by_doctor` table.
#
# The SQL below contains three statements:
#   1. CREATE TABLE IF NOT EXISTS ... AS
#        - `matching_committee_members`: committee_members LEFT JOINed to
#          open_payments_general on first name, last name and state, grouped to
#          one row per (physician_profile_id, name, state, degrees). Because it
#          is a LEFT JOIN, members with no name/state match are kept with a
#          NULL physician_profile_id.
#        - `totals_by_id`: for every physician_profile_id in
#          open_payments_general, the sum of total_amount_of_payment_usdollars
#          and the number of distinct record_id values.
#        - The two are LEFT JOINed on the profile id, so unmatched members get
#          NULL payments_amount / number_of_payments.
#   2. GRANT ALL on the new table to the `redash` role.
#   3. SELECT * from the table -- this trailing SELECT is the result set that
#      ends up in the DataFrame.
#
# Because of IF NOT EXISTS, re-running the cell does not rebuild an existing
# table; drop it first if the underlying data changed.
payments_by_doctor = pd.read_sql("""CREATE TABLE IF NOT EXISTS payments_by_doctor AS
SELECT physician_profile_id1 AS physician_profile_id,
       first_name,
       last_name,
       state_name,
       degree_1,
       degree_2,
       payments_amount,
       number_of_payments
FROM
  (SELECT physician_profile_id AS physician_profile_id1,
          first_name,
          last_name,
          state_name,
          degree_1,
          degree_2
   FROM committee_members
   LEFT JOIN open_payments_general ON first_name = physician_first_name
   AND last_name = physician_last_name
   AND state_name = recipient_state
   GROUP BY physician_profile_id,
            first_name,
            last_name,
            state_name,
            degree_1,
            degree_2) AS matching_committee_members
LEFT JOIN
  (SELECT physician_profile_id AS physician_profile_id2,
          sum(total_amount_of_payment_usdollars) AS payments_amount,
          count(DISTINCT record_id) AS number_of_payments
   FROM open_payments_general
   GROUP BY physician_profile_id) AS totals_by_id ON matching_committee_members.physician_profile_id1 = totals_by_id.physician_profile_id2;

GRANT ALL ON TABLE payments_by_doctor TO redash;


SELECT *
FROM payments_by_doctor;""", con=conn)

<IPython.core.display.Javascript object>

The only consistent data we have about members serving on state drug utilization boards are the members' first name, last name and state. As such, that’s all we have to join against the Open Payments data. So in order to understand the extent of possible duplicates in this data, we will count how many distinct doctors match each of our committee members in both the Open Payments general payments table and the [NPPES](https://npiregistry.cms.hhs.gov/) registry table (a national dataset of physicians).

In [6]:
%%notify
# Build (if it does not already exist) and load `payments_by_doctor_counts`:
# every row of payments_by_doctor plus two "how many doctors share this
# name and state" columns.
#
#   - doc_counts_nppes: number of distinct `npi` values in `nppes` with the
#     same first name, legal last name and practice-location state.
#   - doc_counts_open_payments: number of distinct `physician_profile_id`
#     values in `open_payments_general` with the same first name, last name
#     and recipient state.
#
# Both lookups are LEFT JOINs, so a count is NULL when the name/state
# combination does not appear in the corresponding table.
# The trailing SELECT is what populates the DataFrame; the GRANT gives the
# `redash` role access to the new table.
payments_by_doctor_counts = pd.read_sql("""
CREATE TABLE IF NOT EXISTS payments_by_doctor_counts AS
SELECT payments_by_doctor.physician_profile_id,
       payments_by_doctor.first_name,
       payments_by_doctor.last_name,
       payments_by_doctor.state_name,
       payments_by_doctor.degree_1,
       payments_by_doctor.degree_2,
       payments_amount,
       number_of_payments,
       doc_counts_nppes,
       doc_counts_open_payments
FROM payments_by_doctor
LEFT JOIN
  (SELECT provider_first_name,
          provider_last_name_legal_name,
          provider_business_practice_location_address_state_name,
          count(DISTINCT npi) AS doc_counts_nppes
   FROM nppes
   GROUP BY provider_first_name,
            provider_last_name_legal_name,
            provider_business_practice_location_address_state_name) AS nppes_grouped_by_first_last_state ON payments_by_doctor.first_name = nppes_grouped_by_first_last_state.provider_first_name
AND payments_by_doctor.last_name = nppes_grouped_by_first_last_state.provider_last_name_legal_name
AND payments_by_doctor.state_name = nppes_grouped_by_first_last_state.provider_business_practice_location_address_state_name
LEFT JOIN
  (SELECT physician_first_name,
          physician_last_name,
          recipient_state,
          count(DISTINCT physician_profile_id) AS doc_counts_open_payments
   FROM open_payments_general
   GROUP BY physician_first_name,
            physician_last_name,
            recipient_state) AS open_payments_grouped_by_first_last_state ON payments_by_doctor.first_name = open_payments_grouped_by_first_last_state.physician_first_name
AND payments_by_doctor.last_name = open_payments_grouped_by_first_last_state.physician_last_name
AND payments_by_doctor.state_name = open_payments_grouped_by_first_last_state.recipient_state;


GRANT ALL ON TABLE payments_by_doctor_counts TO redash;


SELECT *
FROM payments_by_doctor_counts;""", con=conn)

<IPython.core.display.Javascript object>

OK. So, we know we have some committee member doctors with the same name as other doctors in their state. Now let's find out the range of payments those doctors may have taken so we can get a sense as to how much uncertainty there is in our data vis-a-vis the prevalence of committee member doctors taking drug company money.

In [7]:
%%notify
# Build (if it does not already exist) and load `payments_by_doctor_max_min`:
# every row of payments_by_doctor_counts plus the largest (`max_amount`) and
# smallest (`min_amount`) `payments_amount` found among the payments_by_doctor
# rows that share the same first name, last name and state.
#
# NOTE: the subquery alias is `open_payments_grouped_by_first_last_state`, but
# it aggregates `payments_by_doctor`, not the raw Open Payments table.
# max()/min() skip NULLs, so both values are NULL only when no row for that
# name/state has a payments_amount.
# The trailing SELECT is what populates the DataFrame; the GRANT gives the
# `redash` role access to the new table.
payments_by_doctor_max_min = pd.read_sql("""
CREATE TABLE IF NOT EXISTS payments_by_doctor_max_min AS
SELECT physician_profile_id,
       first_name,
       last_name,
       state_name,
       degree_1,
       degree_2,
       payments_amount,
       max_amount,
       min_amount,
       number_of_payments,
       doc_counts_nppes,
       doc_counts_open_payments
FROM payments_by_doctor_counts
LEFT JOIN
  (SELECT first_name AS first_name_1,
          last_name AS last_name_1,
          state_name AS state_name_1,
          max(payments_amount) AS max_amount,
          min(payments_amount) AS min_amount
   FROM payments_by_doctor
   GROUP BY first_name_1,
            last_name_1,
            state_name_1) AS open_payments_grouped_by_first_last_state ON first_name = open_payments_grouped_by_first_last_state.first_name_1
AND last_name = open_payments_grouped_by_first_last_state.last_name_1
AND state_name = open_payments_grouped_by_first_last_state.state_name_1;

GRANT ALL ON TABLE payments_by_doctor_max_min TO redash;


SELECT *
FROM payments_by_doctor_max_min;""", con=conn)

<IPython.core.display.Javascript object>

When we joined the committee members table to the Open Payments table, our query returned multiple doctors with the same name and state in instances where multiple doctors were recorded as accepting payments. In order to avoid overstating the amount of payments these committee members took, we need to return a table of board members and the payments they received that eliminates duplicates.

In [8]:
%%notify
deduplicated_payments_by_doctor = pd.read_sql("""
CREATE TABLE IF NOT EXISTS deduplicated_payments_by_doctor AS
SELECT *
FROM
  (SELECT DISTINCT ON (first_name,
                       last_name,
                       state_name) *
   FROM payments_by_doctor_max_min) AS grouped_by_first_last_state;

GRANT ALL ON TABLE deduplicated_payments_by_doctor TO redash;


SELECT *
FROM deduplicated_payments_by_doctor;""", con=conn)

<IPython.core.display.Javascript object>

And finally, let's play it safe and remove those board member doctors who might have another doctor in the same state with the same name. To do so, we'll return a table that only includes doctors a) without possible duplicates, b) with more records in the Open Payments table than in the NPPES table or c) appearing only in the Open Payments table and not in the NPPES table.

In [9]:
%%notify
# Build (if it does not already exist) and load `duplicates_removed`: the
# subset of deduplicated_payments_by_doctor that passes the WHERE filter.
#
# The WHERE clause mixes AND and OR without parentheses. SQL evaluates AND
# before OR, so it is read as three alternatives:
#   (doc_counts_nppes = doc_counts_open_payments AND doc_counts_nppes = 1)
#   OR (doc_counts_nppes < doc_counts_open_payments)
#   OR (doc_counts_nppes IS NULL AND doc_counts_open_payments = 1)
# Every alternative needs a non-NULL doc_counts_open_payments, so rows with no
# Open Payments match are never selected.
# The trailing SELECT is what populates the DataFrame; the GRANT gives the
# `redash` role access to the new table.
duplicates_removed = pd.read_sql("""
CREATE TABLE IF NOT EXISTS duplicates_removed AS
SELECT *
FROM deduplicated_payments_by_doctor
WHERE doc_counts_nppes = doc_counts_open_payments
  AND doc_counts_nppes = 1
  OR doc_counts_nppes < doc_counts_open_payments
  OR doc_counts_nppes IS NULL
  AND doc_counts_open_payments = 1;

GRANT ALL ON TABLE duplicates_removed TO redash;


SELECT *
FROM duplicates_removed;""", con=conn)

<IPython.core.display.Javascript object>

## Analyze the data

How much money from drug companies have doctors sitting on drug utilization boards received? We will use the minimum amount column so as to avoid overcounting payments in cases where more than one doctor with the same name in the same state is listed in Open Payments.

In [10]:
# Sum the per-doctor minimum amounts over every row of duplicates_removed.
duplicates_removed.loc[:, "min_amount"].sum()

3036013.1100000013

OK. So doctors sitting on drug utilization boards have taken in about $3 million. And how many of these doctors are there?

In [11]:
# Per-column non-null counts for the doctors whose minimum amount is positive.
received_any_payment = duplicates_removed["min_amount"].gt(0)
duplicates_removed.loc[received_any_payment].count()

physician_profile_id        283
first_name                  283
last_name                   283
state_name                  283
degree_1                    274
degree_2                     29
payments_amount             283
max_amount                  283
min_amount                  283
number_of_payments          283
doc_counts_nppes            274
doc_counts_open_payments    283
dtype: int64

OK. So, 283 doctors on drug utilization boards have taken at least some money from drug companies. What share of all doctors sitting on these boards does that figure represent? To figure that out, we'll need to first calculate all the covered physicians sitting on the boards, as determined by their degree(s).

In [12]:
# Degree abbreviations this analysis treats as marking a "covered" physician.
# Used below to decide which board members count toward the denominator.
covered_degrees = [
    "MD",
    "DO",
    "DMD",
    "DDS",
    "DPM",
    "OD",
    "DC",
]
covered_degrees

['MD', 'DO', 'DMD', 'DDS', 'DPM', 'OD', 'DC']

So how many doctors sitting on these boards either held one of these degrees or took money from drug companies (and were thus by definition covered by Open Payments)? That's our denominator.

In [13]:
# A board member counts toward the denominator when either listed degree is a
# covered degree, or when their minimum payment amount is positive.
first_degree_covered = deduplicated_payments_by_doctor["degree_1"].isin(covered_degrees)
second_degree_covered = deduplicated_payments_by_doctor["degree_2"].isin(covered_degrees)
took_payment = deduplicated_payments_by_doctor["min_amount"] > 0

# Per-column non-null counts for the qualifying rows.
deduplicated_payments_by_doctor[first_degree_covered | second_degree_covered | took_payment].count()

physician_profile_id        313
first_name                  455
last_name                   455
state_name                  455
degree_1                    445
degree_2                     44
payments_amount             313
max_amount                  313
min_amount                  313
number_of_payments          313
doc_counts_nppes            377
doc_counts_open_payments    313
dtype: int64

OK. So, it looks like 455 doctors either held one of these covered degrees or took money from drug companies (and were thus by definition covered by Open Payments). It's possible some of the doctors for whom we lack degree information are covered by Open Payments but we just don't know. (It's also possible that some of the doctors we excluded from our count of doctors taking drug company money because there was at least one possible duplicate who didn't take money did, in fact, take money.) In any case, using the 455 figure as our denominator, 62 percent of covered doctors sitting on these boards took some amount of money from drug companies.

How many received more than $1,000?

In [14]:
# Per-column non-null counts for doctors whose minimum amount exceeds $1,000.
received_over_1000 = duplicates_removed["min_amount"].gt(1000)
duplicates_removed.loc[received_over_1000].count()

physician_profile_id        103
first_name                  103
last_name                   103
state_name                  103
degree_1                     99
degree_2                      9
payments_amount             103
max_amount                  103
min_amount                  103
number_of_payments          103
doc_counts_nppes             99
doc_counts_open_payments    103
dtype: int64

OK. So, 103 doctors received more than $1,000.

How many received more than $10,000?

In [15]:
# Per-column non-null counts for doctors whose minimum amount exceeds $10,000.
received_over_10000 = duplicates_removed["min_amount"].gt(10000)
duplicates_removed.loc[received_over_10000].count()

physician_profile_id        34
first_name                  34
last_name                   34
state_name                  34
degree_1                    31
degree_2                     3
payments_amount             34
max_amount                  34
min_amount                  34
number_of_payments          34
doc_counts_nppes            33
doc_counts_open_payments    34
dtype: int64

OK. So, 34 doctors received more than $10,000.

Now let's break a couple of these figures down by state. First up: how many states had doctors on such a board who received money from drug companies?

In [16]:
# Calculate the number of doctors receiving money per state and the amount they took.
# Named aggregation (pandas >= 0.25) produces the final column names directly,
# so there is no multi-level header to flatten and no rename keyed on function
# names. String aggregators replace the builtins sum/max/len, whose use inside
# agg() is deprecated in recent pandas. "size" counts every row in the group,
# exactly as len did.
receiving_docs_by_state = (
    duplicates_removed
    .groupby("state_name")
    .agg(
        total_payments=("min_amount", "sum"),
        max_payment=("min_amount", "max"),
        doctors_paid=("physician_profile_id", "size"),
    )
    .reset_index()
)

# Calculate the total number of doctors per state.
# Denominator rows: either degree is a covered degree, or the doctor's minimum
# payment amount is positive.
is_covered_or_paid = (
    deduplicated_payments_by_doctor["degree_1"].isin(covered_degrees)
    | deduplicated_payments_by_doctor["degree_2"].isin(covered_degrees)
    | (deduplicated_payments_by_doctor["min_amount"] > 0)
)
total_docs_by_state = (
    deduplicated_payments_by_doctor[is_covered_or_paid]
    .groupby("state_name")
    .size()  # rows per state, regardless of NULLs in any column
    .reset_index(name="total_doctors")
)

# Merge the dataframes.
# The outer join keeps states that appear in only one of the two frames.
payments_by_state = total_docs_by_state.merge(receiving_docs_by_state, how="outer", on="state_name")
payments_by_state[payments_by_state["total_payments"] > 0].count()

state_name        48
total_doctors     48
total_payments    48
max_payment       48
doctors_paid      48
dtype: int64

OK. So it looks like 48 states had doctors on such boards who received money.

And how many states had at least one doctor on such a board who received more than $1,000?

In [17]:
# Per-column non-null counts for states whose largest per-doctor amount exceeds $1,000.
state_has_large_payment = payments_by_state["max_payment"].gt(1000)
payments_by_state.loc[state_has_large_payment].count()

state_name        38
total_doctors     38
total_payments    38
max_payment       38
doctors_paid      38
dtype: int64

OK. It looks like 38 states had at least one doctor on such a board who received more than $1,000.

## Export the data to Excel for visualization and further analysis

In [18]:
# Write both result tables to one workbook, one sheet each.
# The context manager saves and closes the file on exit (also when a write
# raises), replacing the explicit writer.save() call that newer pandas no
# longer provides. sheet_name is passed by keyword because passing it
# positionally is deprecated.
with pd.ExcelWriter("data/doc_payments.xlsx") as writer:
    duplicates_removed.to_excel(writer, sheet_name="Payments by Doctor", startcol=0, index=False)
    payments_by_state.to_excel(writer, sheet_name="Payments by State", startcol=0, index=False)