In [1]:
import json
import numpy as np
import pandas as pd
import psycopg2
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 500)
pd.set_option("display.float_format", lambda x: "%.2f" % x) # Suppress scientific notation for float data type

## How many doctors on Medicaid drug utilization boards have received payments from drug manufacturers?

We will use Open Payments data to determine the extent of payments from 2013 through 2016 from drug manufacturers to doctors serving on Medicaid drug utilization boards between the start of 2016 and March of 2018.

First, connect to the database.

In [2]:
with open("config.json") as f:
    conf = json.load(f)

In [3]:
conn_str = "host={} dbname={} user={} password={}".format(conf["host"], conf["database"], conf["user"], conf["password"])

In [4]:
conn = psycopg2.connect(conn_str)
conn.autocommit = True # Allow the notebook to commit transactions (like updating a column) to the connected database.

Query the database to return all doctors serving on Medicaid drug utilization boards and, if applicable, what general purpose payments they received from drug manufacturers.

In [5]:
general_payments = pd.read_sql("""SELECT dur_committee_members.*,
       open_payments_general.change_type AS change_type,
       open_payments_general.physician_profile_id AS physician_id,
       open_payments_general.physician_first_name AS first,
       open_payments_general.physician_middle_name AS middle,
       open_payments_general.physician_last_name AS last,
       open_payments_general.recipient_city AS city,
       open_payments_general.recipient_state AS state_name,
       open_payments_general.physician_specialty AS specialty,
       open_payments_general.applicable_manufacturer_or_applicable_gpo_making_payment_id AS manufacturer_id,
       open_payments_general.applicable_manufacturer_or_applicable_gpo_making_payment_name AS manufacturer_name,
       open_payments_general.total_amount_of_payment_usdollars AS amount,
       open_payments_general.date_of_payment AS payment_date,
       open_payments_general.form_of_payment_or_transfer_of_value AS payment_form,
       open_payments_general.nature_of_payment_or_transfer_of_value AS nature_of_payment,
       open_payments_general.record_id AS record_id,
       open_payments_general.dispute_status_for_publication AS dispute_status,
       open_payments_general.name_of_drug_or_biological_or_device_or_medical_supply_1 AS drug_device_1,
       open_payments_general.associated_drug_or_biological_ndc_1 AS ndc_1,
       open_payments_general.name_of_drug_or_biological_or_device_or_medical_supply_1 AS drug_device_2,
       open_payments_general.associated_drug_or_biological_ndc_1 AS ndc_2,
       open_payments_general.name_of_drug_or_biological_or_device_or_medical_supply_1 AS drug_device_3,
       open_payments_general.associated_drug_or_biological_ndc_1 AS ndc_3,
       open_payments_general.name_of_drug_or_biological_or_device_or_medical_supply_1 AS drug_device_4,
       open_payments_general.associated_drug_or_biological_ndc_1 AS ndc_4,
       open_payments_general.name_of_drug_or_biological_or_device_or_medical_supply_1 AS drug_device_5,
       open_payments_general.associated_drug_or_biological_ndc_1 AS ndc_5
FROM dur_committee_members
LEFT JOIN open_payments_general ON dur_committee_members.first_name = open_payments_general.physician_first_name
AND dur_committee_members.last_name = open_payments_general.physician_last_name
AND dur_committee_members.state_name = open_payments_general.recipient_state""",
                           con=conn)

DatabaseError: Execution failed on sql 'SELECT dur_committee_members.*,
       open_payments_general.change_type AS change_type,
       open_payments_general.physician_profile_id AS physician_id,
       open_payments_general.physician_first_name AS first,
       open_payments_general.physician_middle_name AS middle,
       open_payments_general.physician_last_name AS last,
       open_payments_general.recipient_city AS city,
       open_payments_general.recipient_state AS state_name,
       open_payments_general.physician_specialty AS specialty,
       open_payments_general.applicable_manufacturer_or_applicable_gpo_making_payment_id AS manufacturer_id,
       open_payments_general.applicable_manufacturer_or_applicable_gpo_making_payment_name AS manufacturer_name,
       open_payments_general.total_amount_of_payment_usdollars AS amount,
       open_payments_general.date_of_payment AS payment_date,
       open_payments_general.form_of_payment_or_transfer_of_value AS payment_form,
       open_payments_general.nature_of_payment_or_transfer_of_value AS nature_of_payment,
       open_payments_general.record_id AS record_id,
       open_payments_general.dispute_status_for_publication AS dispute_status,
       open_payments_general.name_of_drug_or_biological_or_device_or_medical_supply_1 AS drug_device_1,
       open_payments_general.associated_drug_or_biological_ndc_1 AS ndc_1,
       open_payments_general.name_of_drug_or_biological_or_device_or_medical_supply_1 AS drug_device_2,
       open_payments_general.associated_drug_or_biological_ndc_1 AS ndc_2,
       open_payments_general.name_of_drug_or_biological_or_device_or_medical_supply_1 AS drug_device_3,
       open_payments_general.associated_drug_or_biological_ndc_1 AS ndc_3,
       open_payments_general.name_of_drug_or_biological_or_device_or_medical_supply_1 AS drug_device_4,
       open_payments_general.associated_drug_or_biological_ndc_1 AS ndc_4,
       open_payments_general.name_of_drug_or_biological_or_device_or_medical_supply_1 AS drug_device_5,
       open_payments_general.associated_drug_or_biological_ndc_1 AS ndc_5
FROM dur_committee_members
LEFT JOIN open_payments_general ON dur_committee_members.first_name = open_payments_general.physician_first_name
AND dur_committee_members.last_name = open_payments_general.physician_last_name
AND dur_committee_members.state_name = open_payments_general.recipient_state': column dur_committee_members.state_name does not exist
LINE 31: AND dur_committee_members.state_name = open_payments_general...
             ^


In [42]:
general_payments["amount"] = pd.to_numeric(general_payments["amount"], errors="raise") # Convert the amount column to float data type
general_payments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21014 entries, 0 to 21013
Data columns (total 43 columns):
first_name             21012 non-null object
last_name              21014 non-null object
middle_initial         2953 non-null object
degree_1               19653 non-null object
degree_2               1872 non-null object
state                  21014 non-null object
dur_pt_other           21014 non-null object
info_date              21014 non-null object
date_minutes_or_web    21014 non-null object
industry_rep           21 non-null object
location               6249 non-null object
specialty              1832 non-null object
committee_alt_name     4930 non-null object
notes                  7346 non-null object
disclosure_received    4251 non-null object
conflict_disclosed     2235 non-null object
conflict_details       1686 non-null object
change_type            20323 non-null object
physician_id           20323 non-null object
first                  20323 non-null object
mid

In [43]:
general_payments.head()

Unnamed: 0,first_name,last_name,middle_initial,degree_1,degree_2,state,dur_pt_other,info_date,date_minutes_or_web,industry_rep,location,specialty,committee_alt_name,notes,disclosure_received,conflict_disclosed,conflict_details,change_type,physician_id,first,middle,last,city,state_name,specialty.1,manufacturer_id,manufacturer_name,amount,payment_date,payment_form,nature_of_payment,record_id,dispute_status,drug_device_1,ndc_1,drug_device_2,ndc_2,drug_device_3,ndc_3,drug_device_4,ndc_4,drug_device_5,ndc_5
0,DAVID,ELWELL,,MD,,CO,P&T,04/04/17,Minutes,,,,,,yes,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,JAMES,FEINSTEIN,,MD,,CO,P&T,01/05/16,Minutes,,,,,,yes,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,KERSTIN,FROYD,,MD,,CO,DUR,02/16/16,Minutes,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,LEWAYNE,GARRISON,,RPh,,CO,DUR,02/16/16,Minutes,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,MICHELLE,HILAIRE,,PharmD,,CO,P&T,04/04/17,Minutes,,,,,,yes,,,,,,,,,,,,,,,,,,,,,,,,,,,,


Do we have duplicates in the data?

In [44]:
general_payments.groupby("record_id")["record_id"].count().sort_values(ascending=False)

record_id
152466203    2
380096188    2
391060074    2
380096208    2
340912492    2
340912490    2
216452814    2
216452820    2
216452826    2
216452830    2
216452834    2
391274420    2
391274442    2
391274468    2
391274490    2
391274512    2
391274534    2
395501984    2
391274556    2
391274580    2
391274606    2
391274626    2
391274651    2
104616676    2
166669518    2
380096196    2
309879230    2
156466714    2
309879226    2
156466710    2
137834354    2
137827356    2
156358632    2
156358630    2
379924742    2
369928800    2
187780858    2
395915292    2
362593542    2
201767802    2
362613306    2
362624166    2
362625480    2
362630068    2
362631152    2
308852652    2
362634994    2
395749944    2
395749874    2
187906294    2
149553884    2
380096180    2
166669512    2
166669506    2
391406890    2
166669498    2
150145894    2
137686916    2
137685728    2
137685726    2
137684458    2
137683842    2
330599479    2
137683840    2
137682976    2
365949744    2


We do. Let's drop them.

In [45]:
general_payments.drop_duplicates("record_id", inplace=True)
general_payments.groupby("record_id")["record_id"].count().sort_values(ascending=False)

record_id
9891599      1
207363190    1
207344180    1
207344728    1
207348488    1
207350178    1
207353240    1
207356232    1
207362752    1
207365098    1
207332926    1
207366898    1
207371980    1
207792532    1
207792538    1
207792542    1
207792546    1
207792548    1
207342926    1
207331720    1
206883076    1
207294556    1
207266400    1
207267764    1
207269564    1
207280234    1
207280424    1
207286232    1
207286552    1
207294690    1
207330652    1
207295384    1
207302986    1
207309264    1
207318844    1
207319746    1
207322526    1
207328590    1
207792552    1
207792556    1
207792560    1
208183058    1
208153788    1
208153976    1
208153978    1
208167538    1
208167540    1
208176238    1
208179934    1
208189204    1
207792562    1
208191932    1
208192024    1
208195510    1
208201196    1
208208560    1
208209506    1
208209986    1
208150812    1
208149920    1
208149918    1
208149916    1
207792566    1
207918980    1
207918982    1
207960846    1


What is the total value of general purpose payments to doctors serving on Medicaid drug utilization boards?

In [49]:
general_payments["amount"].sum()

2501139.0399999972

What percentage of doctors serving on Medicaid drug utilization boards received general purpose payments?

In [54]:
gp_payments_by_doc = general_payments.groupby(["state_name", "last_name", "first_name"])["amount"].sum().reset_index(name="total_amount")
gp_payments_by_doc.head()

Unnamed: 0,state,last_name,first_name,total_amount
0,AK,CARLSON,ROBERT,147.82
1,AK,MCANALLY,HEATH,496.28
2,AK,PAPPENHEIM,JOHN,113.8
3,AL,CARTER,LEE,3747.19
4,AL,DAWSON,ELIZABETH,15.48


In [51]:
gp_payments_by_doc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297 entries, 0 to 296
Data columns (total 4 columns):
state           297 non-null object
last_name       297 non-null object
first_name      297 non-null object
total_amount    296 non-null float64
dtypes: float64(1), object(3)
memory usage: 9.4+ KB


OK. So, of the 998 doctors serving on Medicaid drug utilization boards, 297 — 29.8 percent — received general purpose payments.

Query the database to return all doctors serving on Medicaid drug utilization boards and, if applicable, what ownership or investment interests they held in drug manufacturers.

In [31]:
ownership_payments = pd.read_sql("""SELECT dur_committee_members.*,
       open_payments_ownership.change_type AS change_type,
       open_payments_ownership.physician_profile_id AS physician_id,
       open_payments_ownership.physician_first_name AS first,
       open_payments_ownership.physician_middle_name AS middle,
       open_payments_ownership.physician_last_name AS last,
       open_payments_ownership.recipient_city AS city,
       open_payments_ownership.recipient_state AS state_name,
       open_payments_ownership.physician_specialty AS specialty,
       open_payments_ownership.total_amount_invested_usdollars AS total_amount_invested,
       open_payments_ownership.value_of_interest AS value_of_interest,
       open_payments_ownership.terms_of_interest AS terms_of_interest,
       open_payments_ownership.applicable_manufacturer_or_applicable_gpo_making_payment_id AS manufacturer_id,
       open_payments_ownership.applicable_manufacturer_or_applicable_gpo_making_payment_name AS manufacturer_name,
       open_payments_ownership.interest_held_by_physician_or_an_immediate_family_member AS nature_of_interest,
       open_payments_ownership.record_id AS record_id,
       open_payments_ownership.dispute_status_for_publication AS dispute_status
FROM dur_committee_members
LEFT JOIN open_payments_ownership ON dur_committee_members.first_name = open_payments_ownership.physician_first_name
AND dur_committee_members.last_name = open_payments_ownership.physician_last_name
AND dur_committee_members.state_name = open_payments_ownership.recipient_state""",
                           con=conn)

In [57]:
# Convert the investment amount columns to float data type
ownership_payments["total_amount_invested"] = pd.to_numeric(ownership_payments["total_amount_invested"], errors="raise")
ownership_payments["value_of_interest"] = pd.to_numeric(ownership_payments["value_of_interest"], errors="raise")
ownership_payments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 998 entries, 0 to 997
Data columns (total 33 columns):
first_name               996 non-null object
last_name                998 non-null object
middle_initial           112 non-null object
degree_1                 926 non-null object
degree_2                 90 non-null object
state                    998 non-null object
dur_pt_other             998 non-null object
info_date                998 non-null object
date_minutes_or_web      998 non-null object
industry_rep             9 non-null object
location                 186 non-null object
specialty                54 non-null object
committee_alt_name       126 non-null object
notes                    351 non-null object
disclosure_received      114 non-null object
conflict_disclosed       27 non-null object
conflict_details         16 non-null object
change_type              2 non-null object
physician_id             2 non-null object
first                    2 non-null object
middle 

In [58]:
ownership_payments.head()

Unnamed: 0,first_name,last_name,middle_initial,degree_1,degree_2,state,dur_pt_other,info_date,date_minutes_or_web,industry_rep,location,specialty,committee_alt_name,notes,disclosure_received,conflict_disclosed,conflict_details,change_type,physician_id,first,middle,last,city,state_name,specialty.1,total_amount_invested,value_of_interest,terms_of_interest,manufacturer_id,manufacturer_name,nature_of_interest,record_id,dispute_status
0,ANDREAS,WALI,,MD,,PA,P&T,03/01/18,Web,,,cardiologist,,effective Oct 11 2017,,,,UNCHANGED,29544.0,ANDREAS,U,WALI,Camp Hill,PA,Allopathic & Osteopathic Physicians|Internal M...,19086.85,10607.38,Stock ownership,100000000133.0,TriReme Medical LLC,Physician Covered Recipient,413359278.0,No
1,SHARON,WEINSTEIN,,MD,,UT,DUR,12/08/16,Minutes,,,,,,,,,UNCHANGED,173760.0,SHARON,,WEINSTEIN,Salt Lake City,UT,Allopathic & Osteopathic Physicians|Psychiatry...,0.0,0.0,,100000005587.0,"DARA Biosciences, Inc.",Physician Covered Recipient,152335554.0,No
2,JAMES,MAGEE,,MD,,AR,DUR,02/27/18,Web,,,,,Listed on Magellan website at https://arkansas...,,,,,,,,,,,,,,,,,,,
3,AMBER,FIGUEROA,,DO,,WA,P&T,03/01/18,Web,,Wenatchee,family medicine,,Columbia Valley Community Health,,,,,,,,,,,,,,,,,,,
4,JEFFREY,PERRIN,,MD,,NC,DUR,03/01/18,Web,,,Pediatrics,,from email from virginia niehaus,,,,,,,,,,,,,,,,,,,


Do we have duplicates in the data?

In [59]:
ownership_payments.groupby("record_id")["record_id"].count().sort_values(ascending=False)

record_id
413359278    1
152335554    1
Name: record_id, dtype: int64

What is the total value of the ownership or investment interests of doctors serving on Medicaid drug utilization boards?

In [60]:
ownership_payments["total_amount_invested"].sum()

19086.85

What percentage of doctors serving on Medicaid drug utilization boards held ownership or investment interests in drug manufacturers?

In [61]:
ownership_payments_by_doc = ownership_payments.groupby(["state_name", "last_name", "first_name"])["total_amount_invested"].sum().reset_index(name="total_amount")
ownership_payments_by_doc.head()

Unnamed: 0,state,last_name,first_name,total_amount
0,AK,CARLSON,ROBERT,
1,AK,DEMAIN,JEFFERY,
2,AK,GREEAR,VINCENT,
3,AK,HIESTAND,JENNA,
4,AK,LILJEGREN,DIANE,


In [62]:
ownership_payments_by_doc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 961 entries, 0 to 960
Data columns (total 4 columns):
state           961 non-null object
last_name       961 non-null object
first_name      961 non-null object
total_amount    2 non-null float64
dtypes: float64(1), object(3)
memory usage: 30.1+ KB


Query the database to return all doctors serving on Medicaid drug utilization boards and, if applicable, what research payments they received from drug manufacturers.

In [30]:
research_payments = pd.read_sql("""SELECT dur_committee_members.*,
       open_payments_research.change_type AS change_type,
       open_payments_research.teaching_hospital_name AS hospital,
       open_payments_research.physician_profile_id AS physician_id,
       open_payments_research.physician_first_name AS first,
       open_payments_research.physician_middle_name AS middle,
       open_payments_research.physician_last_name AS last,
       open_payments_research.recipient_city AS city,
       open_payments_research.recipient_state AS state,
       open_payments_research.physician_specialty AS specialty,
       open_payments_research.applicable_manufacturer_or_applicable_gpo_making_payment_id AS manufacturer_id,
       open_payments_research.applicable_manufacturer_or_applicable_gpo_making_payment_name AS manufacturer_name,
       open_payments_research.total_amount_of_payment_usdollars AS amount,
       open_payments_research.date_of_payment AS payment_date,
       open_payments_research.form_of_payment_or_transfer_of_value AS payment_form,
       open_payments_research.principal_investigator_1_first_name,
       open_payments_research.principal_investigator_1_middle_name,
       open_payments_research.principal_investigator_1_last_name,
       open_payments_research.principal_investigator_1_city,
       open_payments_research.principal_investigator_1_state,
       open_payments_research.principal_investigator_1_specialty,
       open_payments_research.principal_investigator_2_first_name,
       open_payments_research.principal_investigator_2_middle_name,
       open_payments_research.principal_investigator_2_last_name,
       open_payments_research.principal_investigator_2_city,
       open_payments_research.principal_investigator_2_state,
       open_payments_research.principal_investigator_2_specialty,
       open_payments_research.principal_investigator_3_first_name,
       open_payments_research.principal_investigator_3_middle_name,
       open_payments_research.principal_investigator_3_last_name,
       open_payments_research.principal_investigator_3_city,
       open_payments_research.principal_investigator_3_state,
       open_payments_research.principal_investigator_3_specialty,
       open_payments_research.principal_investigator_4_first_name,
       open_payments_research.principal_investigator_4_middle_name,
       open_payments_research.principal_investigator_4_last_name,
       open_payments_research.principal_investigator_4_city,
       open_payments_research.principal_investigator_4_state,
       open_payments_research.principal_investigator_4_specialty,
       open_payments_research.principal_investigator_5_first_name,
       open_payments_research.principal_investigator_5_middle_name,
       open_payments_research.principal_investigator_5_last_name,
       open_payments_research.principal_investigator_5_city,
       open_payments_research.principal_investigator_5_state,
       open_payments_research.principal_investigator_5_specialty,
       open_payments_research.name_of_drug_or_biological_or_device_or_medical_supply_1 AS drug_device_1,
       open_payments_research.associated_drug_or_biological_ndc_1 AS ndc_1,
       open_payments_research.name_of_drug_or_biological_or_device_or_medical_supply_1 AS drug_device_2,
       open_payments_research.associated_drug_or_biological_ndc_1 AS ndc_2,
       open_payments_research.name_of_drug_or_biological_or_device_or_medical_supply_1 AS drug_device_3,
       open_payments_research.associated_drug_or_biological_ndc_1 AS ndc_3,
       open_payments_research.name_of_drug_or_biological_or_device_or_medical_supply_1 AS drug_device_4,
       open_payments_research.associated_drug_or_biological_ndc_1 AS ndc_4,
       open_payments_research.name_of_drug_or_biological_or_device_or_medical_supply_1 AS drug_device_5,
       open_payments_research.associated_drug_or_biological_ndc_1 AS ndc_5,
       open_payments_research.expenditure_category1 AS expenditure_category_1,
       open_payments_research.expenditure_category2 AS expenditure_category_2,
       open_payments_research.expenditure_category3 AS expenditure_category_3,
       open_payments_research.expenditure_category4 AS expenditure_category_4,
       open_payments_research.expenditure_category5 AS expenditure_category_5,
       open_payments_research.expenditure_category6 AS expenditure_category_6,
       open_payments_research.dispute_status_for_publication AS dispute_status,
       open_payments_research.record_id AS record_id
FROM dur_committee_members
LEFT JOIN open_payments_research ON dur_committee_members.first_name = open_payments_research.physician_first_name
AND dur_committee_members.last_name = open_payments_research.physician_last_name
AND dur_committee_members.state_name = open_payments_research.recipient_state
OR dur_committee_members.first_name = open_payments_research.principal_investigator_1_first_name
AND dur_committee_members.last_name = open_payments_research.principal_investigator_1_last_name
AND dur_committee_members.state_name = open_payments_research.principal_investigator_1_state
OR dur_committee_members.first_name = open_payments_research.principal_investigator_2_first_name
AND dur_committee_members.last_name = open_payments_research.principal_investigator_2_last_name
AND dur_committee_members.state_name = open_payments_research.principal_investigator_2_state
OR dur_committee_members.first_name = open_payments_research.principal_investigator_3_first_name
AND dur_committee_members.last_name = open_payments_research.principal_investigator_3_last_name
AND dur_committee_members.state_name = open_payments_research.principal_investigator_3_state
OR dur_committee_members.first_name = open_payments_research.principal_investigator_4_first_name
AND dur_committee_members.last_name = open_payments_research.principal_investigator_4_last_name
AND dur_committee_members.state_name = open_payments_research.principal_investigator_4_state
OR dur_committee_members.first_name = open_payments_research.principal_investigator_5_first_name
AND dur_committee_members.last_name = open_payments_research.principal_investigator_5_last_name
AND dur_committee_members.state_name = open_payments_research.principal_investigator_5_state""",
                           con=conn)

In [19]:
research_payments["amount"] = pd.to_numeric(research_payments["amount"], errors="raise") # Convert the amount column to float data type
research_payments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5193 entries, 0 to 5192
Data columns (total 79 columns):
first_name                              5191 non-null object
last_name                               5193 non-null object
middle_initial                          160 non-null object
degree_1                                5113 non-null object
degree_2                                135 non-null object
state                                   5193 non-null object
dur_pt_other                            5193 non-null object
info_date                               5193 non-null object
date_minutes_or_web                     5193 non-null object
industry_rep                            9 non-null object
location                                806 non-null object
specialty                               898 non-null object
committee_alt_name                      690 non-null object
notes                                   1356 non-null object
disclosure_received                     179 non

In [21]:
research_payments.head()

Unnamed: 0,first_name,last_name,middle_initial,degree_1,degree_2,state,dur_pt_other,info_date,date_minutes_or_web,industry_rep,location,specialty,committee_alt_name,notes,disclosure_received,conflict_disclosed,conflict_details,change_type,hospital,physician_id,first,middle,last,city,state.1,...,principal_investigator_4_specialty,principal_investigator_5_first_name,principal_investigator_5_middle_name,principal_investigator_5_last_name,principal_investigator_5_city,principal_investigator_5_state,principal_investigator_5_specialty,drug_device_1,ndc_1,drug_device_2,ndc_2,drug_device_3,ndc_3,drug_device_4,ndc_4,drug_device_5,ndc_5,expenditure_category_1,expenditure_category_2,expenditure_category_3,expenditure_category_4,expenditure_category_5,expenditure_category_6,dispute_status,record_id
0,DAVID,ELWELL,,MD,,CO,P&T,04/04/17,Minutes,,,,,,yes,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
1,JAMES,FEINSTEIN,,MD,,CO,P&T,01/05/16,Minutes,,,,,,yes,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
2,KERSTIN,FROYD,,MD,,CO,DUR,02/16/16,Minutes,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
3,LEWAYNE,GARRISON,,RPh,,CO,DUR,02/16/16,Minutes,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
4,MICHELLE,HILAIRE,,PharmD,,CO,P&T,04/04/17,Minutes,,,,,,yes,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,


Do we have duplicates in the data?

In [22]:
research_payments.groupby("record_id")["record_id"].count().sort_values(ascending=False)

record_id
4806114      2
306957892    2
307928734    2
307928736    2
307928738    2
307928740    2
307928742    2
307928744    2
307928746    2
307928748    2
307928750    2
307928752    2
307928754    2
307928756    2
307928758    2
307928760    2
307928762    2
307928764    2
307928766    2
307928768    2
307928770    2
307928772    2
307928774    2
307928732    2
307928730    2
307928728    2
307928704    2
306962320    2
306971176    2
306971188    2
417010363    2
306971212    2
306971898    2
306973439    2
307928700    2
307928702    2
307928706    2
307928726    2
307928708    2
307928710    2
307928712    2
307928714    2
307928716    2
307928718    2
307928720    2
307928722    2
307928724    2
307928776    2
307928778    2
307928780    2
322417522    2
31505278     2
31505279     2
318728403    2
318807722    2
318807783    2
318848835    2
318848862    2
318854104    2
318854122    2
322427464    2
31505271     2
322435603    2
322440372    2
322456912    2
322472184    2


We do. Let's drop them.

In [23]:
research_payments.drop_duplicates("record_id", inplace=True)
research_payments.groupby("record_id")["record_id"].count().sort_values(ascending=False)

record_id
4806114      1
211613342    1
211635274    1
211633546    1
211633530    1
211613526    1
211613502    1
211613496    1
211613494    1
211613484    1
211613442    1
211613340    1
211640124    1
211601176    1
211601108    1
211601104    1
211601100    1
211601094    1
211601092    1
211601070    1
211601066    1
211601064    1
211635316    1
211640132    1
211655088    1
211640304    1
211640362    1
211640360    1
211640350    1
211640348    1
211640344    1
211640336    1
211640330    1
211640320    1
211640318    1
211640288    1
211640136    1
211640284    1
211640272    1
211640246    1
211640202    1
211640178    1
211640172    1
211640152    1
211640148    1
211640140    1
211601060    1
211601057    1
211601054    1
208673472    1
208678374    1
208678370    1
208678366    1
208678362    1
208678360    1
208678356    1
208678352    1
208675020    1
208673644    1
208672902    1
211601052    1
208668830    1
208664100    1
208663858    1
208663610    1
208663338    1


What is the total value of research payments to doctors serving on Medicaid drug utilization boards?

In [25]:
research_payments["amount"].sum()

11010560.920000006

Export the dataframes to Excel file for further analysis.

In [26]:
writer = pd.ExcelWriter("data/doc_payments.xlsx")
general_payments.to_excel(writer, "general_payments", index=False)
ownership_payments.to_excel(writer, "ownership_payments", index=False)
research_payments.to_excel(writer, "research_payments", index=False)
writer.save()