In [1]:
import json
import numpy as np
import pandas as pd
import psycopg2
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 500)
pd.set_option("display.float_format", lambda x: "%.2f" % x) # Suppress scientific notation for float data type

## Which drugs are states buying most frequently?

First, we need to determine which drugs appear to be outliers as measured by the number of drugs purchased in each state. To get started, query the Medicaid API to return the drug name, the state that bought the drugs and the total number of drugs purchased in 2016.

In [2]:
query = "https://data.medicaid.gov/resource/neai-csgh.json?$select=state_code,product_fda_list_name,sum(units_reimbursed)&$where=suppression_used=False%20and%20not%20state_code='XX'&$group=state_code,product_fda_list_name&$limit=4625479&$$app_token=v3AK8nRjxbWjtmIBGHJ9OmMlb"
units = pd.read_json(query)
units.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95185 entries, 0 to 95184
Data columns (total 3 columns):
product_fda_list_name    95183 non-null object
state_code               95185 non-null object
sum_units_reimbursed     95185 non-null float64
dtypes: float64(1), object(2)
memory usage: 2.9+ MB


In [3]:
units.head()

Unnamed: 0,product_fda_list_name,state_code,sum_units_reimbursed
0,ZINC OXIDE,KY,417530.9
1,RAVICTI,TN,15775.0
2,BICILLIN L,IN,1895.51
3,Tramadol H,WA,37041.0
4,NAPROXEN 3,NV,8678.0


Some of our drug names are fully capitalized. Others are not. Since we'll eventually group on that column, we need to standardize that.

In [4]:
units["product_fda_list_name"] = units["product_fda_list_name"].str.upper()
units.head()

Unnamed: 0,product_fda_list_name,state_code,sum_units_reimbursed
0,ZINC OXIDE,KY,417530.9
1,RAVICTI,TN,15775.0
2,BICILLIN L,IN,1895.51
3,TRAMADOL H,WA,37041.0
4,NAPROXEN 3,NV,8678.0


Rank the drugs by their units reimbursed within each state.

In [5]:
units["rank"] = units.groupby("state_code")["sum_units_reimbursed"].rank(method="min", ascending=False).astype(int)
units.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95185 entries, 0 to 95184
Data columns (total 4 columns):
product_fda_list_name    95183 non-null object
state_code               95185 non-null object
sum_units_reimbursed     95185 non-null float64
rank                     95185 non-null int64
dtypes: float64(1), int64(1), object(2)
memory usage: 6.1+ MB


Create a new dataframe with the top 10 drugs in each state.

In [6]:
top_10_units = units[units["rank"] <= 10]
top_10_units.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 510 entries, 73 to 95170
Data columns (total 4 columns):
product_fda_list_name    510 non-null object
state_code               510 non-null object
sum_units_reimbursed     510 non-null float64
rank                     510 non-null int64
dtypes: float64(1), int64(1), object(2)
memory usage: 19.9+ KB


How many times does each drug appear in a state's top-10 list?

In [7]:
counts_units = top_10_units["product_fda_list_name"].value_counts().reset_index() # Create new dataframe of drug counts
counts_units.columns = ["product_fda_list_name", "count"] # Rename columns
counts_units.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62 entries, 0 to 61
Data columns (total 2 columns):
product_fda_list_name    62 non-null object
count                    62 non-null int64
dtypes: int64(1), object(1)
memory usage: 1.0+ KB


Merge the dataframes into a single dataframe with both ranks and counts.

In [8]:
top_10_units = top_10_units.merge(counts_units, how="inner", on="product_fda_list_name")
top_10_units.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 510 entries, 0 to 509
Data columns (total 5 columns):
product_fda_list_name    510 non-null object
state_code               510 non-null object
sum_units_reimbursed     510 non-null float64
rank                     510 non-null int64
count                    510 non-null int64
dtypes: float64(1), int64(2), object(2)
memory usage: 23.9+ KB


Which drugs appear in only a single state's top-10 list?

In [9]:
outliers_units = top_10_units[top_10_units["count"] == 1]
outliers_units.sort_values("product_fda_list_name", ascending=True)

Unnamed: 0,product_fda_list_name,state_code,sum_units_reimbursed,rank,count
224,ADVATE 5ML,NV,8649028.0,8,1
500,ALPRAZOLAM,MO,9809834.31,10,1
430,AMLODIPINE,DC,2355515.13,10,1
485,AMMONIUM L,NY,61596452.85,8,1
444,BROMFED DM,TX,62004362.99,7,1
493,BUPROPION,VT,1363162.5,9,1
497,CHILDREN I,TX,77692355.83,6,1
349,CLONAZEPAM,RI,2107665.0,9,1
501,DEXTROAMP-,MA,14595305.5,9,1
486,DEXTROSE 5,WV,12566525.0,8,1


Export the outliers data as an Excel file.

In [10]:
outliers_units.to_excel("data/outliers_units.xlsx")

## Which drugs are states spending the most money on?

So far, we've determined which drugs appear to be outliers as measured by the number of drugs purchased in each state. We now need to determine which drugs appear to be outliers as measured by the total amount of money spent on the drugs. To get started, query the Medicaid API to return the drug name, the state that bought the drugs and the total amount reimbursed for drugs purchased in 2016.

In [11]:
query = "https://data.medicaid.gov/resource/neai-csgh.json?$select=state_code,product_fda_list_name,sum(total_amount_reimbursed)&$where=suppression_used=False%20and%20not%20state_code='XX'&$group=state_code,product_fda_list_name&$limit=4625479&$$app_token=v3AK8nRjxbWjtmIBGHJ9OmMlb"
amount = pd.read_json(query)
amount.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95185 entries, 0 to 95184
Data columns (total 3 columns):
product_fda_list_name          95183 non-null object
state_code                     95185 non-null object
sum_total_amount_reimbursed    95185 non-null float64
dtypes: float64(1), object(2)
memory usage: 2.9+ MB


In [12]:
amount.head()

Unnamed: 0,product_fda_list_name,state_code,sum_total_amount_reimbursed
0,ZINC OXIDE,KY,44152.93
1,RAVICTI,TN,2396199.88
2,BICILLIN L,IN,48881.32
3,Tramadol H,WA,15972.25
4,NAPROXEN 3,NV,1353.14


Some of our drug names are fully capitalized. Others are not. Since we'll eventually group on that column, we need to standardize that.

In [13]:
amount["product_fda_list_name"] = amount["product_fda_list_name"].str.upper()
amount.head()

Unnamed: 0,product_fda_list_name,state_code,sum_total_amount_reimbursed
0,ZINC OXIDE,KY,44152.93
1,RAVICTI,TN,2396199.88
2,BICILLIN L,IN,48881.32
3,TRAMADOL H,WA,15972.25
4,NAPROXEN 3,NV,1353.14


Rank the drugs by their amount reimbursed within each state.

In [14]:
amount["rank"] = amount.groupby("state_code")["sum_total_amount_reimbursed"].rank(method="min", ascending=False).astype(int)
amount.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95185 entries, 0 to 95184
Data columns (total 4 columns):
product_fda_list_name          95183 non-null object
state_code                     95185 non-null object
sum_total_amount_reimbursed    95185 non-null float64
rank                           95185 non-null int64
dtypes: float64(1), int64(1), object(2)
memory usage: 6.1+ MB


Create a new dataframe with the top 10 drugs in each state.

In [15]:
top_10_amount = amount[amount["rank"] <= 10]
top_10_amount.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 510 entries, 171 to 94892
Data columns (total 4 columns):
product_fda_list_name          510 non-null object
state_code                     510 non-null object
sum_total_amount_reimbursed    510 non-null float64
rank                           510 non-null int64
dtypes: float64(1), int64(1), object(2)
memory usage: 19.9+ KB


How many times does each drug appear in a state's top-10 list?

In [16]:
counts_amount = top_10_amount["product_fda_list_name"].value_counts().reset_index() # Create new dataframe of drug counts
counts_amount.columns = ["product_fda_list_name", "count"] # Rename columns
counts_amount.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110 entries, 0 to 109
Data columns (total 2 columns):
product_fda_list_name    110 non-null object
count                    110 non-null int64
dtypes: int64(1), object(1)
memory usage: 1.8+ KB


Merge the dataframes into a single dataframe with both ranks and counts.

In [17]:
top_10_amount = top_10_amount.merge(counts_amount, how="inner", on="product_fda_list_name")
top_10_amount.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 510 entries, 0 to 509
Data columns (total 5 columns):
product_fda_list_name          510 non-null object
state_code                     510 non-null object
sum_total_amount_reimbursed    510 non-null float64
rank                           510 non-null int64
count                          510 non-null int64
dtypes: float64(1), int64(2), object(2)
memory usage: 23.9+ KB


Which drugs appear in only a single state's top-10 list?

In [18]:
outliers_amount = top_10_amount[top_10_amount["count"] == 1]
outliers_amount.sort_values("product_fda_list_name", ascending=True)

Unnamed: 0,product_fda_list_name,state_code,sum_total_amount_reimbursed,rank,count
411,ABILIFY 10,MT,2534711.31,7,1
366,ADVAIR HFA,ME,3750001.53,9,1
212,ADVATE 5ML,NV,12833381.3,3,1
404,ARANESP (D,SD,1961736.07,9,1
501,COMPLERA,NJ,20757871.33,10,1
417,COMPLERA T,DC,5734734.51,9,1
34,DEXAMETHAS,VA,11047921.43,10,1
503,DEXTROAMP-,MA,24321167.39,6,1
418,DULERA INH,AL,12122809.43,6,1
499,EPCLUSA,NH,2185523.57,7,1


Export the outliers data as an Excel file.

In [19]:
outliers_amount.to_excel("data/outliers_amount.xlsx")

## How many doctors on Medicaid drug utilization boards have received payments from drug manufacturers?

We will now use Open Payments data to determine the extent of payments from 2013 through 2016 from drug manufacturers to doctors serving on Medicaid drug utilization boards between the start of 2016 and March of 2018.

First, connect to the database.

In [20]:
with open("config.json") as f:
    conf = json.load(f)

In [21]:
conn_str = "host={} dbname={} user={} password={}".format(conf["host"], conf["database"], conf["user"], conf["password"])

In [22]:
conn = psycopg2.connect(conn_str)

Query the database to return the doctors who received any general purpose payments from drug manufacturers.

In [23]:
general_payments = pd.read_sql("""SELECT dur_committee_members.*,
       open_payments_general.change_type AS change_type,
       open_payments_general.physician_profile_id AS physician_id,
       open_payments_general.physician_first_name AS first,
       open_payments_general.physician_middle_name AS middle,
       open_payments_general.physician_last_name AS last,
       open_payments_general.recipient_city AS city,
       open_payments_general.recipient_state AS state,
       open_payments_general.physician_specialty AS specialty,
       open_payments_general.applicable_manufacturer_or_applicable_gpo_making_payment_id AS manufacturer_id,
       open_payments_general.applicable_manufacturer_or_applicable_gpo_making_payment_name AS manufacturer_name,
       open_payments_general.total_amount_of_payment_usdollars AS amount,
       open_payments_general.date_of_payment AS payment_date,
       open_payments_general.form_of_payment_or_transfer_of_value AS payment_form,
       open_payments_general.nature_of_payment_or_transfer_of_value AS nature_of_payment,
       open_payments_general.record_ID AS record_id,
       open_payments_general.dispute_status_for_publication AS dispute_status
FROM dur_committee_members
INNER JOIN open_payments_general ON UPPER(dur_committee_members.first_name) = UPPER(open_payments_general.physician_first_name)
AND UPPER(dur_committee_members.last_name) = UPPER(open_payments_general.physician_last_name)
AND UPPER(dur_committee_members.state) = UPPER(open_payments_general.recipient_state)""",
                           con=conn)

In [24]:
general_payments["amount"] = pd.to_numeric(general_payments["amount"], errors="raise") # Convert the amount column to float data type
general_payments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17509 entries, 0 to 17508
Data columns (total 33 columns):
first_name             17509 non-null object
last_name              17509 non-null object
middle_initial         2796 non-null object
degree_1               15321 non-null object
degree_2               1593 non-null object
state                  17509 non-null object
dur_pt_other           17509 non-null object
info_date              17509 non-null object
date_minutes_or_web    17509 non-null object
industry_rep           13 non-null object
location               5514 non-null object
specialty              1636 non-null object
committee_alt_name     3664 non-null object
notes                  5500 non-null object
disclosure_received    2745 non-null object
conflict_disclosed     799 non-null object
conflict_details       799 non-null object
change_type            17509 non-null object
physician_id           17509 non-null object
first                  17509 non-null object
middl

Do we have duplicates in the data?

In [25]:
general_payments.groupby("record_id")["record_id"].count().sort_values(ascending=False)

record_id
325991674    2
137834354    2
216452820    2
216452826    2
216452830    2
216452834    2
380096196    2
380096208    2
246095420    2
246095418    2
104616676    2
149553884    2
137827356    2
380096188    2
369928800    2
340912492    2
104790966    2
340912490    2
281944736    2
337396746    2
202279132    2
137689458    2
137689144    2
137689130    2
216452814    2
187906294    2
330599479    2
166669506    2
415262155    2
201767802    2
206854068    2
187780858    2
156358632    2
156358630    2
166669518    2
294940972    2
395501984    2
166669512    2
166669498    2
294771614    2
166669492    2
166669486    2
166669478    2
166669472    2
166669464    2
294833024    2
166669456    2
166669450    2
166669445    2
166669437    2
137689128    2
137688468    2
391060074    2
137674342    2
137686916    2
137685728    2
391406890    2
137685726    2
137684458    2
137683842    2
394606924    2
137683840    2
137682976    2
137674344    2
137674340    2
137688466    2


We do. Let's drop them.

In [26]:
general_payments.drop_duplicates("record_id", inplace=True)
general_payments.groupby("record_id")["record_id"].count().sort_values(ascending=False)

record_id
9882196      1
207792548    1
208144906    1
208138416    1
208138414    1
208138412    1
207997884    1
207997882    1
207997878    1
207918982    1
207918980    1
207792566    1
207792562    1
207792560    1
207792556    1
207792552    1
207792546    1
208149914    1
207792542    1
207792538    1
207792532    1
207371980    1
207366898    1
207365098    1
207363190    1
207362752    1
207356232    1
207353240    1
207350178    1
207344728    1
207344180    1
207342926    1
208149912    1
208149916    1
208237622    1
208208560    1
208231912    1
208224374    1
208224372    1
208224370    1
208223552    1
208223550    1
208223548    1
208223546    1
208223544    1
208221682    1
208212522    1
208211756    1
208209986    1
208209506    1
208201196    1
208149918    1
208195510    1
208192024    1
208191932    1
208189204    1
208183058    1
208179934    1
208176238    1
208167540    1
208167538    1
208153978    1
208153976    1
208153788    1
208150812    1
208149920    1


What is the total value of general purpose payments to doctors serving on Medicaid drug utilization boards?

In [27]:
general_payments["amount"].sum()

2156103.760000009

Query the database to return the doctors who held an ownership or investment interest in any drug manufacturers.

In [28]:
ownership_payments = pd.read_sql("""SELECT dur_committee_members.*,
       open_payments_ownership.change_type AS change_type,
       open_payments_ownership.physician_profile_id AS physician_id,
       open_payments_ownership.physician_first_name AS first,
       open_payments_ownership.physician_middle_name AS middle,
       open_payments_ownership.physician_last_name AS last,
       open_payments_ownership.recipient_city AS city,
       open_payments_ownership.recipient_state AS state,
       open_payments_ownership.physician_specialty AS specialty,
       open_payments_ownership.total_amount_invested_usdollars AS total_amount_invested,
       open_payments_ownership.value_of_interest AS value_of_interest,
       open_payments_ownership.terms_of_interest AS terms_of_interest,
       open_payments_ownership.applicable_manufacturer_or_applicable_gpo_making_payment_id AS manufacturer_id,
       open_payments_ownership.applicable_manufacturer_or_applicable_gpo_making_payment_name AS manufacturer_name,
       open_payments_ownership.interest_held_by_physician_or_an_immediate_family_member AS nature_of_interest,
       open_payments_ownership.record_id AS record_id,
       open_payments_ownership.dispute_status_for_publication AS dispute_status
FROM dur_committee_members
INNER JOIN open_payments_ownership ON UPPER(dur_committee_members.first_name) = UPPER(open_payments_ownership.physician_first_name)
AND UPPER(dur_committee_members.last_name) = UPPER(open_payments_ownership.physician_last_name)
AND UPPER(dur_committee_members.state) = UPPER(open_payments_ownership.recipient_state)""",
                           con=conn)

In [29]:
# Convert the investment amount columns to float data type
ownership_payments["total_amount_invested"] = pd.to_numeric(ownership_payments["total_amount_invested"], errors="raise")
ownership_payments["value_of_interest"] = pd.to_numeric(ownership_payments["value_of_interest"], errors="raise")
ownership_payments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 33 columns):
first_name               2 non-null object
last_name                2 non-null object
middle_initial           0 non-null object
degree_1                 2 non-null object
degree_2                 0 non-null object
state                    2 non-null object
dur_pt_other             2 non-null object
info_date                2 non-null object
date_minutes_or_web      2 non-null object
industry_rep             0 non-null object
location                 0 non-null object
specialty                1 non-null object
committee_alt_name       0 non-null object
notes                    1 non-null object
disclosure_received      0 non-null object
conflict_disclosed       0 non-null object
conflict_details         0 non-null object
change_type              2 non-null object
physician_id             2 non-null object
first                    2 non-null object
middle                   1 non-null obj

Do we have duplicates in the data?

In [30]:
ownership_payments.groupby("record_id")["record_id"].count().sort_values(ascending=False)

record_id
413359278    1
152335554    1
Name: record_id, dtype: int64

In [31]:
ownership_payments

Unnamed: 0,first_name,last_name,middle_initial,degree_1,degree_2,state,dur_pt_other,info_date,date_minutes_or_web,industry_rep,location,specialty,committee_alt_name,notes,disclosure_received,conflict_disclosed,conflict_details,change_type,physician_id,first,middle,last,city,state.1,specialty.1,total_amount_invested,value_of_interest,terms_of_interest,manufacturer_id,manufacturer_name,nature_of_interest,record_id,dispute_status
0,Andreas,Wali,,MD,,PA,P&T,3/1/18,From Web,,,cardiologist,,effective Oct 11 2017,,,,UNCHANGED,29544,Andreas,U,Wali,Camp Hill,PA,Allopathic & Osteopathic Physicians|Internal M...,19086.85,10607.38,Stock ownership,100000000133,TriReme Medical LLC,Physician Covered Recipient,413359278,No
1,Sharon,Weinstein,,MD,,UT,DUR,12/8/16,From Minutes,,,,,,,,,UNCHANGED,173760,Sharon,,Weinstein,Salt Lake City,UT,Allopathic & Osteopathic Physicians|Psychiatry...,0.0,0.0,,100000005587,"DARA Biosciences, Inc.",Physician Covered Recipient,152335554,No


What is the total value of the ownership or investment interest of doctors serving on Medicaid drug utilization boards?

In [32]:
ownership_payments["total_amount_invested"].sum()

19086.85

Query the database to return the doctors who received any research payments from drug manufacturers.

In [33]:
research_payments = pd.read_sql("""SELECT dur_committee_members.*,
       open_payments_research.change_type AS change_type,
       open_payments_research.teaching_hospital_name AS hospital,
       open_payments_research.physician_profile_id AS physician_id,
       open_payments_research.physician_first_name AS first,
       open_payments_research.physician_middle_name AS middle,
       open_payments_research.physician_last_name AS last,
       open_payments_research.recipient_city AS city,
       open_payments_research.recipient_state AS state,
       open_payments_research.physician_specialty AS specialty,
       open_payments_research.applicable_manufacturer_or_applicable_gpo_making_payment_id AS manufacturer_id,
       open_payments_research.applicable_manufacturer_or_applicable_gpo_making_payment_name AS manufacturer_name,
       open_payments_research.total_amount_of_payment_usdollars AS amount,
       open_payments_research.date_of_payment AS payment_date,
       open_payments_research.form_of_payment_or_transfer_of_value AS payment_form,
       principal_investigator_1_first_name,
       principal_investigator_1_middle_name,
       principal_investigator_1_last_name,
       principal_investigator_1_city,
       principal_investigator_1_state,
       principal_investigator_1_specialty,
       principal_investigator_2_first_name,
       principal_investigator_2_middle_name,
       principal_investigator_2_last_name,
       principal_investigator_2_city,
       principal_investigator_2_state,
       principal_investigator_2_specialty,
       principal_investigator_3_first_name,
       principal_investigator_3_middle_name,
       principal_investigator_3_last_name,
       principal_investigator_3_city,
       principal_investigator_3_state,
       principal_investigator_3_specialty,
       principal_investigator_4_first_name,
       principal_investigator_4_middle_name,
       principal_investigator_4_last_name,
       principal_investigator_4_city,
       principal_investigator_4_state,
       principal_investigator_4_specialty,
       principal_investigator_5_first_name,
       principal_investigator_5_middle_name,
       principal_investigator_5_last_name,
       principal_investigator_5_city,
       principal_investigator_5_state,
       principal_investigator_5_specialty,
       open_payments_research.expenditure_category1 AS expenditure_category_1,
       open_payments_research.expenditure_category2 AS expenditure_category_2,
       open_payments_research.expenditure_category3 AS expenditure_category_3,
       open_payments_research.expenditure_category4 AS expenditure_category_4,
       open_payments_research.expenditure_category5 AS expenditure_category_5,
       open_payments_research.expenditure_category6 AS expenditure_category_6,
       open_payments_research.dispute_status_for_publication AS dispute_status,
       open_payments_research.record_id AS record_id
FROM dur_committee_members
INNER JOIN open_payments_research ON UPPER(dur_committee_members.first_name) = UPPER(open_payments_research.physician_first_name)
AND UPPER(dur_committee_members.last_name) = UPPER(open_payments_research.physician_last_name)
AND UPPER(dur_committee_members.STATE) = UPPER(open_payments_research.recipient_state)
OR UPPER(dur_committee_members.first_name) = UPPER(open_payments_research.principal_investigator_1_first_name)
AND UPPER(dur_committee_members.last_name) = UPPER(open_payments_research.principal_investigator_1_last_name)
AND UPPER(dur_committee_members.STATE) = UPPER(open_payments_research.principal_investigator_1_state)
OR UPPER(dur_committee_members.first_name) = UPPER(open_payments_research.principal_investigator_2_first_name)
AND UPPER(dur_committee_members.last_name) = UPPER(open_payments_research.principal_investigator_2_last_name)
AND UPPER(dur_committee_members.STATE) = UPPER(open_payments_research.principal_investigator_2_state)
OR UPPER(dur_committee_members.first_name) = UPPER(open_payments_research.principal_investigator_3_first_name)
AND UPPER(dur_committee_members.last_name) = UPPER(open_payments_research.principal_investigator_3_last_name)
AND UPPER(dur_committee_members.STATE) = UPPER(open_payments_research.principal_investigator_3_state)
OR UPPER(dur_committee_members.first_name) = UPPER(open_payments_research.principal_investigator_4_first_name)
AND UPPER(dur_committee_members.last_name) = UPPER(open_payments_research.principal_investigator_4_last_name)
AND UPPER(dur_committee_members.STATE) = UPPER(open_payments_research.principal_investigator_4_state)
OR UPPER(dur_committee_members.first_name) = UPPER(open_payments_research.principal_investigator_5_first_name)
AND UPPER(dur_committee_members.last_name) = UPPER(open_payments_research.principal_investigator_5_last_name)
AND UPPER(dur_committee_members.STATE) = UPPER(open_payments_research.principal_investigator_5_state)""",
                           con=conn)

In [34]:
research_payments["amount"] = pd.to_numeric(research_payments["amount"], errors="raise") # Convert the amount column to float data type
research_payments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4218 entries, 0 to 4217
Data columns (total 69 columns):
first_name                              4218 non-null object
last_name                               4218 non-null object
middle_initial                          52 non-null object
degree_1                                4195 non-null object
degree_2                                48 non-null object
state                                   4218 non-null object
dur_pt_other                            4218 non-null object
info_date                               4218 non-null object
date_minutes_or_web                     4218 non-null object
industry_rep                            0 non-null object
location                                627 non-null object
specialty                               848 non-null object
committee_alt_name                      559 non-null object
notes                                   1010 non-null object
disclosure_received                     55 non-nu

Do we have duplicates in the data?

In [35]:
research_payments.groupby("record_id")["record_id"].count().sort_values(ascending=False)

record_id
4806114      2
297139802    2
307928732    2
307928734    2
307928736    2
307928738    2
307928740    2
307928742    2
307928744    2
307928746    2
307928748    2
307928750    2
307928752    2
307928754    2
307928756    2
307928758    2
307928760    2
307928762    2
307928764    2
307928766    2
307928768    2
307928770    2
307928772    2
307928730    2
307928728    2
307928726    2
307928702    2
306961858    2
306962320    2
417010363    2
306971188    2
306971200    2
306971212    2
306971898    2
306973439    2
307928700    2
307928704    2
307928724    2
307928706    2
307928708    2
307928710    2
307928712    2
307928714    2
307928716    2
307928718    2
307928720    2
307928722    2
307928774    2
307928776    2
307928778    2
318854122    2
31505273     2
31505278     2
31505279     2
318728403    2
318807722    2
318807783    2
318848835    2
318848862    2
318854104    2
322417522    2
31505269     2
322427464    2
322435603    2
322440372    2
322456912    2


We do. Let's drop them.

In [36]:
research_payments.drop_duplicates("record_id", inplace=True)
research_payments.groupby("record_id")["record_id"].count().sort_values(ascending=False)

record_id
4806114      1
211601176    1
211633530    1
211613526    1
211613502    1
211613496    1
211613494    1
211613484    1
211613442    1
211613342    1
211613340    1
211601108    1
211635274    1
211601104    1
211601100    1
211601094    1
211601092    1
211601070    1
211601066    1
211601064    1
211601060    1
211601057    1
211633546    1
211635316    1
211640362    1
211640284    1
211640350    1
211640348    1
211640344    1
211640336    1
211640330    1
211640320    1
211640318    1
211640304    1
211640288    1
211640272    1
211640124    1
211640246    1
211640202    1
211640178    1
211640172    1
211640152    1
211640148    1
211640140    1
211640136    1
211640132    1
211601054    1
211601052    1
211601028    1
208664100    1
208678366    1
208678362    1
208678360    1
208678356    1
208678352    1
208673644    1
208673472    1
208672902    1
208668830    1
208663858    1
211600944    1
208663610    1
208663338    1
208663322    1
208662656    1
208662630    1


In [37]:
research_payments

Unnamed: 0,first_name,last_name,middle_initial,degree_1,degree_2,state,dur_pt_other,info_date,date_minutes_or_web,industry_rep,location,specialty,committee_alt_name,notes,disclosure_received,conflict_disclosed,conflict_details,change_type,hospital,physician_id,first,middle,last,city,state.1,...,principal_investigator_3_middle_name,principal_investigator_3_last_name,principal_investigator_3_city,principal_investigator_3_state,principal_investigator_3_specialty,principal_investigator_4_first_name,principal_investigator_4_middle_name,principal_investigator_4_last_name,principal_investigator_4_city,principal_investigator_4_state,principal_investigator_4_specialty,principal_investigator_5_first_name,principal_investigator_5_middle_name,principal_investigator_5_last_name,principal_investigator_5_city,principal_investigator_5_state,principal_investigator_5_specialty,expenditure_category_1,expenditure_category_2,expenditure_category_3,expenditure_category_4,expenditure_category_5,expenditure_category_6,dispute_status,record_id
0,Frank,Pettyjohn,,,,AL,DUR,10/26/16,From Minutes,,,,,,,,,UNCHANGED,,230130,FRANK,S,PETTYJOHN,MOBILE,AL,...,,,,,,,,,,,,,,,,,,,,,,,,No,362290690
1,Frank,Pettyjohn,,,,AL,DUR,10/26/16,From Minutes,,,,,,,,,UNCHANGED,,230130,FRANK,S,PETTYJOHN,MOBILE,AL,...,,,,,,,,,,,,,,,,,,,,,,,,No,362290692
2,Frank,Pettyjohn,,,,AL,DUR,10/26/16,From Minutes,,,,,,,,,UNCHANGED,,230130,FRANK,S,PETTYJOHN,MOBILE,AL,...,,,,,,,,,,,,,,,,,,,,,,,,No,362290694
3,Frank,Pettyjohn,,,,AL,DUR,10/26/16,From Minutes,,,,,,,,,UNCHANGED,,230130,FRANK,S,PETTYJOHN,MOBILE,AL,...,,,,,,,,,,,,,,,,,,,,,,,,No,362290696
4,Frank,Pettyjohn,,,,AL,DUR,10/26/16,From Minutes,,,,,,,,,UNCHANGED,,230130,FRANK,S,PETTYJOHN,MOBILE,AL,...,,,,,,,,,,,,,,,,,,,,,,,,No,362290698
5,Frank,Pettyjohn,,,,AL,DUR,10/26/16,From Minutes,,,,,,,,,UNCHANGED,,230130,FRANK,S,PETTYJOHN,MOBILE,AL,...,,,,,,,,,,,,,,,,,,,,,,,,No,362290700
6,Frank,Pettyjohn,,,,AL,DUR,10/26/16,From Minutes,,,,,,,,,UNCHANGED,,230130,FRANK,S,PETTYJOHN,MOBILE,AL,...,,,,,,,,,,,,,,,,,,,,,,,,No,362290702
7,Frank,Pettyjohn,,,,AL,DUR,10/26/16,From Minutes,,,,,,,,,UNCHANGED,,,,,,MOBILE,AL,...,,,,,,,,,,,,,,,,,,,,,,,,No,212935554
8,Frank,Pettyjohn,,,,AL,DUR,10/26/16,From Minutes,,,,,,,,,UNCHANGED,,,,,,MOBILE,AL,...,,,,,,,,,,,,,,,,,,,,,,,,No,24457374
9,Denyse,Thornley-Brown,,MD,,AL,DUR,10/26/16,From Minutes,,,,,,,,,UNCHANGED,,870258,DENYSE,P,THORNLEY-BROWN,BIRMINGHAM,AL,...,,,,,,,,,,,,,,,,,,,,,,,,No,392936230


What is the total value of research payments to doctors serving on Medicaid drug utilization boards?

In [38]:
research_payments["amount"].sum()

10970408.01000001

Export the dataframes to Excel file for further analysis.

In [39]:
writer = pd.ExcelWriter("data/doc_payments.xlsx")
general_payments.to_excel(writer, "general_payments", index=False)
ownership_payments.to_excel(writer, "ownership_payments", index=False)
research_payments.to_excel(writer, "research_payments", index=False)
writer.save()