# Libraries

In [2]:
import requests

import numpy as np
import pandas as pd

import datetime as dt

import matplotlib.pyplot as plt
import seaborn as sns

from ast import literal_eval

import time
import datetime
import matplotlib.pyplot as plt
import matplotlib.dates as mdate

from linearmodels import PooledOLS
import statsmodels.api as sm

pd.set_option('display.max_columns', None) # display all columns in DF

# Loading Data

In [3]:
df_reg_org = pd.read_csv('raw_consolidation_data.csv', sep = '~')

In [4]:
df_reg_org.head(2)

Unnamed: 0,date,unique_id,marketing_category,brand_name,generic_name,labeler_name,pharm_class,is_original_packager,units_reimbursed,total_amount_reimbursed_adj,price_per_unit_adj
0,1991-01-01,acetaminophen-tablet-oral-300 mg/1,ANDA,Acetaminophen and Codeine Phosphate,ACETAMINOPHEN,"Teva Pharmaceuticals USA, Inc.","['Full Opioid Agonists [MoA]', 'Opioid Agonist...",True,2599944.0,1554268.0,0.604724
1,1991-01-01,acetaminophen-tablet-oral-325 mg/1,ANDA,"butalbital, acetaminophen and caffeine",ACETAMINOPHEN,"Mikart, LLC","['Barbiturate [EPC]', 'Barbiturates [CS]', 'Ce...",True,276227.6,178515.1,0.646261


# Data manipulation

Removing labeller that are original packager

In [5]:
df_reg = df_reg_org.loc[df_reg_org['is_original_packager'] == True]

In [6]:
print(len(df_reg))

551764


### Assigning order of entrance for each unique drug

In [7]:
df_temp = df_reg.sort_values(by = ['unique_id', 'date', 'marketing_category'], ascending = [False, True, False], ignore_index = True) # Sorting

df_temp = df_temp.drop_duplicates(subset = ['unique_id', 'labeler_name'], keep = 'first') # Finding the first entrance for each labeller 

df_temp['labeler_name_count'] = df_temp.groupby((df_temp['unique_id'] != df_temp['unique_id'].shift(1)).cumsum()).cumcount() + 1 # Running count for each drung

df_reg = pd.merge(df_temp[['unique_id', 'labeler_name', 'labeler_name_count']], df_reg, on = ['unique_id', 'labeler_name'], how = 'right') # Merging with org. data

### Assigning a dummy for first entrance and NDA

In [8]:
df_temp = df_reg.loc[(df_reg['labeler_name_count'] == 1) & (df_reg['marketing_category'] == 'NDA')] # Filtering

df_temp = df_temp.sort_values(by = ['date', 'unique_id'], ascending = [True, False], ignore_index = True) # Sorting
df_temp = df_temp.drop_duplicates(subset = ['unique_id'], keep = 'first') # Removing dublicates

df_temp['first_nda'] = 1 # Adding dummy

df_reg = pd.merge(df_temp[['unique_id', 'first_nda']], df_reg, on = ['unique_id'], how = 'right') # Merging with reg. data

df_reg['first_nda'] = df_reg['first_nda'].fillna(0)

Removing unique drug where the first entrance is not a NDA

In [9]:
df_reg = df_reg.loc[(df_reg['first_nda'] == 1)]

### Assigning a running count for each unique drug from the start

In [10]:
df_temp = df_reg[df_reg['labeler_name_count'] == 1]

df_temp = df_temp.drop_duplicates(subset = ['date', 'unique_id'], keep = 'first') # Removing dublicates

df_temp['running_count_from_start'] = df_temp.groupby((df_temp['unique_id'] != df_temp['unique_id'].shift(1)).cumsum()).cumcount() + 1 # Running count for each drung

df_reg = pd.merge(df_temp[['date', 'unique_id', 'running_count_from_start']], df_reg, on = ['date', 'unique_id',], how = 'right')

### Assigning a running count for each unique drug from second entrance

In [11]:
df_temp = df_reg[df_reg['labeler_name_count'] >= 2]

df_temp = df_temp.drop_duplicates(subset = ['date', 'unique_id'], keep = 'first') # Removing dublicates

df_temp['running_count_from_second_entrance'] = df_temp.groupby((df_temp['unique_id'] != df_temp['unique_id'].shift(1)).cumsum()).cumcount() + 1 # Running count for each drung

df_reg = pd.merge(df_temp[['date', 'unique_id', 'running_count_from_second_entrance']], df_reg, on = ['date', 'unique_id',], how = 'right')

df_reg['running_count_from_second_entrance'] = df_reg['running_count_from_second_entrance'].fillna(0)

### Assigning a total count of the labelers at the start (to exclude unique drug with multiple labelers from the start)

In [12]:
df_temp = df_reg.loc[(df_reg['running_count_from_start'] == 1)]
          
df_temp = df_temp.groupby(['unique_id'])['first_nda'].sum()
df_temp = df_temp.reset_index()
df_temp = df_temp.rename(columns = {"first_nda": "number_of_first_entrance"})

df_temp = df_temp.loc[(df_temp['number_of_first_entrance'] > 1)]

df_reg = pd.merge(df_temp[['unique_id', 'number_of_first_entrance']], df_reg, on = ['unique_id'], how = 'right') # Merging with reg. data

df_reg['number_of_first_entrance'] = df_reg['number_of_first_entrance'].fillna(1)

Removing unique drug which has multiple entrances at start

In [13]:
df_reg = df_reg.loc[(df_reg['number_of_first_entrance'] == 1)]

### Assigning a running count of generic labelers

In [14]:
df_temp = df_reg[['date', 'unique_id', 'labeler_name', 'labeler_name_count']]

df_temp = df_temp.groupby(['date', 'unique_id'])

df_temp = df_temp.max()

df_temp = df_temp.reset_index()

df_temp['running_count_generics'] = df_temp['labeler_name_count'] - 1 # Minus by one because org. labeler do not count

df_reg = pd.merge(df_temp[['date', 'unique_id', 'running_count_generics']], df_reg, on = ['date', 'unique_id'], how = 'right') # Merging with org. data

### Assigning a running count of substitutes labelers

Unpacking pharm class to obtain EPC and MoA

In [15]:
df_temp = pd.DataFrame(df_reg[['unique_id', 'pharm_class']])

df_temp = df_temp.drop_duplicates(subset = ['unique_id', 'pharm_class'])

df_temp['pharm_class'] = df_temp['pharm_class'].apply(literal_eval)
df_temp = df_temp.explode('pharm_class')

df_temp['pharm_class_type'] = df_temp['pharm_class'].apply(lambda x: str(x)[-5:])
df_temp['pharm_class_type'] = df_temp['pharm_class_type'].str.replace(r'[][]', '', regex=True)

df_temp_EPC = df_temp[df_temp['pharm_class_type'] == 'EPC']
df_temp_MoA = df_temp[df_temp['pharm_class_type'] == 'MoA']

df_temp_EPC = df_temp_EPC.rename(columns = {"pharm_class": "pharm_class_EPC"})
df_temp_MoA = df_temp_MoA.rename(columns = {"pharm_class": "pharm_class_MoA"})

df_temp_EPC = df_temp_EPC.drop_duplicates(subset = ['unique_id', 'pharm_class_EPC'])
df_temp_MoA = df_temp_MoA.drop_duplicates(subset = ['unique_id', 'pharm_class_MoA'])

df_pharm_class_type = pd.merge(df_temp_EPC, df_temp_MoA,  on = 'unique_id', how = 'left')

Adding a route

In [16]:
df_openFDA_NDC = pd.read_csv('raw_openFDA_NDC_data.csv', sep = '~')

In [17]:
df_temp = df_openFDA_NDC.drop_duplicates(subset = ['unique_id'])
df_pharm_class_type = pd.merge(df_pharm_class_type, df_temp[['unique_id', 'route']],  on = 'unique_id', how = 'left')

Creating a unique id for substitutes

In [18]:
df_pharm_class_type['unique_substitute'] = df_pharm_class_type['pharm_class_EPC'].astype(str) + '-' + df_pharm_class_type['route'].astype(str) + '-' + df_pharm_class_type['pharm_class_MoA'].astype(str)
df_pharm_class_type = df_pharm_class_type[['unique_id', 'unique_substitute']]

df_pharm_class_type = df_pharm_class_type.drop_duplicates(subset = ['unique_id'])

Merging with org. data

In [19]:
df_reg = pd.merge(df_reg, df_pharm_class_type[['unique_id', 'unique_substitute']],  on = 'unique_id', how = 'outer')

Calculating number of substitutes

In [20]:
df_temp = df_reg.groupby(['date', 'unique_substitute'])['unique_id'].count()
df_temp = df_temp.reset_index()

df_temp['running_count_unique_substitute'] = df_temp['unique_id'] - 1

df_reg = pd.merge(df_reg, df_temp[['date', 'unique_substitute', 'running_count_unique_substitute']],  on = ['date', 'unique_substitute'], how = 'left')

### Calculating the quarterly total amount reimbursed and units reimbursed for all labelers

In [21]:
df_temp = df_reg.set_index(['date', 'unique_id']).groupby(level = ['date', 'unique_id'])[['units_reimbursed', 'total_amount_reimbursed_adj']].agg('sum')
df_temp = df_temp.reset_index()

df_temp = df_temp.rename(columns = {"units_reimbursed": "units_reimbursed_sum", "total_amount_reimbursed_adj": "total_amount_reimbursed_adj_sum"})

df_reg = pd.merge(df_temp[['date', 'unique_id', 'units_reimbursed_sum', 'total_amount_reimbursed_adj_sum']], df_reg, on = ['date', 'unique_id',], how = 'right')

### Calculating the generic labelers' share of quarterly total amount reimbursed and units reimbursed

In [22]:
df_temp = df_reg[df_reg['labeler_name_count'] >= 2]

df_temp = df_temp.set_index(['date', 'unique_id']).groupby(level = ['date', 'unique_id'])[['units_reimbursed', 'total_amount_reimbursed_adj']].agg('sum')
df_temp = df_temp.reset_index()

df_temp = df_temp.rename(columns = {"units_reimbursed": "generic_units_reimbursed_sum", "total_amount_reimbursed_adj": "generic_total_amount_reimbursed_adj_sum"})

df_reg = pd.merge(df_temp[['date', 'unique_id', 'generic_units_reimbursed_sum', 'generic_total_amount_reimbursed_adj_sum']], df_reg, on = ['date', 'unique_id',], how = 'right')

df_reg['generic_units_reimbursed_sum'] = df_reg['generic_units_reimbursed_sum'].fillna(0)
df_reg['generic_total_amount_reimbursed_adj_sum'] = df_reg['generic_total_amount_reimbursed_adj_sum'].fillna(0)

In [23]:
df_reg['generic_share_units_reimbursed_sum'] = df_reg['generic_units_reimbursed_sum'] / df_reg['units_reimbursed_sum']
df_reg['generic_share_total_amount_reimbursed_adj_sum'] = df_reg['generic_total_amount_reimbursed_adj_sum'] / df_reg['total_amount_reimbursed_adj_sum']

df_reg['generic_share_units_reimbursed_sum'] = df_reg['generic_share_units_reimbursed_sum'].fillna(0)
df_reg['generic_share_total_amount_reimbursed_adj_sum'] = df_reg['generic_share_total_amount_reimbursed_adj_sum'].fillna(0)

# Check for issue

In [24]:
print(len(df_reg))
df_reg_issue = df_reg.drop_duplicates(subset = ['unique_id', 'date', 'labeler_name'])
print(len(df_reg))

225135
225135


# Downloading Data

In [25]:
df_reg.to_csv('output_regression_org.csv', sep = '~', index = False)

# Loading Data

In [3]:
df_reg = pd.read_csv('output_regression_org.csv', sep = '~')

# Regression

In [4]:
df_reg.head(5)

Unnamed: 0,unique_id,number_of_first_entrance,date,running_labeler_name_count,first_nda,labeler_name,labeler_name_count,units_reimbursed_sum,total_amount_reimbursed_adj_sum,marketing_category,brand_name,generic_name,is_original_packager,units_reimbursed,total_amount_reimbursed_adj,price_per_unit_adj
0,acetic acid-irrigant-irrigation-.25 g/100ml,1.0,1991-01-01,1,1.0,B. Braun Medical Inc.,1,3858041.0,193466.638618,NDA,Acetic Acid,Acetic Acid,True,3858041.0,193466.638618,0.050146
1,acetic acid-irrigant-irrigation-.25 g/100ml,1.0,1991-04-01,2,1.0,B. Braun Medical Inc.,1,4326137.0,234952.298144,NDA,Acetic Acid,Acetic Acid,True,4326137.0,234952.298144,0.05431
2,acetic acid-irrigant-irrigation-.25 g/100ml,1.0,1991-07-01,3,1.0,B. Braun Medical Inc.,1,9909398.0,430801.344901,NDA,Acetic Acid,Acetic Acid,True,9909398.0,430801.344901,0.043474
3,acetic acid-irrigant-irrigation-.25 g/100ml,1.0,1991-10-01,4,1.0,B. Braun Medical Inc.,1,1740481.764,311411.174427,NDA,Acetic Acid,Acetic Acid,True,1740481.764,311411.174427,0.178922
4,acetic acid-irrigant-irrigation-.25 g/100ml,1.0,1992-01-01,5,1.0,B. Braun Medical Inc.,1,6773146.0,393679.63957,NDA,Acetic Acid,Acetic Acid,True,6773146.0,393679.63957,0.058124


In [5]:
df_reg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229525 entries, 0 to 229524
Data columns (total 16 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   unique_id                        229525 non-null  object 
 1   number_of_first_entrance         229525 non-null  float64
 2   date                             229525 non-null  object 
 3   running_labeler_name_count       229525 non-null  int64  
 4   first_nda                        229525 non-null  float64
 5   labeler_name                     229525 non-null  object 
 6   labeler_name_count               229525 non-null  int64  
 7   units_reimbursed_sum             229525 non-null  float64
 8   total_amount_reimbursed_adj_sum  229525 non-null  float64
 9   marketing_category               229525 non-null  object 
 10  brand_name                       229525 non-null  object 
 11  generic_name                     229525 non-null  object 
 12  is

In [6]:
df_temp = df_reg

Assign a dummy for second entrance

In [7]:
df_temp['second_entrance'] = df_temp['labeler_name_count'].apply(lambda x: 1 if x == 2 else 0)
df_temp['second_entrance'] = df_temp['second_entrance'].apply(pd.to_numeric)

Calculating the log of the sum of total amount reimbursed

In [8]:
df_temp['ln_total_amount_reimbursed_adj_sum'] = np.log(df_temp['total_amount_reimbursed_adj_sum'])

Change the format of the date to a integer

In [9]:
df_temp['date'] = df_temp['date'].str.replace('-', '')
df_temp['date'] = df_temp['date'].apply(pd.to_numeric)

In [10]:
df_OLS = df_temp.set_index(['unique_id', 'date'])

Estimating the regression

<p style="color:red">Possible issue: Should we remove unique drugs that do not have a second entrance?</p>

In [11]:
exog_vars = ['ln_total_amount_reimbursed_adj_sum']
exog = sm.add_constant(df_OLS[exog_vars])
endog = df_OLS['second_entrance']

mod = PooledOLS(endog, exog)
pooled_res = mod.fit()
print(pooled_res)

                          PooledOLS Estimation Summary                          
Dep. Variable:        second_entrance   R-squared:                        0.0005
Estimator:                  PooledOLS   R-squared (Between):             -0.2420
No. Observations:              229525   R-squared (Within):              -0.0015
Date:                Wed, Mar 09 2022   R-squared (Overall):              0.0005
Time:                        09:39:24   Log-likelihood                -7.596e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      119.34
Entities:                        3582   P-value                           0.0000
Avg Obs:                       64.077   Distribution:                F(1,229523)
Min Obs:                       1.0000                                           
Max Obs:                       612.00   F-statistic (robust):             119.34
                            

In [60]:
exog_vars = ['ln_total_amount_reimbursed_adj_sum', 'running_count_from_start']
exog = sm.add_constant(df_OLS[exog_vars])
endog = df_OLS['second_entrance']

mod = PooledOLS(endog, exog)
pooled_res = mod.fit()
print(pooled_res)

                          PooledOLS Estimation Summary                          
Dep. Variable:        second_entrance   R-squared:                        0.0308
Estimator:                  PooledOLS   R-squared (Between):              0.1133
No. Observations:              229525   R-squared (Within):               0.0210
Date:                Tue, Mar 08 2022   R-squared (Overall):              0.0308
Time:                        09:33:41   Log-likelihood                -7.242e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      3651.6
Entities:                        3582   P-value                           0.0000
Avg Obs:                       64.077   Distribution:                F(2,229522)
Min Obs:                       1.0000                                           
Max Obs:                       612.00   F-statistic (robust):             3651.6
                            