# Libraries

In [43]:
import requests

import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None # removing some warnings
pd.set_option('display.max_columns', None) # display all columns in DF


import matplotlib.pyplot as plt
import matplotlib.dates as mdate
import seaborn as sns

from ast import literal_eval

import dtale

import time
import datetime as dt

from linearmodels import PooledOLS
from linearmodels import PanelOLS
import statsmodels.api as sm

# Loading Data

In [44]:
df_reg_org = pd.read_csv('raw_consolidation_data.csv', sep = '~')

In [45]:
df_temp = df_reg_org.loc[df_reg_org['is_original_packager'] == True]

In [46]:
df_temp = df_reg_org

In [47]:
len(df_temp)

523938

In [48]:
df_temp_count = df_temp.groupby(['date'])['unique_id'].count()
df_temp_count = df_temp_count.reset_index()
print('Tabel: ', df_temp_count.head())
print('Quarterly Average: ', df_temp_count['unique_id'].mean())
print('Total: ', df_temp['unique_id'].nunique())
print('Total (generic_name): ', df_temp['generic_name'].nunique())
print('Total (labeler): ', df_temp['labeler_name'].nunique())

Tabel:           date  unique_id
0  1991-01-01        567
1  1991-04-01        591
2  1991-07-01        604
3  1991-10-01        606
4  1992-01-01        637
Quarterly Average:  4330.06611570248
Total:  7214
Total (generic_name):  2117
Total (labeler):  839


# Data manipulation

Removing labeller that are original packager

In [49]:
print(len(df_reg_org))
df_reg = df_reg_org.loc[df_reg_org['is_original_packager'] == True]
print(len(df_reg))

523938
499057


### Assigning order of entrance for each unique drug

In [50]:
df_temp = df_reg.sort_values(by = ['unique_id', 'date', 'marketing_category'], ascending = [False, True, False], ignore_index = True) # Sorting

df_temp = df_temp.drop_duplicates(subset = ['unique_id', 'labeler_name'], keep = 'first') # Finding the first entrance for each labeller 

df_temp['labeler_name_count'] = df_temp.groupby((df_temp['unique_id'] != df_temp['unique_id'].shift(1)).cumsum()).cumcount() + 1 # Running count for each drung

df_reg = pd.merge(df_temp[['unique_id', 'labeler_name', 'labeler_name_count']], df_reg, on = ['unique_id', 'labeler_name'], how = 'right') # Merging with org. data

In [51]:
print(len(df_reg))

499057


### Assigning a dummy for first entrance and NDA

In [52]:
df_temp = df_reg.loc[(df_reg['labeler_name_count'] == 1) & (df_reg['marketing_category'] == 'NDA')] # Filtering

df_temp = df_temp.sort_values(by = ['date', 'unique_id'], ascending = [True, False], ignore_index = True) # Sorting
df_temp = df_temp.drop_duplicates(subset = ['unique_id'], keep = 'first') # Removing dublicates

df_temp['first_nda'] = 1 # Adding dummy

df_reg = pd.merge(df_temp[['unique_id', 'first_nda']], df_reg, on = ['unique_id'], how = 'right') # Merging with reg. data

df_reg['first_nda'] = df_reg['first_nda'].fillna(0)

In [53]:
print(len(df_reg))

499057


Removing unique drug where the first entrance is not a NDA

In [54]:
df_reg = df_reg.loc[(df_reg['first_nda'] == 1)]

In [55]:
len(df_reg)

202233

### Assigning a running count for each unique drug from the start

In [56]:
df_temp = df_reg.drop_duplicates(subset = ['date', 'unique_id']) # Removing dublicates
df_temp = df_temp[['date', 'unique_id']]
df_temp = df_temp.sort_values(by = ['unique_id', 'date'])

df_temp['running_count_from_start'] = df_temp.groupby((df_temp['unique_id'] != df_temp['unique_id'].shift(1)).cumsum()).cumcount() + 1 # Running count for each drung

df_reg = pd.merge(df_temp[['date', 'unique_id', 'running_count_from_start']], df_reg, on = ['date', 'unique_id',], how = 'right')

In [57]:
len(df_reg)

202233

### Assigning a running count for each unique drug from second entrance

In [58]:
df_temp = df_reg[df_reg['labeler_name_count'] >= 2]

df_temp = df_temp.drop_duplicates(subset = ['date', 'unique_id']) # Removing dublicates
df_temp = df_temp[['date', 'unique_id']]
df_temp = df_temp.sort_values(by = ['unique_id', 'date'])

df_temp['running_count_from_second_entrance'] = df_temp.groupby((df_temp['unique_id'] != df_temp['unique_id'].shift(1)).cumsum()).cumcount() + 1 # Running count for each drung

df_reg = pd.merge(df_temp[['date', 'unique_id', 'running_count_from_second_entrance']], df_reg, on = ['date', 'unique_id',], how = 'right')

df_reg['running_count_from_second_entrance'] = df_reg['running_count_from_second_entrance'].fillna(0)

In [59]:
len(df_reg)

202233

### Assigning a running count for each unique drug from third entrance

In [60]:
df_temp = df_reg[df_reg['labeler_name_count'] >= 3]

df_temp = df_temp.drop_duplicates(subset = ['date', 'unique_id']) # Removing dublicates
df_temp = df_temp[['date', 'unique_id']]
df_temp = df_temp.sort_values(by = ['unique_id', 'date'])

df_temp['running_count_from_third_entrance'] = df_temp.groupby((df_temp['unique_id'] != df_temp['unique_id'].shift(1)).cumsum()).cumcount() + 1 # Running count for each drung

df_reg = pd.merge(df_temp[['date', 'unique_id', 'running_count_from_third_entrance']], df_reg, on = ['date', 'unique_id',], how = 'right')

df_reg['running_count_from_third_entrance'] = df_reg['running_count_from_third_entrance'].fillna(0)

In [61]:
len(df_reg)

202233

### Assigning a running count for each unique drug from fourth entrance

In [62]:
df_temp = df_reg[df_reg['labeler_name_count'] >= 4]

df_temp = df_temp.drop_duplicates(subset = ['date', 'unique_id']) # Removing dublicates
df_temp = df_temp[['date', 'unique_id']]
df_temp = df_temp.sort_values(by = ['unique_id', 'date'])

df_temp['running_count_from_fourth_entrance'] = df_temp.groupby((df_temp['unique_id'] != df_temp['unique_id'].shift(1)).cumsum()).cumcount() + 1 # Running count for each drung

df_reg = pd.merge(df_temp[['date', 'unique_id', 'running_count_from_fourth_entrance']], df_reg, on = ['date', 'unique_id',], how = 'right')

df_reg['running_count_from_fourth_entrance'] = df_reg['running_count_from_fourth_entrance'].fillna(0)

In [63]:
len(df_reg)

202233

### Assigning a total count of the labelers at the start (to exclude unique drug with multiple labelers from the start)

In [64]:
df_temp = df_reg.loc[(df_reg['running_count_from_start'] == 1)]
          
df_temp = df_temp.groupby(['unique_id'])['first_nda'].sum()
df_temp = df_temp.reset_index()
df_temp = df_temp.rename(columns = {"first_nda": "number_of_first_entrance"})

df_temp = df_temp.loc[(df_temp['number_of_first_entrance'] > 1)]

df_reg = pd.merge(df_temp[['unique_id', 'number_of_first_entrance']], df_reg, on = ['unique_id'], how = 'right') # Merging with reg. data

df_reg['number_of_first_entrance'] = df_reg['number_of_first_entrance'].fillna(1)

Removing unique drug which has multiple entrances at start

In [65]:
df_reg = df_reg.loc[(df_reg['number_of_first_entrance'] == 1)]

In [66]:
len(df_reg)

193557

### Assigning a running count of generic labelers

In [67]:
df_temp = df_reg[['date', 'unique_id', 'labeler_name', 'labeler_name_count']]

df_temp = df_temp.groupby(['date', 'unique_id'])

df_temp = df_temp.max()

df_temp = df_temp.reset_index()

df_temp['running_count_generics'] = df_temp['labeler_name_count'] - 1 # Minus by one because org. labeler do not count

df_reg = pd.merge(df_temp[['date', 'unique_id', 'running_count_generics']], df_reg, on = ['date', 'unique_id'], how = 'right') # Merging with org. data

### Assigning a running count of substitutes labelers

Unpacking pharm class to obtain EPC and MoA

In [68]:
df_temp = pd.DataFrame(df_reg[['unique_id', 'pharm_class']])

df_temp = df_temp.drop_duplicates(subset = ['unique_id', 'pharm_class'])

df_temp = df_temp.dropna(subset = ['pharm_class'])
df_temp['pharm_class'] = df_temp['pharm_class'].apply(literal_eval)
df_temp = df_temp.explode('pharm_class')

df_temp['pharm_class_type'] = df_temp['pharm_class'].apply(lambda x: str(x)[-5:])
df_temp['pharm_class_type'] = df_temp['pharm_class_type'].str.replace(r'[][]', '', regex=True)

df_temp_EPC = df_temp[df_temp['pharm_class_type'] == 'EPC']
df_temp_MoA = df_temp[df_temp['pharm_class_type'] == 'MoA']

df_temp_EPC = df_temp_EPC.rename(columns = {"pharm_class": "pharm_class_EPC"})
df_temp_MoA = df_temp_MoA.rename(columns = {"pharm_class": "pharm_class_MoA"})

df_temp_EPC = df_temp_EPC.drop_duplicates(subset = ['unique_id', 'pharm_class_EPC'])
df_temp_MoA = df_temp_MoA.drop_duplicates(subset = ['unique_id', 'pharm_class_MoA'])

df_pharm_class_type = pd.merge(df_temp_EPC, df_temp_MoA,  on = 'unique_id', how = 'left')

df_pharm_class_type = df_pharm_class_type.dropna(subset = ['pharm_class_EPC'])
df_pharm_class_type = df_pharm_class_type.dropna(subset = ['pharm_class_MoA'])

Adding a route

In [69]:
df_openFDA_NDC = pd.read_csv('raw_openFDA_NDC_data.csv', sep = '~')

In [70]:
df_temp = df_openFDA_NDC.drop_duplicates(subset = ['unique_id'])

df_temp = df_temp.dropna(subset = ['route'])

df_pharm_class_type = pd.merge(df_pharm_class_type, df_temp[['unique_id', 'route']],  on = 'unique_id', how = 'left')

Creating a unique id for substitutes

In [71]:
df_pharm_class_type['unique_substitute'] = df_pharm_class_type['pharm_class_EPC'].astype(str) + '-' + df_pharm_class_type['route'].astype(str) + '-' + df_pharm_class_type['pharm_class_MoA'].astype(str)
df_pharm_class_type = df_pharm_class_type[['unique_id', 'unique_substitute']]

df_pharm_class_type = df_pharm_class_type.drop_duplicates(subset = ['unique_id'])

Merging with org. data

In [72]:
df_reg = pd.merge(df_reg, df_pharm_class_type[['unique_id', 'unique_substitute']],  on = 'unique_id', how = 'left')

Calculating number of substitutes

In [73]:
df_temp = df_reg.groupby(['date', 'unique_substitute'])['unique_id'].agg('nunique')
df_temp = df_temp.reset_index()

df_temp['running_count_unique_substitute'] = df_temp['unique_id'] - 1

df_reg = pd.merge(df_reg, df_temp[['date', 'unique_substitute', 'running_count_unique_substitute']],  on = ['date', 'unique_substitute'], how = 'left')

### Assigning a running count where the first entrance is the starting point

Finding the running count from the start for unique drugs

In [74]:
df_temp = df_reg[['date', 'unique_id', 'running_count_from_start', 'running_count_from_second_entrance']]

df_temp['col_temp'] = df_temp['running_count_from_second_entrance'].map(lambda x: True if (x == 1.0)  else False)

df_temp = df_temp[df_temp['col_temp'] == True]
df_temp = df_temp.drop(columns=['col_temp'])
df_temp = df_temp.drop_duplicates(subset=['unique_id'])
df_temp = df_temp.rename(columns = {"running_count_from_start": "col_temp"})
df_temp = df_temp[['unique_id', 'col_temp']]

Merging with org. data

In [75]:
df_reg = pd.merge(df_temp[['unique_id', 'col_temp']], df_reg, on = ['unique_id',], how = 'right')

Calculationg the running count

In [76]:
df_reg['col_temp'] = df_reg['col_temp'].fillna(0)
df_reg['col_temp'] = df_reg['running_count_from_start'] - df_reg['col_temp']

df_reg['running_count_event'] = df_reg['col_temp'].where(df_reg['col_temp'] != df_reg['running_count_from_start'])

df_reg = df_reg.drop(columns=['col_temp'])

In [77]:
len(df_reg)

193557

### Assigning a dummy for the number of quarter before and after first entrance (running_count_event)

In [78]:
df_temp = df_reg.groupby(['unique_id']).agg({'running_count_event': [np.min,np.max]})
df_temp = df_temp.reset_index()
df_temp = df_temp.droplevel(0, axis=1) 

df_temp = df_temp.rename(columns = {"": "unique_id", "amin": "min_quarter_before_second_entrance", "amax": "max_quarter_before_second_entrance"})
df_temp = df_temp.drop_duplicates(subset=['unique_id'])

Merging with org. data

In [79]:
df_reg = pd.merge(df_temp[['unique_id', 'min_quarter_before_second_entrance', 'max_quarter_before_second_entrance']], df_reg, on = ['unique_id',], how = 'right')

### Assigning a dummy for 2006 (change in Medicare which affect Medicaid)

In [80]:
df_reg['dummy_2006'] = df_reg['year'].apply(lambda x: 1 if x == 2006 else 0)

### Calculating the quarterly total amount reimbursed and units reimbursed for all labelers

In [81]:
df_temp = df_reg.set_index(['date', 'unique_id']).groupby(level = ['date', 'unique_id'])[['units_reimbursed', 'total_amount_reimbursed_adj']].agg('sum')
df_temp = df_temp.reset_index()

df_temp = df_temp.rename(columns = {"units_reimbursed": "units_reimbursed_sum", "total_amount_reimbursed_adj": "total_amount_reimbursed_adj_sum"})

df_reg = pd.merge(df_temp[['date', 'unique_id', 'units_reimbursed_sum', 'total_amount_reimbursed_adj_sum']], df_reg, on = ['date', 'unique_id',], how = 'right')

### Calculating the generic labelers' share of quarterly total amount reimbursed and units reimbursed

In [82]:
df_temp = df_reg[df_reg['labeler_name_count'] >= 2]

df_temp = df_temp.set_index(['date', 'unique_id']).groupby(level = ['date', 'unique_id'])[['units_reimbursed', 'total_amount_reimbursed_adj']].agg('sum')
df_temp = df_temp.reset_index()

df_temp = df_temp.rename(columns = {"units_reimbursed": "generic_units_reimbursed_sum", "total_amount_reimbursed_adj": "generic_total_amount_reimbursed_adj_sum"})

df_reg = pd.merge(df_temp[['date', 'unique_id', 'generic_units_reimbursed_sum', 'generic_total_amount_reimbursed_adj_sum']], df_reg, on = ['date', 'unique_id',], how = 'right')

df_reg['generic_units_reimbursed_sum'] = df_reg['generic_units_reimbursed_sum'].fillna(0)
df_reg['generic_total_amount_reimbursed_adj_sum'] = df_reg['generic_total_amount_reimbursed_adj_sum'].fillna(0)

In [83]:
df_reg['generic_share_units_reimbursed_sum'] = df_reg['generic_units_reimbursed_sum'] / df_reg['units_reimbursed_sum']
df_reg['generic_share_total_amount_reimbursed_adj_sum'] = df_reg['generic_total_amount_reimbursed_adj_sum'] / df_reg['total_amount_reimbursed_adj_sum']

df_reg['generic_share_units_reimbursed_sum'] = df_reg['generic_share_units_reimbursed_sum'].fillna(0)
df_reg['generic_share_total_amount_reimbursed_adj_sum'] = df_reg['generic_share_total_amount_reimbursed_adj_sum'].fillna(0)

# Check for issue

...

# Downloading Data

In [84]:
df_reg.to_csv('output_regression_org.csv', sep = '~', index = False)

# Loading Data

In [148]:
df_reg = pd.read_csv('output_regression_org.csv', sep = '~')

# Descriptive of Data

Counting number of unique drugs

In [149]:
df_temp = df_reg

In [282]:
len(df_reg)

193557

In [151]:
df_temp_count = df_temp.groupby(['date'])['unique_id'].count()
df_temp_count = df_temp_count.reset_index()
print('Tabel: ', df_temp_count.head())
print('Quarterly Average: ', df_temp_count['unique_id'].mean())
print('Total: ', df_temp['unique_id'].nunique())
print('Total (generic_name): ', df_temp['generic_name'].nunique())
print('Total (labeler): ', df_temp['labeler_name'].nunique())

Tabel:           date  unique_id
0  1991-01-01        148
1  1991-04-01        159
2  1991-07-01        165
3  1991-10-01        165
4  1992-01-01        181
Quarterly Average:  1599.6446280991736
Total:  2886
Total (generic_name):  1155
Total (labeler):  595


# Regression for generic paradox

## For org. labeler

In [364]:
df_temp = df_reg
len(df_temp)

193557

Filtering on orig. labeler

In [365]:
df_temp = df_temp[df_temp['labeler_name_count'] == 1]
len(df_temp)

118723

In [366]:
# Removing data NaN in running_count_event
df_temp = df_temp.dropna(subset = ['running_count_event'])

# Select the min. number of quarter before and after the entrance entrance
df_temp = df_temp[df_temp['min_quarter_before_second_entrance'] <= -10] # including drugs with 10 or more quarters 
df_temp = df_temp[df_temp['max_quarter_before_second_entrance'] >= 10]

len(df_temp)

48035

Calculating the log of price per unit

In [367]:
df_temp['ln_price_per_unit_adj'] = np.log(df_temp['price_per_unit_adj'])

Change the format of the date to a integer

In [368]:
df_temp['date_int'] = df_temp['date'].str.replace('-', '')
df_temp['date_int'] = df_temp['date_int'].apply(pd.to_numeric)

Setting the index

In [369]:
df_OLS_ndc_gp = df_temp.set_index(['unique_id', 'date_int'])

Estimating the regression

In [374]:
endog = df_OLS_ndc_gp['ln_price_per_unit_adj']
exog_vars = ['running_count_event']
exog = sm.add_constant(df_OLS_ndc_gp[exog_vars])

mod = PanelOLS(endog, exog)
pooled_res = mod.fit()
print(pooled_res)

                            PanelOLS Estimation Summary                            
Dep. Variable:     ln_price_per_unit_adj   R-squared:                     9.495e-06
Estimator:                      PanelOLS   R-squared (Between):             -0.0067
No. Observations:                  48035   R-squared (Within):              -0.0021
Date:                   Thu, Mar 24 2022   R-squared (Overall):           9.495e-06
Time:                           23:19:30   Log-likelihood                -8.902e+04
Cov. Estimator:               Unadjusted                                           
                                           F-statistic:                      0.4561
Entities:                            696   P-value                           0.4995
Avg Obs:                          69.016   Distribution:                 F(1,48033)
Min Obs:                          11.000                                           
Max Obs:                          121.00   F-statistic (robust):            

# Regression for probability of entrance

### Second entrance

In [123]:
df_temp = df_reg

Assign a dummy for second entrance

In [124]:
df_temp['second_entrance'] = df_temp['labeler_name_count'].apply(lambda x: 1 if x == 2 else 0)
df_temp['second_entrance'] = df_temp['second_entrance'].apply(pd.to_numeric)

Calculating the log of the sum of total amount reimbursed

In [125]:
df_temp['ln_total_amount_reimbursed_adj_sum'] = np.log(df_temp['total_amount_reimbursed_adj_sum'])

Change the format of the date to a integer

In [126]:
df_temp['date'] = df_temp['date'].str.replace('-', '')
df_temp['date'] = df_temp['date'].apply(pd.to_numeric)

Setting index

In [127]:
df_OLS = df_temp.set_index(['unique_id', 'date'])

Estimating the regression

In [128]:
exog_vars = ['ln_total_amount_reimbursed_adj_sum']
exog = sm.add_constant(df_OLS[exog_vars])
endog = df_OLS['second_entrance']

mod = PooledOLS(endog, exog)
pooled_res = mod.fit()
print(pooled_res)

                          PooledOLS Estimation Summary                          
Dep. Variable:        second_entrance   R-squared:                      4.81e-07
Estimator:                  PooledOLS   R-squared (Between):             -0.2692
No. Observations:              193557   R-squared (Within):            4.417e-05
Date:                Tue, Mar 22 2022   R-squared (Overall):            4.81e-07
Time:                        21:50:43   Log-likelihood                 -6.99e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      0.0931
Entities:                        2886   P-value                           0.7603
Avg Obs:                       67.068   Distribution:                F(1,193555)
Min Obs:                       1.0000                                           
Max Obs:                       612.00   F-statistic (robust):             0.0931
                            

### Third entrance

In [54]:
df_temp = df_reg

Assign a dummy for second entrance

In [55]:
df_temp['third_entrance'] = df_temp['labeler_name_count'].apply(lambda x: 1 if x == 3 else 0)
df_temp['third_entrance'] = df_temp['third_entrance'].apply(pd.to_numeric)

Calculating the log of the sum of total amount reimbursed

In [56]:
df_temp['ln_total_amount_reimbursed_adj_sum'] = np.log(df_temp['total_amount_reimbursed_adj_sum'])

Setting index

In [57]:
df_OLS = df_temp.set_index(['unique_id', 'date'])

Estimating the regression

In [58]:
exog_vars = ['ln_total_amount_reimbursed_adj_sum']
exog = sm.add_constant(df_OLS[exog_vars])
endog = df_OLS['second_entrance']

mod = PooledOLS(endog, exog)
pooled_res = mod.fit()
print(pooled_res)

                          PooledOLS Estimation Summary                          
Dep. Variable:        second_entrance   R-squared:                        0.0011
Estimator:                  PooledOLS   R-squared (Between):             -0.2013
No. Observations:              225135   R-squared (Within):              -0.0011
Date:                Fri, Mar 18 2022   R-squared (Overall):              0.0011
Time:                        12:24:48   Log-likelihood                -7.836e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      242.12
Entities:                        3012   P-value                           0.0000
Avg Obs:                       74.746   Distribution:                F(1,225133)
Min Obs:                       1.0000                                           
Max Obs:                       1255.0   F-statistic (robust):             242.12
                            