# Libraries

In [27]:
import requests

import numpy as np
import pandas as pd

import datetime as dt

import matplotlib.pyplot as plt
import seaborn as sns

from ast import literal_eval

import time
import datetime
import matplotlib.pyplot as plt
import matplotlib.dates as mdate

from linearmodels import PooledOLS
import statsmodels.api as sm

# Never truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Loading Data

In [28]:
# Read the raw consolidation data; the file uses '~' as its field separator.
df_reg_org = pd.read_csv(
    'raw_consolidation_data.csv',
    sep = '~',
)

In [29]:
# Preview the first rows of the raw data to check that the file was parsed as expected.
df_reg_org.head()

Unnamed: 0,date,unique_id,marketing_category,brand_name,generic_name,labeler_name,is_original_packager,units_reimbursed,total_amount_reimbursed_adj,price_per_unit_adj
0,1991-01-01,acetaminophen and codeine phosphate-tablet-ora...,ANDA,Acetaminophen and Codeine Phosphate,Acetaminophen and Codeine Phosphate,"Teva Pharmaceuticals USA, Inc.",True,386318.3,231166.2,0.598383
1,1991-01-01,acetaminophen and codeine phosphate-tablet-ora...,ANDA,Acetaminophen and Codeine Phosphate,Acetaminophen and Codeine Phosphate,"Teva Pharmaceuticals USA, Inc.",True,6832267.0,4071327.0,0.595897
2,1991-01-01,acetaminophen and codeine phosphate-tablet-ora...,ANDA,Acetaminophen and Codeine Phosphate,Acetaminophen and Codeine Phosphate,"Teva Pharmaceuticals USA, Inc.",True,2599944.0,1554268.0,0.604724
3,1991-01-01,acetaminophen and codeine phosphate-tablet-ora...,ANDA,Acetaminophen and Codeine Phosphate,Acetaminophen and Codeine Phosphate,"Teva Pharmaceuticals USA, Inc.",True,581248.3,360311.7,0.619893
4,1991-01-01,acetazolamide-tablet-oral-250 mg/1,ANDA,AcetaZOLAMIDE,Acetazolamide,"Lannett Company, Inc.",True,840.0,345.0244,0.410743


### Calculating the quarterly total amount reimbursed

In [30]:
# Total units and total (adjusted) amount reimbursed over all rows that share
# the same date and unique_id.
df_temp = (
    df_reg_org
    .groupby(['date', 'unique_id'])[['units_reimbursed', 'total_amount_reimbursed_adj']]
    .sum()
    .rename(columns = {"units_reimbursed": "units_reimbursed_sum", "total_amount_reimbursed_adj": "total_amount_reimbursed_adj_sum"})
    .reset_index()
)

# Attach the per-(date, unique_id) totals to every original row.
# how = 'right' keeps all rows of df_reg_org.
df_reg = df_temp[['date', 'unique_id', 'units_reimbursed_sum', 'total_amount_reimbursed_adj_sum']].merge(
    df_reg_org,
    on = ['date', 'unique_id'],
    how = 'right',
)

Removing labelers that are not original packagers (only rows with `is_original_packager == True` are kept)

In [31]:
# Keep only the rows whose is_original_packager flag is True.
df_reg = df_reg.loc[df_reg['is_original_packager'].eq(True)]

# Data manipulation

### Assigning order of entrance for each unique drug

In [32]:
# Sort so that each drug's rows are contiguous and ordered by date; rows sharing a
# date are ordered by marketing_category (descending). Then keep the first row of
# every (unique_id, labeler_name) pair, i.e. that labeler's first appearance for the drug.
df_temp = (
    df_reg
    .sort_values(by = ['unique_id', 'date', 'marketing_category'], ascending = [False, True, False], ignore_index = True)
    .drop_duplicates(subset = ['unique_id', 'labeler_name'], keep = 'first')
)

# Number the labelers 1, 2, 3, ... within each run of consecutive rows that share
# a unique_id. A new run starts wherever unique_id differs from the previous row.
df_temp = df_temp.assign(
    labeler_name_count = lambda frame: frame.groupby(frame['unique_id'].ne(frame['unique_id'].shift(1)).cumsum()).cumcount() + 1
)

# Bring the entry order back onto every row of the working data (all rows of df_reg are kept).
df_reg = df_temp[['unique_id', 'labeler_name', 'labeler_name_count']].merge(
    df_reg,
    on = ['unique_id', 'labeler_name'],
    how = 'right',
)

### Assigning a dummy for first entrance and NDA

In [33]:
# One row per unique_id whose first-ranked labeler (labeler_name_count == 1)
# is marketed under an NDA, flagged with first_nda = 1.
df_temp = (
    df_reg
    .loc[(df_reg['labeler_name_count'] == 1) & (df_reg['marketing_category'] == 'NDA')]
    .sort_values(by = ['date', 'unique_id'], ascending = [True, False], ignore_index = True)
    .drop_duplicates(subset = ['unique_id'], keep = 'first')
    .assign(first_nda = 1)
)

# Attach the flag to every row of the working data; drugs without a match get 0.
df_reg = df_temp[['unique_id', 'first_nda']].merge(df_reg, on = ['unique_id'], how = 'right')
df_reg['first_nda'] = df_reg['first_nda'].fillna(0)

Removing unique drugs where the first entrant is not an NDA

In [34]:
# Keep only the drugs flagged with first_nda == 1.
df_reg = df_reg.loc[df_reg['first_nda'].eq(1)]

### Assigning a running count for each unique drug

In [35]:
# One row per (date, unique_id), with each drug's rows contiguous and in date order.
df_temp = (
    df_reg
    .sort_values(by = ['unique_id', 'date'], ascending = [False, True], ignore_index = True)
    .drop_duplicates(subset = ['date', 'unique_id'], keep = 'first')
)

# Number the dates 1, 2, 3, ... within each run of consecutive rows that share
# a unique_id. A new run starts wherever unique_id differs from the previous row.
df_temp = df_temp.assign(
    running_count = lambda frame: frame.groupby(frame['unique_id'].ne(frame['unique_id'].shift(1)).cumsum()).cumcount() + 1
)

# Attach the per-drug period counter to every row of the working data.
df_reg = df_temp[['date', 'unique_id', 'running_count']].merge(
    df_reg,
    on = ['date', 'unique_id'],
    how = 'right',
)

### Counting the first entrances of each unique drug (to exclude unique drugs with multiple first entrances)

In [36]:
# Among the rows of each drug's first period (running_count == 1), sum the
# first_nda flag per unique_id, then keep only the drugs whose sum exceeds 1.
df_temp = (
    df_reg
    .loc[df_reg['running_count'] == 1]
    .groupby(['unique_id'])['first_nda']
    .sum()
    .reset_index()
    .rename(columns = {"first_nda": "number_of_first_entrance"})
    .loc[lambda counts: counts['number_of_first_entrance'] > 1]
)

# Attach the count to every row; drugs that were not flagged above default to 1.
df_reg = df_temp[['unique_id', 'number_of_first_entrance']].merge(df_reg, on = ['unique_id'], how = 'right')
df_reg['number_of_first_entrance'] = df_reg['number_of_first_entrance'].fillna(1)

Removing unique drugs that have multiple first entrances

In [37]:
# Keep only the drugs with exactly one first entrance.
df_reg = df_reg.loc[df_reg['number_of_first_entrance'].eq(1)]

# Check for duplicate rows

In [38]:
# Check whether any (unique_id, date, labeler_name) combination occurs more than once:
# print the row count before and after dropping duplicated combinations.
# Equal numbers mean there are no duplicates.
print(len(df_reg))
df_reg_issue = df_reg.drop_duplicates(subset = ['unique_id', 'date', 'labeler_name'])
# The de-duplicated frame must be measured here; printing len(df_reg) a second
# time would always repeat the first number and hide any duplicates.
print(len(df_reg_issue))

229525
229525


# Saving Data

In [17]:
# Write the prepared data to disk ('~'-separated, without the index column).
df_reg.to_csv(
    'output_regression_org.csv',
    sep = '~',
    index = False,
)

# Loading Data

In [373]:
# Reload the prepared regression data from the '~'-separated file.
df_reg = pd.read_csv(
    'output_regression_org.csv',
    sep = '~',
)

# Regression

In [51]:
# Summarize the columns, non-null counts and dtypes of the regression data.
df_reg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 229525 entries, 0 to 239153
Data columns (total 17 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   unique_id                        229525 non-null  object 
 1   number_of_first_entrance         229525 non-null  float64
 2   date                             229525 non-null  object 
 3   running_count                    229525 non-null  int64  
 4   first_nda                        229525 non-null  float64
 5   labeler_name                     229525 non-null  object 
 6   labeler_name_count               229525 non-null  int64  
 7   units_reimbursed_sum             229525 non-null  float64
 8   total_amount_reimbursed_adj_sum  229525 non-null  float64
 9   marketing_category               229525 non-null  object 
 10  brand_name                       229525 non-null  object 
 11  generic_name                     229525 non-null  object 
 12  is

In [52]:
# Work on an independent copy so that the column assignments in the following
# cells do not silently modify df_reg. With a plain `df_temp = df_reg` both names
# refer to the same DataFrame, and cells that transform a column in place
# (e.g. the date conversion) cannot be re-run from this point.
df_temp = df_reg.copy()

Assign a dummy for second entrance

In [53]:
# Dummy that is 1 for rows of the labeler ranked second in entry order
# (labeler_name_count == 2) and 0 otherwise.
# A vectorized comparison replaces the per-row lambda and the element-wise
# pd.to_numeric pass; the result is the same int64 column of 0/1 values.
df_temp['second_entrance'] = df_temp['labeler_name_count'].eq(2).astype('int64')

Calculating the log of the sum of total amount reimbursed

In [54]:
# Natural logarithm of the summed (adjusted) amount reimbursed.
# NOTE(review): np.log yields -inf for 0 and NaN for negative inputs - confirm the column is strictly positive.
df_temp['ln_total_amount_reimbursed_adj_sum'] = df_temp['total_amount_reimbursed_adj_sum'].pipe(np.log)

Change the format of the date to an integer (YYYY-MM-DD becomes YYYYMMDD)

In [55]:
# Turn 'YYYY-MM-DD' strings into YYYYMMDD integers.
# The dash is removed as a literal string (regex = False), and the whole column is
# converted in a single pd.to_numeric call rather than one call per row.
df_temp['date'] = pd.to_numeric(df_temp['date'].str.replace('-', '', regex = False))

In [56]:
# Index the data by (unique_id, date) for the panel regressions below.
df_OLS = df_temp.set_index(keys = ['unique_id', 'date'])

Estimating the regression

<p style="color:red">Possible issue: Should we remove unique drugs that do not have a second entrance?</p>

In [59]:
# Specification 1: second_entrance regressed on a constant and the log of the
# summed (adjusted) amount reimbursed.
exog_vars = ['ln_total_amount_reimbursed_adj_sum']

endog = df_OLS['second_entrance']
exog = sm.add_constant(df_OLS[exog_vars])

# Pooled OLS with the default settings; print the estimation summary.
mod = PooledOLS(endog, exog)
pooled_res = mod.fit()
print(pooled_res)

                          PooledOLS Estimation Summary                          
Dep. Variable:        second_entrance   R-squared:                        0.0005
Estimator:                  PooledOLS   R-squared (Between):             -0.2420
No. Observations:              229525   R-squared (Within):              -0.0015
Date:                Tue, Mar 08 2022   R-squared (Overall):              0.0005
Time:                        09:25:55   Log-likelihood                -7.596e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      119.34
Entities:                        3582   P-value                           0.0000
Avg Obs:                       64.077   Distribution:                F(1,229523)
Min Obs:                       1.0000                                           
Max Obs:                       612.00   F-statistic (robust):             119.34
                            

In [60]:
# Specification 2: as before, with running_count added as a second regressor.
exog_vars = [
    'ln_total_amount_reimbursed_adj_sum',
    'running_count',
]

endog = df_OLS['second_entrance']
exog = sm.add_constant(df_OLS[exog_vars])

# Pooled OLS with the default settings; print the estimation summary.
mod = PooledOLS(endog, exog)
pooled_res = mod.fit()
print(pooled_res)

                          PooledOLS Estimation Summary                          
Dep. Variable:        second_entrance   R-squared:                        0.0308
Estimator:                  PooledOLS   R-squared (Between):              0.1133
No. Observations:              229525   R-squared (Within):               0.0210
Date:                Tue, Mar 08 2022   R-squared (Overall):              0.0308
Time:                        09:33:41   Log-likelihood                -7.242e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      3651.6
Entities:                        3582   P-value                           0.0000
Avg Obs:                       64.077   Distribution:                F(2,229522)
Min Obs:                       1.0000                                           
Max Obs:                       612.00   F-statistic (robust):             3651.6
                            