# Libraries

In [64]:
import requests

import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None # removing some warnings
pd.set_option('display.max_columns', None) # display all columns in DF


import matplotlib.pyplot as plt
import matplotlib.dates as mdate
import seaborn as sns

from ast import literal_eval

import time
import datetime as dt

from linearmodels import PooledOLS
import statsmodels.api as sm

# Loading Data

In [65]:
df_reg_org = pd.read_csv('raw_consolidation_data.csv', sep = '~')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Data manipulation

Removing labeller that are original packager

In [66]:
print(len(df_reg_org))
df_reg = df_reg_org.loc[df_reg_org['is_original_packager'] == True]
print(len(df_reg))

523938
499057


### Assigning order of entrance for each unique drug

In [67]:
df_temp = df_reg.sort_values(by = ['unique_id', 'date', 'marketing_category'], ascending = [False, True, False], ignore_index = True) # Sorting

df_temp = df_temp.drop_duplicates(subset = ['unique_id', 'labeler_name'], keep = 'first') # Finding the first entrance for each labeller 

df_temp['labeler_name_count'] = df_temp.groupby((df_temp['unique_id'] != df_temp['unique_id'].shift(1)).cumsum()).cumcount() + 1 # Running count for each drung

df_reg = pd.merge(df_temp[['unique_id', 'labeler_name', 'labeler_name_count']], df_reg, on = ['unique_id', 'labeler_name'], how = 'right') # Merging with org. data

### Assigning a dummy for first entrance and NDA

In [68]:
df_temp = df_reg.loc[(df_reg['labeler_name_count'] == 1) & (df_reg['marketing_category'] == 'NDA')] # Filtering

df_temp = df_temp.sort_values(by = ['date', 'unique_id'], ascending = [True, False], ignore_index = True) # Sorting
df_temp = df_temp.drop_duplicates(subset = ['unique_id'], keep = 'first') # Removing dublicates

df_temp['first_nda'] = 1 # Adding dummy

df_reg = pd.merge(df_temp[['unique_id', 'first_nda']], df_reg, on = ['unique_id'], how = 'right') # Merging with reg. data

df_reg['first_nda'] = df_reg['first_nda'].fillna(0)

Removing unique drug where the first entrance is not a NDA

In [69]:
df_reg = df_reg.loc[(df_reg['first_nda'] == 1)]

### Assigning a running count for each unique drug from the start

In [70]:
df_temp = df_reg[df_reg['labeler_name_count'] == 1]

df_temp = df_temp.drop_duplicates(subset = ['date', 'unique_id'], keep = 'first') # Removing dublicates

df_temp['running_count_from_start'] = df_temp.groupby((df_temp['unique_id'] != df_temp['unique_id'].shift(1)).cumsum()).cumcount() + 1 # Running count for each drung

df_reg = pd.merge(df_temp[['date', 'unique_id', 'running_count_from_start']], df_reg, on = ['date', 'unique_id',], how = 'right')

### Assigning a running count for each unique drug from second entrance

In [71]:
df_temp = df_reg[df_reg['labeler_name_count'] >= 2]

df_temp = df_temp.drop_duplicates(subset = ['date', 'unique_id'], keep = 'first') # Removing dublicates

df_temp['running_count_from_second_entrance'] = df_temp.groupby((df_temp['unique_id'] != df_temp['unique_id'].shift(1)).cumsum()).cumcount() + 1 # Running count for each drung

df_reg = pd.merge(df_temp[['date', 'unique_id', 'running_count_from_second_entrance']], df_reg, on = ['date', 'unique_id',], how = 'right')

df_reg['running_count_from_second_entrance'] = df_reg['running_count_from_second_entrance'].fillna(0)

### Assigning a running count for each unique drug from third entrance

In [72]:
df_temp = df_reg[df_reg['labeler_name_count'] >= 3]

df_temp = df_temp.drop_duplicates(subset = ['date', 'unique_id'], keep = 'first') # Removing dublicates

df_temp['running_count_from_third_entrance'] = df_temp.groupby((df_temp['unique_id'] != df_temp['unique_id'].shift(1)).cumsum()).cumcount() + 1 # Running count for each drung

df_reg = pd.merge(df_temp[['date', 'unique_id', 'running_count_from_third_entrance']], df_reg, on = ['date', 'unique_id',], how = 'right')

df_reg['running_count_from_third_entrance'] = df_reg['running_count_from_third_entrance'].fillna(0)

### Assigning a running count for each unique drug from fourth entrance

In [73]:
df_temp = df_reg[df_reg['labeler_name_count'] >= 4]

df_temp = df_temp.drop_duplicates(subset = ['date', 'unique_id'], keep = 'first') # Removing dublicates

df_temp['running_count_from_fourth_entrance'] = df_temp.groupby((df_temp['unique_id'] != df_temp['unique_id'].shift(1)).cumsum()).cumcount() + 1 # Running count for each drung

df_reg = pd.merge(df_temp[['date', 'unique_id', 'running_count_from_fourth_entrance']], df_reg, on = ['date', 'unique_id',], how = 'right')

df_reg['running_count_from_fourth_entrance'] = df_reg['running_count_from_fourth_entrance'].fillna(0)

### Assigning a total count of the labelers at the start (to exclude unique drug with multiple labelers from the start)

In [74]:
df_temp = df_reg.loc[(df_reg['running_count_from_start'] == 1)]
          
df_temp = df_temp.groupby(['unique_id'])['first_nda'].sum()
df_temp = df_temp.reset_index()
df_temp = df_temp.rename(columns = {"first_nda": "number_of_first_entrance"})

df_temp = df_temp.loc[(df_temp['number_of_first_entrance'] > 1)]

df_reg = pd.merge(df_temp[['unique_id', 'number_of_first_entrance']], df_reg, on = ['unique_id'], how = 'right') # Merging with reg. data

df_reg['number_of_first_entrance'] = df_reg['number_of_first_entrance'].fillna(1)

Removing unique drug which has multiple entrances at start

In [75]:
df_reg = df_reg.loc[(df_reg['number_of_first_entrance'] == 1)]

### Assigning a running count of generic labelers

In [76]:
df_temp = df_reg[['date', 'unique_id', 'labeler_name', 'labeler_name_count']]

df_temp = df_temp.groupby(['date', 'unique_id'])

df_temp = df_temp.max()

df_temp = df_temp.reset_index()

df_temp['running_count_generics'] = df_temp['labeler_name_count'] - 1 # Minus by one because org. labeler do not count

df_reg = pd.merge(df_temp[['date', 'unique_id', 'running_count_generics']], df_reg, on = ['date', 'unique_id'], how = 'right') # Merging with org. data

### Assigning a running count of substitutes labelers

Unpacking pharm class to obtain EPC and MoA

In [77]:
df_temp = pd.DataFrame(df_reg[['unique_id', 'pharm_class']])

df_temp = df_temp.drop_duplicates(subset = ['unique_id', 'pharm_class'])

#df_temp['pharm_class'] = df_temp['pharm_class'].apply(literal_eval)
df_temp = df_temp.explode('pharm_class')

df_temp['pharm_class_type'] = df_temp['pharm_class'].apply(lambda x: str(x)[-5:])
df_temp['pharm_class_type'] = df_temp['pharm_class_type'].str.replace(r'[][]', '', regex=True)

df_temp_EPC = df_temp[df_temp['pharm_class_type'] == 'EPC']
df_temp_MoA = df_temp[df_temp['pharm_class_type'] == 'MoA']

df_temp_EPC = df_temp_EPC.rename(columns = {"pharm_class": "pharm_class_EPC"})
df_temp_MoA = df_temp_MoA.rename(columns = {"pharm_class": "pharm_class_MoA"})

df_temp_EPC = df_temp_EPC.drop_duplicates(subset = ['unique_id', 'pharm_class_EPC'])
df_temp_MoA = df_temp_MoA.drop_duplicates(subset = ['unique_id', 'pharm_class_MoA'])

df_pharm_class_type = pd.merge(df_temp_EPC, df_temp_MoA,  on = 'unique_id', how = 'left')

Adding a route

In [78]:
df_openFDA_NDC = pd.read_csv('raw_openFDA_NDC_data.csv', sep = '~')

In [79]:
df_temp = df_openFDA_NDC.drop_duplicates(subset = ['unique_id'])
df_pharm_class_type = pd.merge(df_pharm_class_type, df_temp[['unique_id', 'route']],  on = 'unique_id', how = 'left')

Creating a unique id for substitutes

In [80]:
df_pharm_class_type['unique_substitute'] = df_pharm_class_type['pharm_class_EPC'].astype(str) + '-' + df_pharm_class_type['route'].astype(str) + '-' + df_pharm_class_type['pharm_class_MoA'].astype(str)
df_pharm_class_type = df_pharm_class_type[['unique_id', 'unique_substitute']]

df_pharm_class_type = df_pharm_class_type.drop_duplicates(subset = ['unique_id'])

Merging with org. data

In [81]:
df_reg = pd.merge(df_reg, df_pharm_class_type[['unique_id', 'unique_substitute']],  on = 'unique_id', how = 'outer')

Calculating number of substitutes

In [82]:
df_temp = df_reg.groupby(['date', 'unique_substitute'])['unique_id'].count()
df_temp = df_temp.reset_index()

df_temp['running_count_unique_substitute'] = df_temp['unique_id'] - 1

df_reg = pd.merge(df_reg, df_temp[['date', 'unique_substitute', 'running_count_unique_substitute']],  on = ['date', 'unique_substitute'], how = 'left')

### Assigning a running count where the first entrance is the starting point

Finding the running count from the start for unique drugs

In [83]:
df_temp = df_reg[['date', 'unique_id', 'running_count_from_start', 'running_count_from_second_entrance']]

df_temp['col_temp'] = df_temp['running_count_from_second_entrance'].map(lambda x: True if (x == 1.0)  else False)

df_temp = df_temp[df_temp['col_temp'] == True]
df_temp = df_temp.drop(columns=['col_temp'])
df_temp = df_temp.drop_duplicates(subset=['unique_id'])
df_temp = df_temp.rename(columns = {"running_count_from_start": "col_temp"})
df_temp = df_temp[['unique_id', 'col_temp']]

Merging with org. data

In [84]:
df_reg = pd.merge(df_temp[['unique_id', 'col_temp']], df_reg, on = ['unique_id',], how = 'right')

Calculationg the running count

In [85]:
df_reg['col_temp'] = df_reg['col_temp'].fillna(0)
df_reg['col_temp'] = df_reg['running_count_from_start'] - df_reg['col_temp']

df_reg['running_count_event'] = df_reg['col_temp'].where(df_reg['col_temp'] != df_reg['running_count_from_start'])

df_temp = df_temp.drop(columns=['col_temp'])

### Calculating the quarterly total amount reimbursed and units reimbursed for all labelers

In [86]:
df_temp = df_reg.set_index(['date', 'unique_id']).groupby(level = ['date', 'unique_id'])[['units_reimbursed', 'total_amount_reimbursed_adj']].agg('sum')
df_temp = df_temp.reset_index()

df_temp = df_temp.rename(columns = {"units_reimbursed": "units_reimbursed_sum", "total_amount_reimbursed_adj": "total_amount_reimbursed_adj_sum"})

df_reg = pd.merge(df_temp[['date', 'unique_id', 'units_reimbursed_sum', 'total_amount_reimbursed_adj_sum']], df_reg, on = ['date', 'unique_id',], how = 'right')

### Calculating the generic labelers' share of quarterly total amount reimbursed and units reimbursed

In [87]:
df_temp = df_reg[df_reg['labeler_name_count'] >= 2]

df_temp = df_temp.set_index(['date', 'unique_id']).groupby(level = ['date', 'unique_id'])[['units_reimbursed', 'total_amount_reimbursed_adj']].agg('sum')
df_temp = df_temp.reset_index()

df_temp = df_temp.rename(columns = {"units_reimbursed": "generic_units_reimbursed_sum", "total_amount_reimbursed_adj": "generic_total_amount_reimbursed_adj_sum"})

df_reg = pd.merge(df_temp[['date', 'unique_id', 'generic_units_reimbursed_sum', 'generic_total_amount_reimbursed_adj_sum']], df_reg, on = ['date', 'unique_id',], how = 'right')

df_reg['generic_units_reimbursed_sum'] = df_reg['generic_units_reimbursed_sum'].fillna(0)
df_reg['generic_total_amount_reimbursed_adj_sum'] = df_reg['generic_total_amount_reimbursed_adj_sum'].fillna(0)

In [88]:
df_reg['generic_share_units_reimbursed_sum'] = df_reg['generic_units_reimbursed_sum'] / df_reg['units_reimbursed_sum']
df_reg['generic_share_total_amount_reimbursed_adj_sum'] = df_reg['generic_total_amount_reimbursed_adj_sum'] / df_reg['total_amount_reimbursed_adj_sum']

df_reg['generic_share_units_reimbursed_sum'] = df_reg['generic_share_units_reimbursed_sum'].fillna(0)
df_reg['generic_share_total_amount_reimbursed_adj_sum'] = df_reg['generic_share_total_amount_reimbursed_adj_sum'].fillna(0)

# Check for issue

In [89]:
print(len(df_reg))
df_reg_issue = df_reg.drop_duplicates(subset = ['unique_id', 'date', 'labeler_name'])
print(len(df_reg))

193557
193557


# Downloading Data

In [90]:
df_reg.to_csv('output_regression_org.csv', sep = '~', index = False)

# Loading Data

In [91]:
df_reg = pd.read_csv('output_regression_org.csv', sep = '~')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [92]:
df_reg.head()

Unnamed: 0,date,unique_id,generic_units_reimbursed_sum,generic_total_amount_reimbursed_adj_sum,units_reimbursed_sum,total_amount_reimbursed_adj_sum,col_temp,running_count_generics,number_of_first_entrance,running_count_from_fourth_entrance,running_count_from_third_entrance,running_count_from_second_entrance,running_count_from_start,first_nda,labeler_name,labeler_name_count,dataset,labeler_code,product_code,year,quarter,product_ndc,generic_name,brand_name,finished,packaging,listing_expiration_date,openfda,marketing_category,dosage_form,spl_id,product_type,marketing_start_date,product_id,application_number,brand_name_base,marketing_end_date,active_ingredients,route,pharm_class,brand_name_suffix,dea_schedule,is_original_packager,name,strength,NDA #,Alimentary Tract And Metabolism,Antiinfectives For Systemic Use,Antineoplastic And Immunomodulating Agents,"Antiparasitic Products, Insecticides And Repellents",Blood And Blood Forming Organs,Cardiovascular System,Dermatologicals,Genito Urinary System And Sex Hormones,Musculo-Skeletal System,Nervous System,Respiratory System,Sensory Organs,"Systemic Hormonal Preparations, Excl. Sex Hormones And Insulins",Various,units_reimbursed,total_amount_reimbursed_adj,price_per_unit_adj,unique_substitute,running_count_unique_substitute,running_count_event,generic_share_units_reimbursed_sum,generic_share_total_amount_reimbursed_adj_sum
0,2020-10-01,"(calcium, magnesium, potassium, and sodium oxy...",0.0,0.0,38610.0,1143621.0,1.0,0,1.0,0.0,0.0,0.0,1.0,1.0,"Jazz Pharmaceuticals, Inc.",1,State Drug Utilization Data 2020,68727,150,2020,4,68727-150,"(CALCIUM, MAGNESIUM, POTASSIUM, AND SODIUM OXY...",XYWAV,True,"[{'package_ndc': '68727-150-01', 'description'...",20221231.0,"{'manufacturer_name': ['Jazz Pharmaceuticals, ...",NDA,SOLUTION,78a7a9e9-9f25-4f8a-8f1e-a6c6420801c3,HUMAN PRESCRIPTION DRUG,20201102,68727-150_78a7a9e9-9f25-4f8a-8f1e-a6c6420801c3,NDA212690,XYWAV,,"[{'name': 'CALCIUM OXYBATE', 'strength': '.5 g...",ORAL,"['Central Nervous System Depressant [EPC]', 'C...",,CIII,True,"['CALCIUM OXYBATE', 'MAGNESIUM OXYBATE', 'POTA...","['.5 g/mL', '.5 g/mL', '.5 g/mL', '.5 g/mL']",212690,,,,,,,,,,,,,,,38610.0,1143621.0,29.619826,,,,0.0,0.0
1,2021-01-01,"(calcium, magnesium, potassium, and sodium oxy...",0.0,0.0,303030.0,8858441.0,2.0,0,1.0,0.0,0.0,0.0,2.0,1.0,"Jazz Pharmaceuticals, Inc.",1,State Drug Utilization Data 2021,68727,150,2021,1,68727-150,"(CALCIUM, MAGNESIUM, POTASSIUM, AND SODIUM OXY...",XYWAV,True,"[{'package_ndc': '68727-150-01', 'description'...",20221231.0,"{'manufacturer_name': ['Jazz Pharmaceuticals, ...",NDA,SOLUTION,78a7a9e9-9f25-4f8a-8f1e-a6c6420801c3,HUMAN PRESCRIPTION DRUG,20201102,68727-150_78a7a9e9-9f25-4f8a-8f1e-a6c6420801c3,NDA212690,XYWAV,,"[{'name': 'CALCIUM OXYBATE', 'strength': '.5 g...",ORAL,"['Central Nervous System Depressant [EPC]', 'C...",,CIII,True,"['CALCIUM OXYBATE', 'MAGNESIUM OXYBATE', 'POTA...","['.5 g/mL', '.5 g/mL', '.5 g/mL', '.5 g/mL']",212690,,,,,,,,,,,,,,,303030.0,8858441.0,29.232883,,,,0.0,0.0
2,2019-04-01,(daunorubicin and cytarabine) liposome-injecti...,0.0,0.0,43.234,328599.7,1.0,0,1.0,0.0,0.0,0.0,1.0,1.0,"Jazz Pharmaceuticals, Inc.",1,State Drug Utilization Data 2019,68727,745,2019,2,68727-745,(DAUNORUBICIN AND CYTARABINE) LIPOSOME,VYXEOS,True,"[{'package_ndc': '68727-745-01', 'description'...",20221231.0,"{'manufacturer_name': ['Jazz Pharmaceuticals, ...",NDA,"INJECTION, POWDER, LYOPHILIZED, FOR SUSPENSION",31000140-b8f4-4608-903a-d755b8067c68,HUMAN PRESCRIPTION DRUG,20170803,68727-745_31000140-b8f4-4608-903a-d755b8067c68,NDA209401,VYXEOS,,"[{'name': 'CYTARABINE', 'strength': '100 mg/20...",INTRAVENOUS,['Anthracycline Topoisomerase Inhibitor [EPC]'...,,,True,"['CYTARABINE', 'DAUNORUBICIN']","['100 mg/20mL', '44 mg/20mL']",209401,,,,,,,,,,,,,,,43.234,328599.7,7600.491715,,,,0.0,0.0
3,2019-07-01,(daunorubicin and cytarabine) liposome-injecti...,0.0,0.0,37.59,284597.7,2.0,0,1.0,0.0,0.0,0.0,2.0,1.0,"Jazz Pharmaceuticals, Inc.",1,State Drug Utilization Data 2019,68727,745,2019,3,68727-745,(DAUNORUBICIN AND CYTARABINE) LIPOSOME,VYXEOS,True,"[{'package_ndc': '68727-745-01', 'description'...",20221231.0,"{'manufacturer_name': ['Jazz Pharmaceuticals, ...",NDA,"INJECTION, POWDER, LYOPHILIZED, FOR SUSPENSION",31000140-b8f4-4608-903a-d755b8067c68,HUMAN PRESCRIPTION DRUG,20170803,68727-745_31000140-b8f4-4608-903a-d755b8067c68,NDA209401,VYXEOS,,"[{'name': 'CYTARABINE', 'strength': '100 mg/20...",INTRAVENOUS,['Anthracycline Topoisomerase Inhibitor [EPC]'...,,,True,"['CYTARABINE', 'DAUNORUBICIN']","['100 mg/20mL', '44 mg/20mL']",209401,,,,,,,,,,,,,,,37.59,284597.7,7571.101676,,,,0.0,0.0
4,2019-10-01,(daunorubicin and cytarabine) liposome-injecti...,0.0,0.0,8.204,15679.96,3.0,0,1.0,0.0,0.0,0.0,3.0,1.0,"Jazz Pharmaceuticals, Inc.",1,State Drug Utilization Data 2019,68727,745,2019,4,68727-745,(DAUNORUBICIN AND CYTARABINE) LIPOSOME,VYXEOS,True,"[{'package_ndc': '68727-745-01', 'description'...",20221231.0,"{'manufacturer_name': ['Jazz Pharmaceuticals, ...",NDA,"INJECTION, POWDER, LYOPHILIZED, FOR SUSPENSION",31000140-b8f4-4608-903a-d755b8067c68,HUMAN PRESCRIPTION DRUG,20170803,68727-745_31000140-b8f4-4608-903a-d755b8067c68,NDA209401,VYXEOS,,"[{'name': 'CYTARABINE', 'strength': '100 mg/20...",INTRAVENOUS,['Anthracycline Topoisomerase Inhibitor [EPC]'...,,,True,"['CYTARABINE', 'DAUNORUBICIN']","['100 mg/20mL', '44 mg/20mL']",209401,,,,,,,,,,,,,,,8.204,15679.96,1911.257854,,,,0.0,0.0


# Descriptive of Data

Counting number of unique drugs

In [93]:
df_temp = df_reg

In [94]:
df_temp_count = df_temp.groupby(['date'])['unique_id'].count()
df_temp_count = df_temp_count.reset_index()
print('Tabel: ', df_temp_count.head())
print('Quarterly Average: ', df_temp_count['unique_id'].mean())
print('Total: ', df_temp['unique_id'].nunique())
print('Total (generic_name): ', df_temp['generic_name'].nunique())

Tabel:           date  unique_id
0  1991-01-01        148
1  1991-04-01        159
2  1991-07-01        165
3  1991-10-01        165
4  1992-01-01        181
Quarterly Average:  1599.6446280991736
Total:  2886
Total (generic_name):  1155


# Regression for generic paradox

## For org. labeler

In [108]:
df_temp = df_reg
len(df_temp)

193557

Filtering on orig. labeler

In [109]:
df_temp = df_temp[df_temp['marketing_category'] == "NDA"]
len(df_temp)

122378

Calculating the log of price per unit

In [110]:
df_temp['ln_price_per_unit_adj'] = np.log(df_temp['price_per_unit_adj'])

Calculating the share of units reimbursed for the org. labeler

In [111]:
df_temp['org_share_units_reimbursed_sum'] = 1 - df_temp['generic_share_units_reimbursed_sum']

Change the format of the date to a integer

In [112]:
df_temp['date_int'] = df_temp['date'].str.replace('-', '')
df_temp['date_int'] = df_temp['date_int'].apply(pd.to_numeric)

Setting the index

In [113]:
df_OLS_ndc_gp = df_temp.set_index(['unique_id', 'date_int'])

Estimating the regression

In [114]:
df_OLS_ndc_gp.dtypes

date                                              object
generic_units_reimbursed_sum                     float64
generic_total_amount_reimbursed_adj_sum          float64
units_reimbursed_sum                             float64
total_amount_reimbursed_adj_sum                  float64
                                                  ...   
running_count_event                              float64
generic_share_units_reimbursed_sum               float64
generic_share_total_amount_reimbursed_adj_sum    float64
ln_price_per_unit_adj                            float64
org_share_units_reimbursed_sum                   float64
Length: 69, dtype: object

In [115]:
df_OLS_ndc_gp['running_count_generics'] = pd.to_numeric(df_OLS_ndc_gp['running_count_generics'], errors='coerce').fillna(0, downcast='infer')
df_OLS_ndc_gp['running_count_unique_substitute'] = pd.to_numeric(df_OLS_ndc_gp['running_count_unique_substitute'], errors='coerce').fillna(0, downcast='infer')
df_OLS_ndc_gp['org_share_units_reimbursed_sum'] = pd.to_numeric(df_OLS_ndc_gp['org_share_units_reimbursed_sum'], errors='coerce').fillna(0, downcast='infer')
df_OLS_ndc_gp['running_count_from_second_entrance'] = pd.to_numeric(df_OLS_ndc_gp['running_count_from_second_entrance'], errors='coerce').fillna(0, downcast='infer')

In [122]:
endog = df_OLS_ndc_gp['ln_price_per_unit_adj']
exog_vars = ['org_share_units_reimbursed_sum'] # ['running_count_generics', 'running_count_unique_substitute', 'org_share_units_reimbursed_sum', 'running_count_from_second_entrance']
exog = sm.add_constant(df_OLS_ndc_gp[exog_vars])

mod = PooledOLS(endog, exog)
pooled_res = mod.fit()
print(pooled_res)

                            PooledOLS Estimation Summary                           
Dep. Variable:     ln_price_per_unit_adj   R-squared:                        0.0033
Estimator:                     PooledOLS   R-squared (Between):             -0.0429
No. Observations:                 122378   R-squared (Within):              -0.0306
Date:                   Tue, Mar 22 2022   R-squared (Overall):              0.0033
Time:                           21:50:26   Log-likelihood                -2.729e+05
Cov. Estimator:               Unadjusted                                           
                                           F-statistic:                      407.14
Entities:                           2886   P-value                           0.0000
Avg Obs:                          42.404   Distribution:                F(1,122376)
Min Obs:                          1.0000                                           
Max Obs:                          190.00   F-statistic (robust):            

# Regression for probability of entrance

### Second entrance

In [123]:
df_temp = df_reg

Assign a dummy for second entrance

In [124]:
df_temp['second_entrance'] = df_temp['labeler_name_count'].apply(lambda x: 1 if x == 2 else 0)
df_temp['second_entrance'] = df_temp['second_entrance'].apply(pd.to_numeric)

Calculating the log of the sum of total amount reimbursed

In [125]:
df_temp['ln_total_amount_reimbursed_adj_sum'] = np.log(df_temp['total_amount_reimbursed_adj_sum'])

Change the format of the date to a integer

In [126]:
df_temp['date'] = df_temp['date'].str.replace('-', '')
df_temp['date'] = df_temp['date'].apply(pd.to_numeric)

Setting index

In [127]:
df_OLS = df_temp.set_index(['unique_id', 'date'])

Estimating the regression

In [128]:
exog_vars = ['ln_total_amount_reimbursed_adj_sum']
exog = sm.add_constant(df_OLS[exog_vars])
endog = df_OLS['second_entrance']

mod = PooledOLS(endog, exog)
pooled_res = mod.fit()
print(pooled_res)

                          PooledOLS Estimation Summary                          
Dep. Variable:        second_entrance   R-squared:                      4.81e-07
Estimator:                  PooledOLS   R-squared (Between):             -0.2692
No. Observations:              193557   R-squared (Within):            4.417e-05
Date:                Tue, Mar 22 2022   R-squared (Overall):            4.81e-07
Time:                        21:50:43   Log-likelihood                 -6.99e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      0.0931
Entities:                        2886   P-value                           0.7603
Avg Obs:                       67.068   Distribution:                F(1,193555)
Min Obs:                       1.0000                                           
Max Obs:                       612.00   F-statistic (robust):             0.0931
                            

### Third entrance

In [54]:
df_temp = df_reg

Assign a dummy for second entrance

In [55]:
df_temp['third_entrance'] = df_temp['labeler_name_count'].apply(lambda x: 1 if x == 3 else 0)
df_temp['third_entrance'] = df_temp['third_entrance'].apply(pd.to_numeric)

Calculating the log of the sum of total amount reimbursed

In [56]:
df_temp['ln_total_amount_reimbursed_adj_sum'] = np.log(df_temp['total_amount_reimbursed_adj_sum'])

Setting index

In [57]:
df_OLS = df_temp.set_index(['unique_id', 'date'])

Estimating the regression

In [58]:
exog_vars = ['ln_total_amount_reimbursed_adj_sum']
exog = sm.add_constant(df_OLS[exog_vars])
endog = df_OLS['second_entrance']

mod = PooledOLS(endog, exog)
pooled_res = mod.fit()
print(pooled_res)

                          PooledOLS Estimation Summary                          
Dep. Variable:        second_entrance   R-squared:                        0.0011
Estimator:                  PooledOLS   R-squared (Between):             -0.2013
No. Observations:              225135   R-squared (Within):              -0.0011
Date:                Fri, Mar 18 2022   R-squared (Overall):              0.0011
Time:                        12:24:48   Log-likelihood                -7.836e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      242.12
Entities:                        3012   P-value                           0.0000
Avg Obs:                       74.746   Distribution:                F(1,225133)
Min Obs:                       1.0000                                           
Max Obs:                       1255.0   F-statistic (robust):             242.12
                            