# Libraries

In [None]:
import requests

import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None # removing some warnings
pd.set_option('display.max_columns', None) # display all columns in DF


import matplotlib.pyplot as plt
import matplotlib.dates as mdate
import seaborn as sns

from ast import literal_eval

import time
import datetime as dt
import math

# Loading Data

In [1042]:
df_reg_org = pd.read_csv('raw_consolidation_data.csv', sep = '~')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [1043]:
df_temp = df_reg_org.loc[df_reg_org['is_original_packager'] == True]

In [1044]:
df_temp = df_reg_org

In [1045]:
len(df_temp)

515574

In [1046]:
df_temp_count = df_temp.groupby(['date'])['unique_id'].count()
df_temp_count = df_temp_count.reset_index()
print('Tabel: ', df_temp_count.head())
print('Quarterly Average: ', df_temp_count['unique_id'].mean())
print('Total: ', df_temp['unique_id'].nunique())
print('Total (generic_name): ', df_temp['generic_name'].nunique())
print('Total (labeler): ', df_temp['labeler_name'].nunique())

Tabel:           date  unique_id
0  1991-01-01        544
1  1991-04-01        568
2  1991-07-01        580
3  1991-10-01        582
4  1992-01-01        612
Quarterly Average:  4260.94214876033
Total:  7214
Total (generic_name):  2117
Total (labeler):  567


# Data manipulation

Removing labeller that are original packager

In [1047]:
print(len(df_reg_org))
df_reg = df_reg_org.loc[df_reg_org['is_original_packager'] == True]
print(len(df_reg))

515574
496657


### Assigning order of entrance for each unique drug

In [1048]:
df_temp = df_reg.sort_values(by = ['unique_id', 'date', 'marketing_category'], ascending = [False, True, False], ignore_index = True) # Sorting

df_temp = df_temp.drop_duplicates(subset = ['unique_id', 'labeler_name'], keep = 'first') # Finding the first entrance for each labeller 

df_temp['labeler_name_count'] = df_temp.groupby((df_temp['unique_id'] != df_temp['unique_id'].shift(1)).cumsum()).cumcount() + 1 # Running count for each drung

df_reg = pd.merge(df_temp[['unique_id', 'labeler_name', 'labeler_name_count']], df_reg, on = ['unique_id', 'labeler_name'], how = 'right') # Merging with org. data

In [1049]:
print(len(df_reg))

496657


### Assigning a dummy for first entrance and NDA

In [1050]:
df_temp = df_reg.loc[(df_reg['labeler_name_count'] == 1) & (df_reg['marketing_category'] == 'NDA')] # Filtering

df_temp = df_temp.sort_values(by = ['date', 'unique_id'], ascending = [True, False], ignore_index = True) # Sorting
df_temp = df_temp.drop_duplicates(subset = ['unique_id'], keep = 'first') # Removing dublicates

df_temp['first_nda'] = 1 # Adding dummy

df_reg = pd.merge(df_temp[['unique_id', 'first_nda']], df_reg, on = ['unique_id'], how = 'right') # Merging with reg. data

df_reg['first_nda'] = df_reg['first_nda'].fillna(0)

In [1051]:
print(len(df_reg))

496657


Removing unique drug where the first entrance is not a NDA

In [1052]:
df_reg = df_reg.loc[(df_reg['first_nda'] == 1)]

In [1053]:
len(df_reg)

201875

### Assigning a running count for each unique drug from the start

In [1054]:
df_temp = df_reg.drop_duplicates(subset = ['date', 'unique_id']) # Removing dublicates
df_temp = df_temp[['date', 'unique_id']]
df_temp = df_temp.sort_values(by = ['unique_id', 'date'])

df_temp['running_count_from_start'] = df_temp.groupby((df_temp['unique_id'] != df_temp['unique_id'].shift(1)).cumsum()).cumcount() + 1 # Running count for each drung

df_reg = pd.merge(df_temp[['date', 'unique_id', 'running_count_from_start']], df_reg, on = ['date', 'unique_id',], how = 'right')

In [1055]:
len(df_reg)

201875

### Assigning a running count for each unique drug from second entrance

In [1056]:
df_temp = df_reg[df_reg['labeler_name_count'] >= 2]

df_temp = df_temp.drop_duplicates(subset = ['date', 'unique_id']) # Removing dublicates
df_temp = df_temp[['date', 'unique_id']]
df_temp = df_temp.sort_values(by = ['unique_id', 'date'])

df_temp['running_count_from_second_entrance'] = df_temp.groupby((df_temp['unique_id'] != df_temp['unique_id'].shift(1)).cumsum()).cumcount() + 1 # Running count for each drug

df_reg = pd.merge(df_temp[['date', 'unique_id', 'running_count_from_second_entrance']], df_reg, on = ['date', 'unique_id',], how = 'right')

df_reg['running_count_from_second_entrance'] = df_reg['running_count_from_second_entrance'].fillna(0)

In [1057]:
len(df_reg)

201875

### Assigning a total count of the labelers at the start (to exclude unique drug with multiple labelers from the start)

In [1058]:
df_temp = df_reg.loc[(df_reg['running_count_from_start'] == 1)]
          
df_temp = df_temp.groupby(['unique_id'])['first_nda'].sum()
df_temp = df_temp.reset_index()
df_temp = df_temp.rename(columns = {"first_nda": "number_of_first_entrance"})

df_temp = df_temp.loc[(df_temp['number_of_first_entrance'] > 1)]

df_reg = pd.merge(df_temp[['unique_id', 'number_of_first_entrance']], df_reg, on = ['unique_id'], how = 'right') # Merging with reg. data

df_reg['number_of_first_entrance'] = df_reg['number_of_first_entrance'].fillna(1)

Removing unique drug which has multiple entrances at start

In [1059]:
df_reg = df_reg.loc[(df_reg['number_of_first_entrance'] == 1)]

In [1060]:
len(df_reg)

193215

### Assigning a running count of generic labelers

In [1061]:
df_temp = df_reg[['date', 'unique_id', 'labeler_name', 'labeler_name_count']]

df_temp = df_temp.groupby(['date', 'unique_id'])

df_temp = df_temp.max()

df_temp = df_temp.reset_index()

df_temp['running_count_generics'] = df_temp['labeler_name_count'] - 1 # Minus by one because org. labeler do not count

df_reg = pd.merge(df_temp[['date', 'unique_id', 'running_count_generics']], df_reg, on = ['date', 'unique_id'], how = 'right') # Merging with org. data

### Assigning dummies for generic entries

In [1062]:
df_temp = df_reg[['date', 'unique_id']]
df_temp['labeler_name_count_2_dummy'] = df_reg['running_count_generics'].apply(lambda x: 1 if (x > 0)  else 0)
df_temp['labeler_name_count_3_dummy'] = df_reg['running_count_generics'].apply(lambda x: 1 if (x > 1)  else 0)
df_temp['labeler_name_count_4_dummy'] = df_reg['running_count_generics'].apply(lambda x: 1 if (x > 2)  else 0)
df_temp['labeler_name_count_5_dummy'] = df_reg['running_count_generics'].apply(lambda x: 1 if (x > 3)  else 0)
df_temp['labeler_name_count_6_dummy'] = df_reg['running_count_generics'].apply(lambda x: 1 if (x > 4)  else 0)
df_temp['labeler_name_count_7_dummy'] = df_reg['running_count_generics'].apply(lambda x: 1 if (x > 5)  else 0)
df_temp['labeler_name_count_8_dummy'] = df_reg['running_count_generics'].apply(lambda x: 1 if (x > 6)  else 0)
df_temp['labeler_name_count_9_dummy'] = df_reg['running_count_generics'].apply(lambda x: 1 if (x > 7)  else 0)
df_temp['labeler_name_count_10_dummy'] = df_reg['running_count_generics'].apply(lambda x: 1 if (x > 8)  else 0)

In [1063]:
df_temp = df_temp.groupby(['date', 'unique_id'], as_index = False)['labeler_name_count_2_dummy', 'labeler_name_count_3_dummy', 'labeler_name_count_4_dummy', 'labeler_name_count_5_dummy', 'labeler_name_count_6_dummy', 'labeler_name_count_7_dummy', 'labeler_name_count_8_dummy', 'labeler_name_count_9_dummy', 'labeler_name_count_10_dummy'].sum()

number = 0
for (column_name, column_value) in df_temp.iteritems():
    number = number + 1
    if number > 2:
        df_temp[column_name] = np.where(df_temp[column_name] >= 1, 1, 0)

  df_temp = df_temp.groupby(['date', 'unique_id'], as_index = False)['labeler_name_count_2_dummy', 'labeler_name_count_3_dummy', 'labeler_name_count_4_dummy', 'labeler_name_count_5_dummy', 'labeler_name_count_6_dummy', 'labeler_name_count_7_dummy', 'labeler_name_count_8_dummy', 'labeler_name_count_9_dummy', 'labeler_name_count_10_dummy'].sum()


In [1064]:
df_reg = pd.merge(df_temp[['date', 'unique_id', 'labeler_name_count_2_dummy', 'labeler_name_count_3_dummy', 'labeler_name_count_4_dummy', 'labeler_name_count_5_dummy', 'labeler_name_count_6_dummy', 'labeler_name_count_7_dummy', 'labeler_name_count_8_dummy', 'labeler_name_count_9_dummy', 'labeler_name_count_10_dummy']], df_reg, on = ['date', 'unique_id'], how = 'right') # Merging with reg. data

In [1065]:
len(df_reg)

193215

### Assigning a running count of substitutes labelers

Unpacking pharm class to obtain EPC and MoA

In [1066]:
df_temp = pd.DataFrame(df_reg[['unique_id', 'pharm_class']])

df_temp = df_temp.drop_duplicates(subset = ['unique_id', 'pharm_class'])

df_temp = df_temp.dropna(subset = ['pharm_class'])
df_temp['pharm_class'] = df_temp['pharm_class'].apply(literal_eval)
df_temp = df_temp.explode('pharm_class')

df_temp['pharm_class_type'] = df_temp['pharm_class'].apply(lambda x: str(x)[-5:])
df_temp['pharm_class_type'] = df_temp['pharm_class_type'].str.replace(r'[][]', '', regex=True)

df_temp_EPC = df_temp[df_temp['pharm_class_type'] == 'EPC']
df_temp_MoA = df_temp[df_temp['pharm_class_type'] == 'MoA']

df_temp_EPC = df_temp_EPC.rename(columns = {"pharm_class": "pharm_class_EPC"})
df_temp_MoA = df_temp_MoA.rename(columns = {"pharm_class": "pharm_class_MoA"})

df_temp_EPC = df_temp_EPC.drop_duplicates(subset = ['unique_id', 'pharm_class_EPC'])
df_temp_MoA = df_temp_MoA.drop_duplicates(subset = ['unique_id', 'pharm_class_MoA'])

df_pharm_class_type = pd.merge(df_temp_EPC, df_temp_MoA,  on = 'unique_id', how = 'left')

df_pharm_class_type = df_pharm_class_type.dropna(subset = ['pharm_class_EPC'])
df_pharm_class_type = df_pharm_class_type.dropna(subset = ['pharm_class_MoA'])

Adding route

In [1067]:
df_openFDA_NDC = pd.read_csv('raw_openFDA_NDC_data.csv', sep = '~')

In [1068]:
df_temp = df_openFDA_NDC.drop_duplicates(subset = ['unique_id'])

df_temp = df_temp.dropna(subset = ['route'])

df_pharm_class_type = pd.merge(df_pharm_class_type, df_temp[['unique_id', 'route']],  on = 'unique_id', how = 'left')

Adding ATC (level 2)

In [1069]:
df_ATC = pd.read_csv('raw_consolidation_data_for_ATC.csv', sep = '~')

In [1070]:
df_temp = df_ATC.drop_duplicates(subset = ['unique_id'])

df_temp = df_temp.dropna(subset = ['ATC (level 2)'])

df_pharm_class_type = pd.merge(df_pharm_class_type, df_temp[['unique_id', 'ATC (level 2)']],  on = 'unique_id', how = 'left')

Creating a unique id for substitutes

In [1071]:
df_pharm_class_type['unique_substitute'] = df_pharm_class_type['ATC (level 2)'].astype(str) + '-' + df_pharm_class_type['pharm_class_EPC'].astype(str) + '-' + df_pharm_class_type['route'].astype(str) + '-' + df_pharm_class_type['pharm_class_MoA'].astype(str)
df_pharm_class_type = df_pharm_class_type[['unique_id', 'unique_substitute']]

df_pharm_class_type = df_pharm_class_type.drop_duplicates(subset = ['unique_id'])

Merging with org. data

In [1072]:
df_reg = pd.merge(df_reg, df_pharm_class_type[['unique_id', 'unique_substitute']],  on = 'unique_id', how = 'left')

Calculating number of substitutes

In [1073]:
df_temp = df_reg.groupby(['date', 'unique_substitute'])['unique_id'].agg('nunique')
df_temp = df_temp.reset_index()

df_temp['running_count_unique_substitute'] = df_temp['unique_id'] - 1

df_reg = pd.merge(df_reg, df_temp[['date', 'unique_substitute', 'running_count_unique_substitute']],  on = ['date', 'unique_substitute'], how = 'left')

In [1074]:
len(df_reg)

193215

### Assigning a running count where the first entrance is the starting point

Finding the running count from the start for unique drugs

In [1075]:
df_temp = df_reg[['date', 'unique_id', 'running_count_from_start', 'running_count_from_second_entrance']]

df_temp['col_temp'] = df_temp['running_count_from_second_entrance'].map(lambda x: True if (x == 1.0)  else False)

df_temp = df_temp[df_temp['col_temp'] == True]
df_temp = df_temp.drop(columns=['col_temp'])
df_temp = df_temp.drop_duplicates(subset=['unique_id'])
df_temp = df_temp.rename(columns = {"running_count_from_start": "col_temp"})
df_temp = df_temp[['unique_id', 'col_temp']]

Merging with org. data

In [1076]:
df_reg = pd.merge(df_temp[['unique_id', 'col_temp']], df_reg, on = ['unique_id',], how = 'right')

Calculationg the running count

In [1077]:
df_reg['col_temp'] = df_reg['col_temp'].fillna(0)
df_reg['col_temp'] = df_reg['running_count_from_start'] - df_reg['col_temp']

df_reg['running_count_event'] = df_reg['col_temp'].where(df_reg['col_temp'] != df_reg['running_count_from_start'])

df_reg = df_reg.drop(columns=['col_temp'])

In [1078]:
len(df_reg)

193215

In [1079]:
df_temp = df_reg.groupby(['unique_id']).agg({'running_count_event': [np.min,np.max]})
df_temp = df_temp.reset_index()
df_temp = df_temp.droplevel(0, axis=1) 

df_temp = df_temp.rename(columns = {"": "unique_id", "amin": "min_quarter_before_second_entrance", "amax": "max_quarter_before_second_entrance"})
df_temp = df_temp.drop_duplicates(subset = ['unique_id'])

Merging with org. data

In [1080]:
df_reg = pd.merge(df_temp[['unique_id', 'min_quarter_before_second_entrance', 'max_quarter_before_second_entrance']], df_reg, on = ['unique_id',], how = 'right')

### Assigning a dummy for 2006 (change in Medicare which affect Medicaid)

In [1081]:
df_reg['dummy_2006'] = df_reg['year'].apply(lambda x: 1 if x == 2006 else 0)

### Calculating the quarterly total amount reimbursed and units reimbursed for all labelers

In [1082]:
df_temp = df_reg.set_index(['date', 'unique_id']).groupby(level = ['date', 'unique_id'])[['units_reimbursed', 'total_amount_reimbursed_adj']].agg('sum')
df_temp = df_temp.reset_index()

df_temp = df_temp.rename(columns = {"units_reimbursed": "units_reimbursed_sum", "total_amount_reimbursed_adj": "total_amount_reimbursed_adj_sum"})

df_reg = pd.merge(df_temp[['date', 'unique_id', 'units_reimbursed_sum', 'total_amount_reimbursed_adj_sum']], df_reg, on = ['date', 'unique_id',], how = 'right')

### Calculating the generic labelers' share of quarterly total amount reimbursed and units reimbursed

In [1083]:
df_temp = df_reg[df_reg['labeler_name_count'] >= 2]

df_temp = df_temp.set_index(['date', 'unique_id']).groupby(level = ['date', 'unique_id'])[['units_reimbursed', 'total_amount_reimbursed_adj']].agg('sum')
df_temp = df_temp.reset_index()

df_temp = df_temp.rename(columns = {"units_reimbursed": "generic_units_reimbursed_sum", "total_amount_reimbursed_adj": "generic_total_amount_reimbursed_adj_sum"})

df_reg = pd.merge(df_temp[['date', 'unique_id', 'generic_units_reimbursed_sum', 'generic_total_amount_reimbursed_adj_sum']], df_reg, on = ['date', 'unique_id',], how = 'right')

df_reg['generic_units_reimbursed_sum'] = df_reg['generic_units_reimbursed_sum'].fillna(0)
df_reg['generic_total_amount_reimbursed_adj_sum'] = df_reg['generic_total_amount_reimbursed_adj_sum'].fillna(0)

In [1084]:
df_reg['generic_share_units_reimbursed_sum'] = df_reg['generic_units_reimbursed_sum'] / df_reg['units_reimbursed_sum']
df_reg['generic_share_total_amount_reimbursed_adj_sum'] = df_reg['generic_total_amount_reimbursed_adj_sum'] / df_reg['total_amount_reimbursed_adj_sum']

df_reg['generic_share_units_reimbursed_sum'] = df_reg['generic_share_units_reimbursed_sum'].fillna(0)
df_reg['generic_share_total_amount_reimbursed_adj_sum'] = df_reg['generic_share_total_amount_reimbursed_adj_sum'].fillna(0)

...

# Downloading Data

In [4]:
df_reg.to_csv('output_regression_org.csv', sep = '~', index = False)

NameError: name 'df_reg' is not defined

# Loading Data

In [86]:
df_reg = pd.read_csv('output_regression_org.csv', sep = '~')

  exec(code_obj, self.user_global_ns, self.user_ns)


# Descriptive of Data

Counting number of unique drugs

In [4]:
df_temp = df_reg

In [5]:
df_temp = df_temp.dropna(subset = ['running_count_event'])

In [6]:
len(df_temp)

135076

In [7]:
print('Total of drug: ', df_temp['unique_id'].count())

Total of drug:  135076


In [8]:
df_temp_count = df_temp.groupby(['date'])['unique_id'].count()
df_temp_count = df_temp_count.reset_index()
print('Tabel: ', df_temp_count.head())
print('Quarterly Average: ', df_temp_count['unique_id'].mean())
print('Total of unique drug: ', df_temp['unique_id'].nunique())
print('Total of unique generic producers: ', df_temp['generic_name'].nunique())
print('Total of unique brand name producers: ', df_temp['labeler_name'].nunique())

Tabel:           date  unique_id
0  1991-01-01         93
1  1991-04-01         96
2  1991-07-01         96
3  1991-10-01         96
4  1992-01-01        105
Quarterly Average:  1116.3305785123966
Total of unique drug:  1005
Total of unique generic producers:  391
Total of unique brand name producers:  289


### Balanced data: Number of entrance with more than one generic producer

In [34]:
df_temp = df_reg
df_temp = df_temp.dropna(subset = ['running_count_event'])
df_temp = df_temp[(df_temp['min_quarter_before_second_entrance'] <= -4) & (df_temp['max_quarter_before_second_entrance'] >= 16)]
df_temp = df_temp[df_temp['running_count_event'] == 0]

In [35]:
print(df_temp['unique_id'].nunique())

621


In [31]:
df_temp = df_temp[df_temp['running_count_generics']  >= 2]
print(df_temp['unique_id'].nunique())

108


### Balanced data: Number of drugs and producers

In [87]:
df_reg_trans = pd.read_csv('output_regression_transform.csv', sep = '~')

In [88]:
df_temp = df_reg_trans
df_temp = df_temp.dropna(subset = ['running_count_event'])
df_temp = df_temp[(df_temp['min_quarter_before_second_entrance'] <= -4) & (df_temp['max_quarter_after_second_entrance'] >= 60)]
#df_temp = df_temp[df_temp['running_count_event'] == 60]

In [89]:
print(df_temp['unique_id'].nunique())

84


In [90]:
df_temp['running_count_generics'].mean() + 1

2.7912678782664804

# Outputting data for regression

In [7]:
df_temp = df_reg

Assigning a dummy for brand-name and generic price

In [60]:
df_temp['producer_type'] = df_temp['labeler_name_count'].apply(lambda x: 'Brand-name producer' if x == 1 else 'Generic producer')

Removing brand-name drugs without competition based on "running_count_event"

In [61]:
df_temp = df_temp.dropna(subset = ['running_count_event'])

Grouping by

In [62]:
df_temp = df_temp.groupby(['date', 'year', 'quarter', 'unique_id', 'producer_type', 'labeler_name_count', 'running_count_event', 'min_quarter_before_second_entrance', 'max_quarter_before_second_entrance'], as_index = False)[['price_per_unit_adj']].mean()

Pivotting the data

In [63]:
df_temp = df_temp.pivot(index = ['date', 'year', 'quarter', 'unique_id', 'running_count_event', 'min_quarter_before_second_entrance', 'max_quarter_before_second_entrance'], columns = 'labeler_name_count')['price_per_unit_adj']
df_temp = df_temp.reset_index()
df_temp = df_temp.rename_axis(None, axis=1)
df_temp = df_temp.rename(columns = {"producer_type": "index", "Brand-name producer": "price_per_unit_adj_b", "Generic producer": "price_per_unit_adj_g", "max_quarter_before_second_entrance": "max_quarter_after_second_entrance"})

Adding additional variable from org. data

In [64]:
df_reg = pd.read_csv('output_regression_org.csv', sep = '~')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [65]:
df_reg_temp = df_reg[['date', 'unique_id', 'units_reimbursed_sum', 'generic_units_reimbursed_sum', 'running_count_generics', 'running_count_unique_substitute', 'labeler_name_count_2_dummy', 'labeler_name_count_3_dummy', 'labeler_name_count_4_dummy', 'labeler_name_count_5_dummy', 'labeler_name_count_6_dummy', 'labeler_name_count_7_dummy', 'labeler_name_count_8_dummy', 'labeler_name_count_9_dummy', 'labeler_name_count_10_dummy', 'A01', 'A02', 'A03', 'A04', 'A05', 'A06', 'A07', 'A08', 'A10', 'A11', 'A12', 'A14', 'A16', 'B01', 'B02', 'B03', 'B05', 'B06', 'C01', 'C02', 'C03', 'C04', 'C05', 'C07', 'C08', 'C09', 'C10', 'D01', 'D04', 'D05', 'D06', 'D07', 'D08', 'D10', 'D11', 'G01', 'G02', 'G03', 'G04', 'H01', 'H02', 'H03', 'H04', 'H05', 'J01', 'J02', 'J04', 'J05', 'J06', 'L01', 'L02', 'L03', 'L04', 'M01', 'M03', 'M04', 'M05', 'M09', 'N01', 'N02', 'N03', 'N04', 'N05', 'N06', 'N07', 'P01', 'P02', 'P03', 'R01', 'R03', 'R05', 'R06', 'R07', 'S01', 'V03', 'V04', 'V08']]
df_reg_temp = df_reg_temp.drop_duplicates(subset = ['date', 'unique_id'])
df_reg_trans = pd.merge(df_temp, df_reg_temp, on = ['date', 'unique_id'], how = 'left')

### Downloading Data

In [15]:
df_reg_trans.to_excel("output_regression_transform_excel.xlsx") 

In [16]:
df_reg_trans.to_csv('output_regression_transform.csv', sep = '~', index = False)

In [17]:
df_reg_trans.to_stata('data_reg.dta', version=117)

C:\Users\sebas\anaconda3\lib\site-packages\pandas\io\stata.py:2397: InvalidColumnName: 
Not all pandas column names were valid Stata variable names.
The following replacements have been made:

    min_quarter_before_second_entrance   ->   min_quarter_before_second_entran
    max_quarter_after_second_entrance   ->   max_quarter_after_second_entranc
    1   ->   _1
    2   ->   _2
    3   ->   _3
    4   ->   _4
    5   ->   _5
    6   ->   _6
    7   ->   _7
    8   ->   _8
    9   ->   _9
    10   ->   _10
    11   ->   _11
    12   ->   _12
    13   ->   _13
    14   ->   _14
    15   ->   _15
    16   ->   _16
    17   ->   _17
    18   ->   _18
    19   ->   _19
    20   ->   _20
    21   ->   _21

If this is not what you expect, please make sure you have Stata-compliant
column names in your DataFrame (strings only, max 32 characters, only
alphanumerics and underscores, no Stata reserved words)



# Output for welfare analysis

In [72]:
df_reg.to_csv('output_regression_org.csv', sep = '~', index = False)

In [73]:
df_temp = df_reg

In [74]:
df_temp = df_temp.dropna(subset = ['running_count_event'])
df_temp = df_temp[(df_temp['min_quarter_before_second_entrance'] <= -4) & (df_temp['max_quarter_before_second_entrance'] >= 16)]
df_temp['producer_type'] = df_temp['labeler_name_count'].apply(lambda x: 'Brand-name producer' if x == 1 else 'Generic producer')

In [76]:
df_temp_price = df_temp.groupby(['labeler_name_count', 'running_count_event'], as_index = False)['price_per_unit_adj'].mean()

In [78]:
df_temp_price = df_temp_price.pivot(index = ['running_count_event'], columns = 'labeler_name_count')['price_per_unit_adj']
df_temp_price = df_temp_price.reset_index()
df_temp_price = df_temp_price.rename_axis(None, axis=1)

In [79]:
df_temp_price.head()

Unnamed: 0,running_count_event,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
0,-101.0,2.954721,,,,,,,,,,,,,,,,,,,,
1,-100.0,2.743967,,,,,,,,,,,,,,,,,,,,
2,-99.0,2.941796,,,,,,,,,,,,,,,,,,,,
3,-98.0,5.85299,,,,,,,,,,,,,,,,,,,,
4,-97.0,5.039009,,,,,,,,,,,,,,,,,,,,


In [80]:
df_temp_quantity = df_temp_quantity.groupby(['running_count_event'], as_index = False)['units_reimbursed'].sum()

In [81]:
df_wel = pd.merge(df_temp_price, df_temp_quantity, on = ['running_count_event'], how = 'inner') # Merging with reg. data

In [82]:
df_wel = df_wel[(df_wel['running_count_event'] >= -4) & (df_wel['running_count_event'] <= 16)]

In [85]:
df_wel.to_excel("output_welfare_excel.xlsx") 

# Control of Final Data

In [20]:
df_temp = df_reg_trans

Number of obs. in the effect window

In [32]:
df_temp = df_temp[(df_temp['running_count_event'] >= -20) & (df_temp['running_count_event'] <= 40)]
len(df_temp)

41307

Number of obs. with brand name price (model 1)

In [29]:
len(df_temp.dropna(subset = ['price_per_unit_adj_b']))

38024

Number of obs. with brand name price and ATC (model 1)

In [28]:
len(df_temp.dropna(subset = ['price_per_unit_adj_b', 'A01']))

37194

Number of obs. with brand name and generic price (model 2)

In [37]:
len(df_temp.dropna(subset = ['price_per_unit_adj_b', 'price_per_unit_adj_g']))

20120

Number of obs. with brand name and generic price, and ATC (model 2)

In [36]:
len(df_temp.dropna(subset = ['price_per_unit_adj_b', 'price_per_unit_adj_g', 'A01']))

19676

Number of obs. with brand name, price, ATC, and dummies (model 3)

In [40]:
len(df_temp.dropna(subset = ['price_per_unit_adj_b', 'labeler_name_count_2_dummy', 'A01']))

37194

In [41]:
len(df_temp.dropna(subset = ['price_per_unit_adj_g', 'labeler_name_count_2_dummy', 'A01']))

22889

Counting number of unique drugs

In [50]:
df_temp = df_reg

In [51]:
df_temp = df_temp[(df_temp['running_count_event'] >= -20) & (df_temp['running_count_event'] <= 40)]
len(df_temp)

96593

In [52]:
df_temp_count = df_temp.groupby(['date'])['unique_id'].count()
df_temp_count = df_temp_count.reset_index()
print('Tabel: ', df_temp_count.head())
print('Quarterly Average: ', df_temp_count['unique_id'].mean())
print('Total of unique drug: ', df_temp['unique_id'].nunique())
print('Total of unique generic producers: ', df_temp['generic_name'].nunique())
print('Total of unique brand name producers: ', df_temp['labeler_name'].nunique())

Tabel:           date  unique_id
0  1991-01-01         15
1  1991-04-01         17
2  1991-07-01         17
3  1991-10-01         17
4  1992-01-01         18
Quarterly Average:  798.2892561983471
Total of unique drug:  1005
Total of unique generic producers:  391
Total of unique brand name producers:  278


# Descriptive of Final Data

## For ATC

In [9]:
df_descriptive_act = df_reg_trans

Filtering the variables and setting the range as well as normalizing

In [10]:
df_descriptive_act = df_descriptive_act[['running_count_event', 'unique_id', 'price_per_unit_adj_b', 'price_per_unit_adj_g']]
df_descriptive_act = df_descriptive_act[(df_descriptive_act['running_count_event'] >= 0) & (df_descriptive_act['running_count_event'] <= 40)]
df_descriptive_act = df_descriptive_act[df_descriptive_act['running_count_event'] != -1]

Loading the consolidated data for ATC

In [11]:
df_atc = pd.read_csv('raw_consolidation_data_for_ATC.csv', sep = '~')

Merging the transposed and ATC data

In [12]:
df_descriptive_act = pd.merge(df_descriptive_act, df_atc[['unique_id', 'ATC (level 2)']], on = ['unique_id'], how = 'left')

Grouping the data

In [13]:
df_descriptive_act = df_descriptive_act.groupby(['ATC (level 2)', 'unique_id', 'running_count_event'], as_index = False)['price_per_unit_adj_b', 'price_per_unit_adj_g'].sum()

  df_descriptive_act = df_descriptive_act.groupby(['ATC (level 2)', 'unique_id', 'running_count_event'], as_index = False)['price_per_unit_adj_b', 'price_per_unit_adj_g'].sum()


### Brand Name Price

Pivotting the data to ensure the correct format

In [14]:
df_descriptive_act_brand_name = df_descriptive_act.pivot(index = ['unique_id', 'running_count_event'], columns = 'ATC (level 2)')['price_per_unit_adj_b']
df_descriptive_act_brand_name = df_descriptive_act_brand_name.reset_index()

Describing the data and changing the format

In [15]:
df_descriptive_act_brand_name = df_descriptive_act_brand_name.describe()
df_descriptive_act_brand_name = df_descriptive_act_brand_name.drop(columns = ['running_count_event'])
df_descriptive_act_brand_name = df_descriptive_act_brand_name.T
df_descriptive_act_brand_name = df_descriptive_act_brand_name.round(2)

Printing the describing the data as latex-table

In [16]:
print(df_descriptive_act_brand_name.to_latex(index = True)) 

\begin{tabular}{lrrrrrrrr}
\toprule
{} &   count &    mean &     std &     min &     25\% &     50\% &     75\% &      max \\
ATC (level 2) &         &         &         &         &         &         &         &          \\
\midrule
A01           &   317.0 &    4.79 &    8.51 &    0.00 &    0.21 &    1.01 &    6.30 &    70.52 \\
A02           &   236.0 &    7.30 &    5.53 &    0.00 &    2.87 &    5.09 &   12.89 &    40.75 \\
A04           &    51.0 &  385.08 &  478.22 &    0.00 &  143.89 &  210.65 &  351.21 &  2358.95 \\
A06           &    42.0 &   10.71 &   32.77 &    0.00 &    0.51 &    5.90 &   10.42 &   215.48 \\
A07           &   198.0 &    7.89 &   15.50 &    0.00 &    0.70 &    5.38 &    6.89 &   137.55 \\
A10           &   321.0 &    6.61 &    7.49 &    0.00 &    0.72 &    2.68 &   12.96 &    61.70 \\
A11           &    36.0 &    5.67 &    4.49 &    1.37 &    1.99 &    4.23 &    7.63 &    21.15 \\
A12           &   302.0 &    1.32 &    2.00 &    0.00 &    0.09 &    0.60 &    1.

### Gerenic Price

Following the same approach as before

In [17]:
df_descriptive_act_gerenic = df_descriptive_act.pivot(index = ['unique_id', 'running_count_event'], columns = 'ATC (level 2)')['price_per_unit_adj_g']
df_descriptive_act_gerenic = df_descriptive_act_gerenic.reset_index()
df_descriptive_act_gerenic = df_descriptive_act_gerenic.describe()
df_descriptive_act_gerenic = df_descriptive_act_gerenic.drop(columns = ['running_count_event'])
df_descriptive_act_gerenic = df_descriptive_act_gerenic.T
df_descriptive_act_gerenic = df_descriptive_act_gerenic.round(2)
print(df_descriptive_act_gerenic.to_latex(index = True)) 

\begin{tabular}{lrrrrrrrr}
\toprule
{} &   count &    mean &     std &    min &     25\% &     50\% &     75\% &      max \\
ATC (level 2) &         &         &         &        &         &         &         &          \\
\midrule
A01           &   317.0 &    2.89 &    4.91 &   0.04 &    0.32 &    0.84 &    2.73 &    24.74 \\
A02           &   236.0 &    4.91 &   15.64 &   0.38 &    1.21 &    1.85 &    3.46 &   145.73 \\
A04           &    51.0 &  150.51 &   60.53 &  42.81 &  106.08 &  144.32 &  192.59 &   259.81 \\
A06           &    42.0 &    2.35 &    4.05 &   0.00 &    0.00 &    0.10 &    3.12 &    15.18 \\
A07           &   198.0 &    6.38 &   12.75 &   0.26 &    0.48 &    3.82 &    4.93 &    97.19 \\
A10           &   321.0 &    2.32 &    3.25 &   0.14 &    0.32 &    0.65 &    3.42 &    27.37 \\
A11           &    36.0 &    5.87 &    1.11 &   3.47 &    4.98 &    5.87 &    6.85 &     7.49 \\
A12           &   302.0 &    1.01 &    1.46 &   0.00 &    0.25 &    0.57 &    1.11 &     7