# wongds_prescriber_rx
A Python-based approach for a Tufts Biomedical Data Science Club project to determine whether patterns exist between prescriber demographic information and opioid prescription.

## Datasets
- Prescriber attributes: https://www.kaggle.com/roamresearch/prescriptionbasedprediction
- Opiate prescriptions: https://www.kaggle.com/apryor6/us-opiate-prescriptions

## Packages/Libraries
### basemap
Tool for plotting maps in Python. To install, first install the GEOS library (http://trac.osgeo.org/osgeo4w/), then run:
```
pip install https://github.com/matplotlib/basemap/archive/v1.0.7rel.tar.gz
```

In [1]:
# Load some useful libraries
import csv
import jsonlines
import copy
from pprint import pprint
import pandas as pd
import numpy as np

In [2]:
# Load data
# Paths to the three CSV files that make up the opiate dataset
opiate_class_data, opiate_prescriber_data, opiate_overdose_data = (
    'dataset_rx/opioids.csv',
    'dataset_rx/prescriber-info.csv',
    'dataset_rx/overdoses.csv',
)

# Read each CSV into its own DataFrame
opioids = pd.read_csv(opiate_class_data)                    # list of opiates
opiate_prescribers = pd.read_csv(opiate_prescriber_data)    # opiate prescriber info
opiate_overdoses = pd.read_csv(opiate_overdose_data)        # opiate overdoses, by US state

In [62]:
# Prescriber attribute dataset (JSON Lines file, read one object at a time)
prescriber_attr_dataset = 'dataset_prescriber/roam_prescription_based_prediction.jsonl'

# Collect two views of the file in a single pass:
#   prescriber_attr_data - every record, kept whole, in file order
#   prescriber_var_data  - column-style dict: 'npi' plus one list per
#                          provider variable encountered in the records
prescriber_attr_data = []
prescriber_var_data = {'npi': []}
with jsonlines.open(prescriber_attr_dataset) as reader:
    for record in reader:
        prescriber_attr_data.append(record)
        prescriber_var_data['npi'].append(record['npi'])
        for var_name, var_value in record['provider_variables'].items():
            # setdefault creates the list the first time a variable is seen
            prescriber_var_data.setdefault(var_name, []).append(var_value)



In [83]:
# Tabulate the column-style dict of provider variables: each key becomes a column
prescriber_info_df = pd.DataFrame.from_dict(prescriber_var_data)

In [4]:
# Find which prescribers in prescriber_attr_data have an opioid among their
# prescription counts.
# Every (opioid, prescriber) hit is recorded: the opioid by its 'Drug Name'
# and the prescriber by npi. Generic names also appear in the 'Drug Name' list.
_opioid_hits = [
    (opioid_name, record['npi'])
    for record in prescriber_attr_data
    for opioid_name in opioids['Drug Name']
    if opioid_name in record['cms_prescription_counts']
]

# Split the hits into two parallel columns
opioid_prescribers = {
    'Drug Name': [opioid_name for opioid_name, _ in _opioid_hits],
    'npi': [npi for _, npi in _opioid_hits],
}

# Turn the columns into a dataframe and write it out as csv.
# This dataframe is what links prescriber attributes to opioid prescription.
opioid_prescriber_df = pd.DataFrame(opioid_prescribers)
opioid_prescriber_df.to_csv('opioid_prescribers.csv')

#print(opioids)
#print(opioids['Drug Name'])
#print(opioids['Generic Name'])
#print(prescriber_data[0]['cms_prescription_counts'].keys())

In [104]:
# Distinct NPI numbers among the opioid prescribers
prescriber_npi = set(opioid_prescriber_df["npi"])

# Empty column-style container: 'npi' plus one list per provider variable,
# with the variable names taken from the first prescriber record
prescriber_info = {
    'npi': [],
    **{var_name: [] for var_name in prescriber_attr_data[0]['provider_variables']},
}

# Look up the provider attributes for one example NPI and show them as a
# dict of lists (the bare expression is displayed as the notebook cell output)
id_num = '1780689786'
npi_matches = prescriber_info_df['npi'] == id_num
prescriber_info_df.loc[npi_matches, ].to_dict('list')



#for id_num in prescriber_npi:
#    for key, value in prescriber_info_df.loc[prescriber_info_df['npi'] == id_num, ].to_dict('list'):
#        prescriber_info[key].append[value[0]]

{'1295088748', '1568504660', '1114029451', '1003905860', '1255394482', '1558550210', '1467577304', '1003921099', '1922073568', '1881672681', '1295959724', '1174518872', '1497732655', '1265403752', '1528337219', '1760426548', '1215134267', '1093700981', '1861645830', '1194713669', '1417147877', '1669552642', '1831334028', '1538329990', '1932116001', '1487611505', '1861540726', '1356428700', '1316124084', '1689651036', '1184600512', '1972528479', '1568626265', '1487661336', '1508952045', '1629103759', '1225065204', '1750499653', '1225066608', '1679589550', '1396957544', '1124046776', '1881660363', '1871788067', '1134335672', '1033484563', '1124057476', '1578530705', '1235195595', '1255339917', '1811986243', '1356320600', '1811903149', '1811910284', '1568512507', '1891794277', '1710182209', '1134192909', '1689958902', '1124249750', '1407858632', '1710972948', '1770522989', '1699878769', '1316993199', '1750517512', '1336201037', '1912907437', '1023113784', '1437185337', '1205980984', '1710

In [None]:
# Don't do this. The dataset is huge. Total of 239,930 prescribers.
# Set up column names for Pandas DataFrame
#column_names = ['drug_name', 'count', 'npi']
#column_names.extend(list(prescriber_data[0]['provider_variables'].keys()))

# Set up the dataframe
#prescriptions = pd.DataFrame(np.nan, index=[], columns=column_names)
#print(prescriptions)

# Re-format JSONL data to Pandas dataframe for CSV, one line for each prescriber/drug prescribed combination
#for prescriber_record in prescriber_data:
    # Prescriber information
#    prescriber_info = {'npi': prescriber_record['npi']}
#    prescriber_info.update(prescriber_record['provider_variables'])
    
    # Drug information
    #drug_names = list(prescriber_record['cms_prescription_counts'].keys())
    #drug_counts = list(prescriber_record['cms_prescription_counts'].values())
    
    # Make a dictionary of lists to hold data for this provider
    ## Expand prescriber data so each drug prescribed is attached to a row of data containing prescriber info
#    drug_data = {'drug_name': [],
#                 'count': [],
#                 'npi': [],
#                 'settlement_type': [],
#                 'region': [],
#                 'specialty': [],
#                 'gender': [],
#                 'generic_rx_count': [],
#                 'brand_name_rx_count': [],
#                 'years_practicing': []
#                }
    ## Fill the dictionary lists
#    for dname, dcount in prescriber_record['cms_prescription_counts'].items():
        # Drug information
#        drug_data['drug_name'].append(dname)
#        drug_data['count'].append(dcount)
        
        # Prescriber information
#        for key in prescriber_info.keys():
#            drug_data[key].append(prescriber_info[key])
            
    # Use the dictionary to make a dataframe
#    provider_df = pd.DataFrame(drug_data)
    #print(provider_df)
    
    # Add the dataframe for this provider to the end of the larger prescriptions dataframe
#    prescriptions = pd.concat([prescriptions, provider_df], ignore_index=True)
#    prescriptions.to_csv('prescriptions.csv')

#prescriptions.to_csv('prescriptions.csv')
#print(prescriptions)
    
#    The code below is too inefficient for this dataset
#    for drug_name, count in entry['cms_prescription_counts'].items():
#        data_row = {column_names[0]: drug_name,
#                    column_names[1]: count,
#                    column_names[2]: prescriber_info['npi'],
#                    column_names[3]: prescriber_info['settlement_type'],
#                    column_names[4]: prescriber_info['region'],
#                    column_names[5]: prescriber_info['specialty'],
#                    column_names[6]: prescriber_info['gender'],
#                    column_names[7]: prescriber_info['generic_rx_count'],
#                    column_names[8]: prescriber_info['brand_name_rx_count'],
#                    column_names[9]: prescriber_info['years_practicing'],
#                   }
        #print(data_row, column_names)
#        dataframe_row = pd.DataFrame(data_row,
#                                     index=[1],
#                                     columns=column_names)
        #print(dataframe_row)
#        prescriptions = pd.concat([prescriptions, dataframe_row], ignore_index=True)
#prescriptions.to_csv('prescriptions.csv')        
#print(prescriptions)

In [None]:
# Slice the data in a few different ways
# List all of the drugs prescribed: maps each drug name to its count summed
# over every record in prescriber_attr_data.
drugs_prescribed = {}
for entry in prescriber_attr_data:
    for drug_name, count in entry['cms_prescription_counts'].items():
        # .get(..., 0) starts a drug's tally at zero the first time it is seen,
        # so no exception handling is needed for new drug names.
        drugs_prescribed[drug_name] = drugs_prescribed.get(drug_name, 0) + count
            

In [None]:
# Number of prescriber records loaded from the JSONL file
len(prescriber_attr_data)


In [None]:
# Build a dictionary of drugs prescribed, with values being npi numbers for providers who prescribed
