In [19]:
## Import modules 

import pandas as pd
from sklearn import preprocessing
from Functions_AMR_gonorrhea import encoder_for_GISP
from sklearn.preprocessing import OneHotEncoder


## read data 
CIP_data = pd.read_csv("CIP_Resistant_disagregated.csv")

print(CIP_data.columns)

Index(['Unnamed: 0', 'CLINIC', 'YEAR', 'GENDERSP', 'Susceptible'], dtype='object')


In [20]:
## Categories for location of clinic
west = ['POR', 'PHX', 'HON', 'SDG', 'SFO', 'ANC', 'SEA', 'DEN', 'LVG', 'ORA', 'LBC', 'SLC', 'LAX']
southwest = ['OKC','MIN', 'ALB', 'DAL']
midwest = ['KCY','CHI', 'PON', 'CIN', 'JAC', 'IND', 'STL','DTR', 'MIL', 'COL', 'CLE']
southeast = ['GRB', 'NOR','WDC','MIA', 'BHM','FBG','ATL', 'RIC']
northeast = ['BUF','BOS', 'CAM', 'NYC', 'BAL', 'PHI']

### Add "Region" column
CIP_data['REGION'] = CIP_data['CLINIC'].apply(lambda x: 
    'West' if (x in west) else (
        'Southwest' if (x in southwest) else(
            'Midwest' if (x in midwest) else(
                'Southeast' if (x in southeast) else(
                    'Northeast' if (x in northeast) else 'Other'))))) 



In [21]:
## One-hot encoding 
CIP_data_encoded_gend_region = encoder_for_GISP(CIP_data, 'GENDERSP')
CIP_data_encoded_gend_region = encoder_for_GISP(CIP_data_encoded_gend_region, 'REGION')

In [22]:
## Prevalence by region and clinic
regions = CIP_data_encoded_gend_region["REGION"].unique()
years = CIP_data_encoded_gend_region["YEAR"].unique()
for year in years:
    CIP_data_year = CIP_data_encoded_gend_region.loc[CIP_data_encoded_gend_region['YEAR'] == year]
    suscep_by_region = (CIP_data_year.groupby(by = ["REGION"])["Susceptible"].sum())
    suscep_by_clinic = (CIP_data_year.groupby(by = ["CLINIC"])["Susceptible"].sum())
    clinics = CIP_data_year["CLINIC"].unique() ## each year surveyed different clinics
    prevalence_by_clinic = {}
    prevalence_by_region = {}
    for region in regions:
            prevalence_by_region[region] = 1 - suscep_by_region[region]/len(CIP_data_year[CIP_data_year.REGION == region])
            if year == 2000:
                CIP_data_encoded_gend_region.loc[(CIP_data_encoded_gend_region["YEAR"] == 2000) & (CIP_data_encoded_gend_region["REGION"] == region), "PREV_REGION"] = 0
                CIP_data_encoded_gend_region.loc[(CIP_data_encoded_gend_region["YEAR"] == 2001) & (CIP_data_encoded_gend_region["REGION"] == region), "PREV_REGION"] = prevalence_by_region[region]

            else:
                CIP_data_encoded_gend_region.loc[(CIP_data_encoded_gend_region["YEAR"] == year + 1) & (CIP_data_encoded_gend_region["REGION"] == region), "PREV_REGION"] = prevalence_by_region[region] ### Add 1 to enter in next year's prevalence

    for clinic in clinics:
            prevalence_by_clinic[clinic] = 1 - suscep_by_clinic[clinic]/len(CIP_data_year[CIP_data_year.CLINIC == clinic])
            if year == 2000:
                CIP_data_encoded_gend_region.loc[(CIP_data_encoded_gend_region["YEAR"] == 2000) & (CIP_data_encoded_gend_region["CLINIC"] == clinic), "PREV_CLINIC"] = 0
                CIP_data_encoded_gend_region.loc[(CIP_data_encoded_gend_region["YEAR"] == 2001) & (CIP_data_encoded_gend_region["CLINIC"] == clinic), "PREV_CLINIC"] = prevalence_by_clinic[clinic]

            else:
                CIP_data_encoded_gend_region.loc[(CIP_data_encoded_gend_region["YEAR"] == year + 1) & (CIP_data_encoded_gend_region["CLINIC"] == clinic), "PREV_CLINIC"] = prevalence_by_clinic[clinic]

CIP_data_encoded_gend_region["PREV_CLINIC"] = CIP_data_encoded_gend_region["PREV_CLINIC"].fillna(0) # gets rid of issues where previous year the clinic wasn't monitored


In [23]:
## Change in prevalence by region and clinic 

for year in years:
    CIP_data_year = CIP_data_encoded_gend_region.loc[CIP_data_encoded_gend_region['YEAR'] == year]
    clinics = CIP_data_year["CLINIC"].unique()
    suscep_by_clinic_current = (CIP_data_year.groupby(by = ["CLINIC"])["Susceptible"].sum())
    suscep_by_region_current = (CIP_data_year.groupby(by = ["REGION"])["Susceptible"].sum())

    prevalence_by_clinic_current = {}
    prevalence_by_region_current = {}

    for region in regions:
            prevalence_by_region_current[region] = 1 - suscep_by_region_current[region]/len(CIP_data_year[CIP_data_year.REGION == region])
            if year == 2000:
                CIP_data_encoded_gend_region.loc[(CIP_data_encoded_gend_region["YEAR"] == 2000) & (CIP_data_encoded_gend_region["REGION"] == region), "DELTA_REGION"] = 0
                #CIP_data_encoded_gend_region.loc[(CIP_data_encoded_gend_region["YEAR"] == 2001) & (CIP_data_encoded_gend_region["REGION"] == region), "DELTA_REGION"] = prevalence_by_region_current[region]

            else:
                CIP_data_encoded_gend_region.loc[(CIP_data_encoded_gend_region["YEAR"] == year) & (CIP_data_encoded_gend_region["REGION"] == region), "DELTA_REGION"] = prevalence_by_region_current[region] - prevalence_by_region_previous[region]
    for clinic in clinics:
            prevalence_by_clinic_current[clinic] = 1 - suscep_by_clinic_current[clinic]/len(CIP_data_year[CIP_data_year.CLINIC == clinic])
            if year == 2000:
                CIP_data_encoded_gend_region.loc[(CIP_data_encoded_gend_region["YEAR"] == 2000) & (CIP_data_encoded_gend_region["CLINIC"] == clinic), "DELTA_CLINIC"] = 0

            else:
                try: 
                    CIP_data_encoded_gend_region.loc[(CIP_data_encoded_gend_region["YEAR"] == year) & (CIP_data_encoded_gend_region["CLINIC"] == clinic), "DELTA_CLINIC"] = prevalence_by_clinic_current[clinic] - prevalence_by_clinic_previous[clinic]
                except KeyError:
                     CIP_data_encoded_gend_region.loc[(CIP_data_encoded_gend_region["YEAR"] == year) & (CIP_data_encoded_gend_region["CLINIC"] == clinic), "DELTA_CLINIC"] = 0
    
    prevalence_by_region_previous = prevalence_by_region_current

    prevalence_by_clinic_previous = prevalence_by_clinic_current


In [24]:
### Standardize the continuous variables

X_train_continuous = CIP_data_encoded_gend_region[["PREV_REGION", "PREV_REGION", "DELTA_REGION", "DELTA_CLINIC"]]
scaler = preprocessing.StandardScaler().fit(X_train_continuous)
CIP_data_encoded_gend_region[["PREV_REGION", "PREV_REGION", "DELTA_REGION", "DELTA_CLINIC"]] = scaler.transform(X_train_continuous)

In [25]:
# See whether estimates are based on >75 observations
counts = CIP_data_encoded_gend_region.groupby(['YEAR', 'CLINIC']).size().reset_index(name='Count')

counts['Count_Exceeds_75'] = (counts['Count'] > 75) * 1


counts = counts.sort_values(['CLINIC', 'YEAR'], ascending=True)
counts['Trend_N_greater_75'] = 0

counts['Trend_N_greater_75'] = counts['Count_Exceeds_75'].shift(-1)

counts.loc[counts['CLINIC'] != counts['CLINIC'].shift(-1), 'Trend_N_greater_75'] = 0
 
CIP_data_encoded_gend_region = CIP_data_encoded_gend_region.merge(counts[['YEAR', 'CLINIC', 'Count_Exceeds_75', 'Trend_N_greater_75']], on=['YEAR', 'CLINIC'], how='left')


In [26]:
CIP_data_encoded_gend_region

Unnamed: 0.1,Unnamed: 0,CLINIC,YEAR,GENDERSP,Susceptible,REGION,MSM,MSMW,MSW,Oth/Unk/Missing,...,Northeast,Southeast,Southwest,West,PREV_REGION,PREV_CLINIC,DELTA_REGION,DELTA_CLINIC,Count_Exceeds_75,Trend_N_greater_75
0,0,ALB,2000,MSW,1,Southwest,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,-1.153862,0.0,-0.629490,-0.290402,1,1.0
1,0,ALB,2000,MSW,1,Southwest,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,-1.153862,0.0,-0.629490,-0.290402,1,1.0
2,0,ALB,2000,MSW,1,Southwest,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,-1.153862,0.0,-0.629490,-0.290402,1,1.0
3,0,ALB,2000,MSW,1,Southwest,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,-1.153862,0.0,-0.629490,-0.290402,1,1.0
4,0,ALB,2000,MSW,1,Southwest,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,-1.153862,0.0,-0.629490,-0.290402,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112482,1968,WDC,2019,MSMW,0,Southeast,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.410956,0.5,0.676319,-1.731806,1,0.0
112483,1968,WDC,2019,MSMW,0,Southeast,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.410956,0.5,0.676319,-1.731806,1,0.0
112484,1968,WDC,2019,MSMW,0,Southeast,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.410956,0.5,0.676319,-1.731806,1,0.0
112485,1968,WDC,2019,MSMW,0,Southeast,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.410956,0.5,0.676319,-1.731806,1,0.0


In [27]:
## Write data
CIP_data_encoded_gend_region.to_csv("CIP_data_encode_prev_not_dropped.csv")