In [4]:
## Import modules 
import seaborn as sb
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder

## read data 
CIP_data = pd.read_csv("CIP_Resistant_disagregated.csv")

print(CIP_data.columns)

Index(['Unnamed: 0', 'CLINIC', 'YEAR', 'GENDERSP', 'Susceptible'], dtype='object')


In [6]:
## Multivariable model with X1 = CLINIC and X2 = GENDERSP
X1 = CIP_data[["CLINIC"]]
X2 = CIP_data[["GENDERSP"]]
y = CIP_data[["Susceptible"]]


encoder_gen = OneHotEncoder()
encoder_categories_gend  = encoder_gen.fit(CIP_data[['GENDERSP']]).categories_
encoder_categories_gend  = encoder_categories_gend[0].tolist()
encoder_df_gend  = pd.DataFrame(encoder_gen.fit_transform(CIP_data[['GENDERSP']]).toarray())
CIP_data_encoded_gend = CIP_data.join(encoder_df_gend)
print(CIP_data.columns)
print(encoder_categories_gend[0:])
col_names = list(CIP_data.columns) + encoder_categories_gend[0:]
#print(col_names)
CIP_data_encoded_gend.columns = col_names
#
print(CIP_data_encoded_gend.head())


Index(['Unnamed: 0', 'CLINIC', 'YEAR', 'GENDERSP', 'Susceptible'], dtype='object')
['MSM', 'MSMW', 'MSW', 'Oth/Unk/Missing']
   Unnamed: 0 CLINIC  YEAR GENDERSP  Susceptible  MSM  MSMW  MSW  \
0           0    ALB  2000      MSW            1  0.0   0.0  1.0   
1           0    ALB  2000      MSW            1  0.0   0.0  1.0   
2           0    ALB  2000      MSW            1  0.0   0.0  1.0   
3           0    ALB  2000      MSW            1  0.0   0.0  1.0   
4           0    ALB  2000      MSW            1  0.0   0.0  1.0   

   Oth/Unk/Missing  
0              0.0  
1              0.0  
2              0.0  
3              0.0  
4              0.0  


In [7]:
west = ['POR', 'PHX', 'HON', 'SDG', 'SFO', 'ANC', 'SEA', 'DEN', 'LVG', 'ORA', 'LBC', 'SLC', 'LAX']
southwest = ['OKC','MIN', 'ALB', 'DAL']
midwest = ['KCY','CHI', 'PON', 'CIN', 'JAC', 'IND', 'STL','DTR', 'MIL', 'COL', 'CLE']
southeast = ['GRB', 'NOR','WDC','MIA', 'BHM','FBG','ATL', 'RIC']
northeast = ['BUF','BOS', 'CAM', 'NYC', 'BAL', 'PHI']

### One-hot encoding 
CIP_data_encoded_gend['REGION'] = CIP_data_encoded_gend['CLINIC'].apply(lambda x: 
    'West' if (x in west) else (
        'Southwest' if (x in southwest) else(
            'Midwest' if (x in midwest) else(
                'Southeast' if (x in southeast) else(
                    'Northeast' if (x in northeast) else 'Other'))))) 

#print(CIP_data_encoded_gend.head())


encoder_region = OneHotEncoder()
encoder_categories_region  = encoder_region.fit(CIP_data_encoded_gend[['REGION']]).categories_
encoder_categories_region  = encoder_categories_region[0].tolist()
encoder_df_region = pd.DataFrame(encoder_region.fit_transform(CIP_data_encoded_gend[['REGION']]).toarray())
CIP_data_encoded_gend_region = CIP_data_encoded_gend.join(encoder_df_region)
col_names = list(CIP_data_encoded_gend.columns) + encoder_categories_region[0:]
#print(col_names)
CIP_data_encoded_gend_region.columns = col_names
print(CIP_data_encoded_gend_region.head())

   Unnamed: 0 CLINIC  YEAR GENDERSP  Susceptible  MSM  MSMW  MSW  \
0           0    ALB  2000      MSW            1  0.0   0.0  1.0   
1           0    ALB  2000      MSW            1  0.0   0.0  1.0   
2           0    ALB  2000      MSW            1  0.0   0.0  1.0   
3           0    ALB  2000      MSW            1  0.0   0.0  1.0   
4           0    ALB  2000      MSW            1  0.0   0.0  1.0   

   Oth/Unk/Missing     REGION  Midwest  Northeast  Southeast  Southwest  West  
0              0.0  Southwest      0.0        0.0        0.0        1.0   0.0  
1              0.0  Southwest      0.0        0.0        0.0        1.0   0.0  
2              0.0  Southwest      0.0        0.0        0.0        1.0   0.0  
3              0.0  Southwest      0.0        0.0        0.0        1.0   0.0  
4              0.0  Southwest      0.0        0.0        0.0        1.0   0.0  


In [9]:
##### NB here the "Susceptible" means "Susceptible to CIPRO", so 1-susceptible is resistance 
suscep_by_region = (CIP_data_encoded_gend_region.groupby(by = ["REGION"])["Susceptible"].sum())
print(suscep_by_region)

regions = CIP_data_encoded_gend_region["REGION"].unique()
years = CIP_data_encoded_gend_region["YEAR"].unique()
prevalence_by_region = {}
for region in regions:
    print(region)
    print(suscep_by_region[region])
    prevalence_by_region[region] = suscep_by_region[region]/len(CIP_data_encoded_gend_region["REGION"] == region)
#/CIP_data_encoded_gend_region.groupby(by = ["REGION"])["Susceptible"].size()

print(prevalence_by_region)

REGION
Midwest      23475
Northeast    11155
Southeast    17545
Southwest    12839
West         31175
Name: Susceptible, dtype: int64
Southwest
12839
West
31175
Southeast
17545
Northeast
11155
Midwest
23475
{'Southwest': 0.11413763368211438, 'West': 0.27714313654022243, 'Southeast': 0.15597357916914845, 'Northeast': 0.09916701485504992, 'Midwest': 0.2086907820459253}


In [10]:
regions = CIP_data_encoded_gend_region["REGION"].unique()
years = CIP_data_encoded_gend_region["YEAR"].unique()
for year in years:
    CIP_data_year = CIP_data_encoded_gend_region.loc[CIP_data_encoded_gend_region['YEAR'] == year]
    suscep_by_region = (CIP_data_year.groupby(by = ["REGION"])["Susceptible"].sum())
    
    prevalence_by_region = {}
    for region in regions:
            prevalence_by_region[region] = 1 - suscep_by_region[region]/len(CIP_data_year[CIP_data_year.REGION == region])
            #CIP_data_encoded_gend_region["PREV_REGION"] = np.where((CIP_data_encoded_gend_region["YEAR"] == year) & (CIP_data_encoded_gend_region["REGION"] == region), prevalence_by_region[region])
            if year == 2000:
                CIP_data_encoded_gend_region.loc[(CIP_data_encoded_gend_region["YEAR"] == 2000) & (CIP_data_encoded_gend_region["REGION"] == region), "PREV_REGION"] = 0
                CIP_data_encoded_gend_region.loc[(CIP_data_encoded_gend_region["YEAR"] == 2001) & (CIP_data_encoded_gend_region["REGION"] == region), "PREV_REGION"] = prevalence_by_region[region]

            else:
                CIP_data_encoded_gend_region.loc[(CIP_data_encoded_gend_region["YEAR"] == year + 1) & (CIP_data_encoded_gend_region["REGION"] == region), "PREV_REGION"] = prevalence_by_region[region] ### Add 1 to enter in next year's prevalence


print(CIP_data_encoded_gend_region["PREV_REGION"])

0         0.000000
1         0.000000
2         0.000000
3         0.000000
4         0.000000
            ...   
112482    0.282092
112483    0.282092
112484    0.282092
112485    0.282092
112486    0.282092
Name: PREV_REGION, Length: 112487, dtype: float64


In [11]:
### Add column for prevalence by clinic

#clinics = CIP_data_encoded_gend_region["CLINIC"].unique() not every clinic is every year 
years = CIP_data_encoded_gend_region["YEAR"].unique()

for year in years:
    CIP_data_year = CIP_data_encoded_gend_region.loc[CIP_data_encoded_gend_region['YEAR'] == year]
    clinics = CIP_data_year["CLINIC"].unique()

    suscep_by_clinic = (CIP_data_year.groupby(by = ["CLINIC"])["Susceptible"].sum())
    prevalence_by_clinic = {}
    for clinic in clinics:
            prevalence_by_clinic[clinic] = 1 - suscep_by_clinic[clinic]/len(CIP_data_year[CIP_data_year.CLINIC == clinic])
            #CIP_data_encoded_gend_region["PREV_REGION"] = np.where((CIP_data_encoded_gend_region["YEAR"] == year) & (CIP_data_encoded_gend_region["REGION"] == region), prevalence_by_region[region])
            if year == 2000:
                CIP_data_encoded_gend_region.loc[(CIP_data_encoded_gend_region["YEAR"] == 2000) & (CIP_data_encoded_gend_region["CLINIC"] == clinic), "PREV_CLINIC"] = 0
                CIP_data_encoded_gend_region.loc[(CIP_data_encoded_gend_region["YEAR"] == 2001) & (CIP_data_encoded_gend_region["CLINIC"] == clinic), "PREV_CLINIC"] = prevalence_by_clinic[clinic]

            else:
                CIP_data_encoded_gend_region.loc[(CIP_data_encoded_gend_region["YEAR"] == year + 1) & (CIP_data_encoded_gend_region["CLINIC"] == clinic), "PREV_CLINIC"] = prevalence_by_clinic[clinic]

CIP_data_encoded_gend_region["PREV_CLINIC"] = CIP_data_encoded_gend_region["PREV_CLINIC"].fillna(0) # gets rid of issues where previous year the clinic wasn't monitored
print(CIP_data_encoded_gend_region["PREV_CLINIC"])

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
112482    0.5
112483    0.5
112484    0.5
112485    0.5
112486    0.5
Name: PREV_CLINIC, Length: 112487, dtype: float64


In [12]:
### write new data 
print(CIP_data_encoded_gend_region["PREV_CLINIC"].isnull().values.any())
CIP_data_encoded = CIP_data_encoded_gend_region ## reassign for sake of straighforwardness 

#CIP_data_encoded.to_csv("CIP_data_encode_prev_not_dropped.csv")

False


In [11]:
## add change in prevalence year on year in clinic 
CIP_data_encoded_gend_region = pd.read_csv("CIP_data_encode_prev_not_dropped.csv")



#clinics = CIP_data_encoded_gend_region["CLINIC"].unique() not every clinic is every year 
years = CIP_data_encoded_gend_region["YEAR"].unique()

for year in years:
    CIP_data_year = CIP_data_encoded_gend_region.loc[CIP_data_encoded_gend_region['YEAR'] == year]
    regions = CIP_data_year["REGION"].unique()

    suscep_by_region_current = (CIP_data_year.groupby(by = ["REGION"])["Susceptible"].sum())
    prevalence_by_region_current = {}
    for region in regions:
            prevalence_by_region_current[region] = 1 - suscep_by_region_current[region]/len(CIP_data_year[CIP_data_year.REGION == region])
            if year == 2000:
                CIP_data_encoded_gend_region.loc[(CIP_data_encoded_gend_region["YEAR"] == 2000) & (CIP_data_encoded_gend_region["REGION"] == region), "DELTA_CLINIC"] = 0
                CIP_data_encoded_gend_region.loc[(CIP_data_encoded_gend_region["YEAR"] == 2001) & (CIP_data_encoded_gend_region["REGION"] == region), "DELTA_CLINIC"] = prevalence_by_region_current[region]

            else:
                CIP_data_encoded_gend_region.loc[(CIP_data_encoded_gend_region["YEAR"] == year + 1) & (CIP_data_encoded_gend_region["REGION"] == region), "DELTA_CLINIC"] = prevalence_by_region_current[region] - prevalence_by_region_previous[region]
    prevalence_by_region_previous = prevalence_by_region_current

In [12]:
print(CIP_data_encoded_gend_region["DELTA_CLINIC"])

0         0.000000
1         0.000000
2         0.000000
3         0.000000
4         0.000000
            ...   
112482   -0.016862
112483   -0.016862
112484   -0.016862
112485   -0.016862
112486   -0.016862
Name: DELTA_CLINIC, Length: 112487, dtype: float64
