In [43]:
%%javascript

// Register 'r' in the command-shortcut table so that pressing it runs every
// cell of the notebook.
Jupyter.keyboard_manager.command_shortcuts.add_shortcut('r', {
    help : 'run all cells',
    help_index : 'zz',
    handler : function (event) {
        // NOTE(review): the handler uses the `IPython` global while the
        // registration above uses `Jupyter` — confirm both names exist in the
        // notebook front-end this is run against.
        IPython.notebook.execute_all_cells();
        return false;
    }}
);

<IPython.core.display.Javascript object>

In [44]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#from loess.Loess import Loess
from sklearn.neighbors.kde import KernelDensity
import statsmodels.api as sm
import seaborn as sns
import loess
import random
import os

In [45]:
# Virus transmission temperature window: 60 F to 95 F, converted to Kelvin.
# NOTE(review): the conversion adds 273 rather than 273.15; the aggregation
# cells further down subtract the same 273, so the offset is kept unchanged.
def _fahrenheit_to_kelvin(temp_f):
    """Convert a Fahrenheit temperature to Kelvin using the notebook's 273 offset."""
    return (temp_f-32)*5/9 + 273

# a = lower bound, b = upper bound of the window (both in Kelvin)
a, b = (_fahrenheit_to_kelvin(limit_f) for limit_f in (60, 95))

In [46]:
# Location of the CSV file holding the WNV data
wnv_file_path = (
    "/Users/sparshagarwal/Downloads/WMV_data/Arbovirus_risk_modeling_US/"
    "WNV_human_cases/"
    "WNV_NI_NNI_1999to2015_prevalence_incidence_final_20180530.csv"
)

In [47]:
# Two-digit month codes "01".."12", as used in the weather file/column names
months = ["%02d" % month_number for month_number in range(1, 13)]
# Study years 1999..2015 (the end of the range is exclusive)
years = range(1999, 2016)
# Summer months for WNV transmission: May ("05") through September ("09")
summer_mon = months[4:9]
# Month code -> three-letter month abbreviation
months_desc = dict(zip(
    months,
    ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
     "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"],
))

In [48]:
# Socio-economic zones/regions: region id -> state postal codes in that region
_socio_states = {
    1: ["WA", "OR", "CA"],
    2: ["MT", "ID", "WY", "NV", "UT", "CO", "AZ", "NM"],
    3: ["ND", "SD", "MN", "NE", "IA", "KS", "MO"],
    4: ["OK", "TX", "AR", "LA"],
    5: ["WI", "MI", "IL", "IN", "OH"],
    6: ["KY", "TN", "AL", "MS"],
    7: ["WV", "VA", "NC", "SC", "DC", "MD", "DE", "GA", "FL"],
    8: ["PA", "NJ", "NY"],
    9: ["MA", "CT", "NH", "VT", "ME", "RI"],
}
# Inverted lookup used by the rest of the notebook: state postal code -> region id
socio_regions = {
    state: region
    for region, states in _socio_states.items()
    for state in states
}
# Region id -> human-readable region name
socio_key = {
    1: "Pacific",
    2: "Mountain",
    3: "West North Central",
    4: "West South Central",
    5: "East North Central",
    6: "East South Central",
    7: "South Atlantic",
    8: "Middle Atlantic",
    9: "New England",
}

In [49]:
# Climatic zones/regions: region id -> state postal codes in that region
_weather_states = {
    1: ["WA", "OR", "ID"],
    2: ["NV", "CA"],
    3: ["MT", "WY", "ND", "SD", "NE"],
    4: ["UT", "CO", "AZ", "NM"],
    5: ["MN", "WI", "MI", "IA"],
    6: ["KS", "OK", "TX", "AR", "LA", "MS"],
    7: ["MO", "IL", "IN", "OH", "KY", "TN", "WV"],
    8: ["AL", "VA", "NC", "SC", "GA", "FL"],
    9: ["DE", "DC", "MD", "PA", "NJ", "NY", "MA", "CT", "NH", "VT", "ME", "RI"],
}
# Inverted lookup used by the rest of the notebook: state postal code -> region id
weather_regions = {
    state: region
    for region, states in _weather_states.items()
    for state in states
}
# Region id -> human-readable region name
weather_key = {
    1: "Northwest",
    2: "West",
    3: "West North Central",
    4: "Southwest",
    5: "East North Central",
    6: "South",
    7: "Central",
    8: "Southeast",
    9: "Northeast",
}

In [50]:
# Load the yearly WNV data and build the base dataframe:
#   * keep only rows flagged with Select_County == 1,
#   * drop rows whose STNAME is "California",
#   * expose the county identifier under the name GEOID instead of GEOID10.
df = pd.read_csv(wnv_file_path, encoding='latin-1')
df = df.loc[
    (df["Select_County"] == 1) & (df["STNAME"] != "California")
].rename(columns={'GEOID10': 'GEOID'})

In [51]:
# Add the incident year (first year of WNV introduction) to the dataframe:
# for each county this is the first study year whose NI_IR_<year> value is
# non-zero, falling back to the last study year when every year is zero.
incident_year = []
for row_pos in range(len(df)):
    county = df.iloc[row_pos]
    first_nonzero_year = next(
        (yr for yr in years if county["NI_IR_" + str(yr)] != 0),
        years[-1],
    )
    incident_year.append(first_nonzero_year)
df["Incident_year"] = incident_year

This section deals with processing of 3 weather variables: temperature, precipitation and humidity

In [52]:
# Folder holding the weather (temperature, precipitation and humidity) data
weather_data_path = (
    "/Users/sparshagarwal/Downloads/WMV_data/intersections/"
    "narr_urban_county_data_masked/"
)

In [53]:
# Weather variables in the data; the position in this list matters because the
# aggregation cells below address the entries as variables[0] .. variables[3].
variables = [
    "air.2m",    # [0] stored below as Temp_2m
    "air.sfc",   # [1] stored below as temp_sfc
    "apcp",      # [2] stored below as Prec
    "rhum.2m",   # [3] stored below as Hum
]

In [54]:
# Attach the temperature, precipitation and humidity data to a copy of df:
# one new column per variable/year/month, named <variable>_<year>_<month>.
df_mod = df.copy()
for weather_var in variables:
    for yr in years:
        for mon in months:
            # Monthly file for this variable; only GEOID and "mean" are used
            monthly_file = weather_data_path + weather_var + "_masked/" + str(yr) + "_" + mon + "_masked.csv"
            column_name = weather_var + '_' + str(yr) + "_" + mon
            data = (
                pd.read_csv(monthly_file)[["GEOID", "mean"]]
                .rename(columns={'mean': column_name})
            )
            # Join on the county identifier
            df_mod = df_mod.join(data.set_index('GEOID'), on='GEOID')

In [55]:
# Add the socio-economic and weather region class of every county, looked up
# from the state postal code in the STUSPS column.
sc = [socio_regions[state] for state in df_mod["STUSPS"]]
we = [weather_regions[state] for state in df_mod["STUSPS"]]
df_mod["Socio_econ_class"] = sc
df_mod["Weather_class"] = we

Calculating the values by taking average across all the years

In [56]:
# Per county: average temperature, precipitation and humidity over the
# qualifying months of every year after the incident year, then average those
# yearly values across all years. A month qualifies when its air.2m value lies
# within [a, b]. The results go into parallel lists (one entry per kept county)
# so that a simplified dataframe can be built from them.
geo_id=[]
mean_temp_2m=[]
mean_temp_sfc=[]
mean_prec=[]
mean_hum=[]
mean_cp=[]
mean_ir=[]
w_cl=[]
s_cl=[]
final_year=years[-1]
for row_pos in range(len(df_mod)):
    county=df_mod.iloc[row_pos]           # row is fetched once and reused below
    inc_year=county["Incident_year"]      # 1st year of WNV incidence
    summer_temp_2m, summer_temp_sfc, summer_prec = [], [], []
    summer_cp, summer_hum, summer_ir = [], [], []

    for yr in range(inc_year+1, final_year+1):
        tag="_" + str(yr) + "_"
        # Months of this year whose air.2m value is inside the [a, b] window
        cdd_mon=[mon for mon in months if a <= county[variables[0] + tag + mon] <= b]

        # Values of the qualifying months (273 is subtracted from temperatures)
        avg_temp_2m=[county[variables[0] + tag + mon] - 273 for mon in cdd_mon]
        avg_temp_sfc=[county[variables[1] + tag + mon] - 273 for mon in cdd_mon]
        avg_prec=[county[variables[2] + tag + mon] for mon in cdd_mon]
        avg_hum=[county[variables[3] + tag + mon] for mon in cdd_mon]

        # Yearly summaries. With no qualifying month the lists are empty and
        # np.mean returns nan (emitting a RuntimeWarning).
        summer_temp_2m.append(np.mean(avg_temp_2m))
        summer_temp_sfc.append(np.mean(avg_temp_sfc))
        summer_prec.append(np.mean(avg_prec))
        summer_cp.append(np.sum(avg_prec))      # cumulative precipitation
        summer_hum.append(np.mean(avg_hum))
        summer_ir.append(county["NI_IR_" + str(yr)])

    # Counties whose incident year is the last study year have no later year
    if inc_year==final_year:
        continue

    # Mean of every yearly series across all the years
    mean_temp_2m.append(np.mean(summer_temp_2m))
    mean_temp_sfc.append(np.mean(summer_temp_sfc))
    mean_prec.append(np.mean(summer_prec))
    mean_hum.append(np.mean(summer_hum))
    mean_cp.append(np.mean(summer_cp))
    mean_ir.append(np.mean(summer_ir))
    geo_id.append(county["GEOID"])
    w_cl.append(county["Weather_class"])
    s_cl.append(county["Socio_econ_class"])
        

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [57]:
# Build the final dataframe holding the per-county value of every variable
df_combined=pd.DataFrame()
for column, values in (
    ("GEOID", geo_id),
    ("IR", mean_ir),
    ("Temp_2m", mean_temp_2m),
    ("Prec", mean_prec),
    ("Hum", mean_hum),
    ("Weather_class", w_cl),
    ("Socio_econ_class", s_cl),
):
    df_combined[column]=values
# Keep only counties with a temperature value and a non-zero IR
df_combined=df_combined[df_combined["Temp_2m"].notnull()]
df_combined=df_combined[df_combined["IR"]!=0]

# Source variable -> column kept in its export file (air.sfc is not exported)
exported_columns={"air.2m": "Temp_2m", "apcp": "Prec", "rhum.2m": "Hum"}
for var in variables:
    if(var=="air.sfc"):
        continue
    var_name=exported_columns[var]
    # One file per variable: drop the columns of the other two variables
    other_columns=[name for name in exported_columns.values() if name!=var_name]
    df_final=df_combined.drop(columns=other_columns)
    df_final.to_csv("/Users/sparshagarwal/Desktop/NCSA/Dataframes/Normal/Weather_all_years_" + var_name +".csv", index=False)

Calculating the values by taking random sample years among all the years

In [58]:
# Number of years to be sampled per county
n_sample_years = 5

In [59]:
# Per county: average temperature, precipitation and humidity over the
# qualifying months (air.2m within [a, b]) of at most n_sample_years randomly
# chosen years after the incident year, then average across those years.
# The results go into parallel lists so a simplified dataframe can be built.
geo_id=[]
mean_temp_2m=[]
mean_temp_sfc=[]
mean_prec=[]
mean_hum=[]
mean_ir=[]
w_cl=[]
s_cl=[]
final_year=years[-1]
for row_pos in range(len(df_mod)):
    county=df_mod.iloc[row_pos]           # row is fetched once and reused below
    inc_year=county["Incident_year"]      # 1st year of WNV incidence
    summer_temp_2m, summer_temp_sfc, summer_prec = [], [], []
    summer_hum, summer_ir = [], []

    # Candidate years; the generator is re-seeded for every county, then at
    # most n_sample_years of the candidates are drawn
    sample_years=list(range(inc_year+1, final_year+1))
    random.seed(10)
    if(len(sample_years)>n_sample_years):
        sample_years=random.sample(sample_years,n_sample_years)

    for yr in sample_years:
        tag="_" + str(yr) + "_"
        # Months of this year whose air.2m value is inside the [a, b] window
        cdd_mon=[mon for mon in months if a <= county[variables[0] + tag + mon] <= b]

        # Values of the qualifying months (273 is subtracted from temperatures)
        avg_temp_2m=[county[variables[0] + tag + mon] - 273 for mon in cdd_mon]
        avg_temp_sfc=[county[variables[1] + tag + mon] - 273 for mon in cdd_mon]
        avg_prec=[county[variables[2] + tag + mon] for mon in cdd_mon]
        avg_hum=[county[variables[3] + tag + mon] for mon in cdd_mon]

        # Yearly averages (nan, with a RuntimeWarning, when no month qualifies)
        summer_temp_2m.append(np.mean(avg_temp_2m))
        summer_temp_sfc.append(np.mean(avg_temp_sfc))
        summer_prec.append(np.mean(avg_prec))
        summer_hum.append(np.mean(avg_hum))
        summer_ir.append(county["NI_IR_" + str(yr)])

    # Counties whose incident year is the last study year have no later year
    if inc_year==final_year:
        continue

    # Mean of every yearly series across all the sample years
    mean_temp_2m.append(np.mean(summer_temp_2m))
    mean_temp_sfc.append(np.mean(summer_temp_sfc))
    mean_prec.append(np.mean(summer_prec))
    mean_hum.append(np.mean(summer_hum))
    mean_ir.append(np.mean(summer_ir))
    geo_id.append(county["GEOID"])
    w_cl.append(county["Weather_class"])
    s_cl.append(county["Socio_econ_class"])
        

In [60]:
# Build the final dataframe holding the per-county value of every variable
df_combined=pd.DataFrame()
for column, values in (
    ("GEOID", geo_id),
    ("IR", mean_ir),
    ("Temp_2m", mean_temp_2m),
    ("Prec", mean_prec),
    ("Hum", mean_hum),
    ("Weather_class", w_cl),
    ("Socio_econ_class", s_cl),
):
    df_combined[column]=values
# Keep only counties with a temperature value and a non-zero IR
df_combined=df_combined[df_combined["Temp_2m"].notnull()]
df_combined=df_combined[df_combined["IR"]!=0]

# Source variable -> column kept in its export file (air.sfc is not exported)
exported_columns={"air.2m": "Temp_2m", "apcp": "Prec", "rhum.2m": "Hum"}
for var in variables:
    if(var=="air.sfc"):
        continue
    var_name=exported_columns[var]
    # One file per variable: drop the columns of the other two variables
    other_columns=[name for name in exported_columns.values() if name!=var_name]
    df_final=df_combined.drop(columns=other_columns)
    df_final.to_csv("/Users/sparshagarwal/Desktop/NCSA/Dataframes/Normal/Weather_sample_years_" + var_name +".csv", index=False)

Calculating the values by taking the median-year values

In [61]:
# Median-year export of temperature, precipitation and humidity.
# For each exported variable, every county's yearly averages are computed over
# the months whose air.2m value lies within [a, b] (not just May-Sept); the year
# holding the median of that variable is located, and the values of that single
# year (including its IR) are stored.
# Changes from the earlier version of this cell: the export used to sit inside
# a redundant "for var in variables" loop that never used var, so the same file
# was written four times; and the whole computation was run for air.sfc even
# though no file is written for it. The exported files are unchanged.
median_exports={"air.2m": "Temp_2m", "apcp": "Prec", "rhum.2m": "Hum"}
for median_variable in variables:   # Variable used to pick the median year
    if(median_variable=="air.sfc"):
        # Nothing is exported for air.sfc, so skip the per-county work
        continue
    geo_id=[]
    mean_temp_2m=[]
    mean_temp_sfc=[]
    mean_prec=[]
    mean_hum=[]
    mean_ir=[]
    w_cl=[]
    s_cl=[]
    for i in range(len(df_mod)):
        county=df_mod.iloc[i]                 # row is fetched once and reused
        inc_year=county["Incident_year"]      # 1st year of WNV incidence
        summer_temp_2m=[]
        summer_temp_sfc=[]
        summer_prec=[]
        summer_hum=[]
        summer_ir=[]

        for j in range(inc_year+1, years[-1]+1):
            tag="_" + str(j) + "_"
            # Months of this year whose air.2m value is inside [a, b]
            cdd_mon=[k for k in months if a <= county[variables[0] + tag + k] <= b]

            # Values of the qualifying months (273 is subtracted from temperatures)
            avg_temp_2m=[county[variables[0] + tag + k] - 273 for k in cdd_mon]
            avg_temp_sfc=[county[variables[1] + tag + k] - 273 for k in cdd_mon]
            avg_prec=[county[variables[2] + tag + k] for k in cdd_mon]
            avg_hum=[county[variables[3] + tag + k] for k in cdd_mon]

            # Yearly averages (nan, with a RuntimeWarning, when no month qualifies)
            summer_temp_2m.append(np.mean(avg_temp_2m))
            summer_temp_sfc.append(np.mean(avg_temp_sfc))
            summer_prec.append(np.mean(avg_prec))
            summer_hum.append(np.mean(avg_hum))
            summer_ir.append(county["NI_IR_" + str(j)])

        if(inc_year!=years[-1]):
            # Yearly series of the variable that decides which year is the median
            yearly_by_variable={
                variables[0]: summer_temp_2m,
                variables[1]: summer_temp_sfc,
                variables[2]: summer_prec,
                variables[3]: summer_hum,
            }
            l=yearly_by_variable[median_variable]
            # Position of the median year; for an even number of years this is
            # the upper of the two middle values
            index=np.argsort(l)[len(l)//2]
            mean_temp_2m.append(summer_temp_2m[index])
            mean_temp_sfc.append(summer_temp_sfc[index])
            mean_prec.append(summer_prec[index])
            mean_hum.append(summer_hum[index])
            mean_ir.append(summer_ir[index])     # IR of the median year of the variable of interest

            geo_id.append(county["GEOID"])
            w_cl.append(county["Weather_class"])
            s_cl.append(county["Socio_econ_class"])

    df_combined=pd.DataFrame()
    df_combined["GEOID"]=geo_id
    df_combined["IR"]=mean_ir
    df_combined["Temp_2m"]=mean_temp_2m
    df_combined["Prec"]=mean_prec
    df_combined["Hum"]=mean_hum
    df_combined["Weather_class"]=w_cl
    df_combined["Socio_econ_class"]=s_cl
    df_combined=df_combined[df_combined["Temp_2m"].isnull()==False]
    df_combined=df_combined[df_combined["IR"]!=0]   # Removing entries with IR=0

    # Keep only the column of the variable of interest and export once
    var_name=median_exports[median_variable]
    other_columns=[name for name in median_exports.values() if name!=var_name]
    df_final=df_combined.drop(columns=other_columns)
    df_final.to_csv("/Users/sparshagarwal/Desktop/NCSA/Dataframes/Normal/Weather_median_years_" + var_name +".csv", index=False)

This section deals with processing of weather variables other than temperature, precipitation and humidity

Calculating the values by taking average across all the years

In [62]:
# Yearly weather-index variables handled in this section; each one is read from
# a CSV file of the same name in the cells below.
variables = [
    "filtered_aggregated_precipitation",
    "dry_days_filtered",
    "max_consecutive_dry_days",
    "gini_index_summer",
    "gini_index",
    "gini_weekly",
    "theil_index_summer",
    "theil_index",
    "theil_weekly",
]

In [63]:
# For every index variable: load its yearly values, average them (and the IR)
# per county over all years after the incident year, and export the result.
for variable in variables:
    # The folder of the CSV depends on the family the variable belongs to
    if variable=="filtered_aggregated_precipitation":
        data_path="/Users/sparshagarwal/Downloads/urban_county_indices/aggregate/"+ variable + ".csv"
    elif variable in ("dry_days_filtered", "max_consecutive_dry_days"):
        data_path="/Users/sparshagarwal/Downloads/urban_county_indices/dry_days/"+ variable + ".csv"
    elif variable in ("gini_index_summer", "gini_index", "gini_weekly"):
        data_path="/Users/sparshagarwal/Downloads/urban_county_indices/gini/"+ variable + ".csv"
    elif variable in ("theil_index_summer", "theil_index", "theil_weekly"):
        data_path="/Users/sparshagarwal/Downloads/urban_county_indices/theil/"+ variable + ".csv"

    # Attach the variable's data to a fresh copy of the WNV dataframe
    data=pd.read_csv(data_path)
    df_mod=df.copy().join(data.set_index('GEOID'), on='GEOID')

    # Socio-economic and weather region class of every county
    sc=[socio_regions[state] for state in df_mod["STUSPS"]]
    we=[weather_regions[state] for state in df_mod["STUSPS"]]
    df_mod["Socio_econ_class"]=sc
    df_mod["Weather_class"]=we

    # Per-county averages, collected in parallel lists
    geo_id=[]
    mean_variable=[]
    mean_ir=[]
    w_cl=[]
    s_cl=[]
    for row_pos in range(len(df_mod)):
        county=df_mod.iloc[row_pos]
        inc_year=county["Incident_year"]  # 1st year of WNV incidence

        # Yearly values; the variable's columns are named by the year alone
        later_years=range(inc_year+1, years[-1]+1)
        summer_variable=[county[str(yr)] for yr in later_years]
        summer_ir=[county["NI_IR_" + str(yr)] for yr in later_years]

        # Counties whose incident year is the last study year have no later year
        if inc_year==years[-1]:
            continue

        # Mean of the variable and of the IR across all the years
        mean_variable.append(np.mean(summer_variable))
        mean_ir.append(np.mean(summer_ir))
        geo_id.append(county["GEOID"])
        w_cl.append(county["Weather_class"])
        s_cl.append(county["Socio_econ_class"])

    # Dataframe of the averages across all the years
    df_combined=pd.DataFrame()
    for column, values in (
        ("GEOID", geo_id),
        ("IR", mean_ir),
        (variable, mean_variable),
        ("Weather_class", w_cl),
        ("Socio_econ_class", s_cl),
    ):
        df_combined[column]=values

    # Keep only counties with a value for the variable and a non-zero IR
    df_combined=df_combined[df_combined[variable].notnull()]
    df_combined=df_combined[df_combined["IR"]!=0]

    # Export the data
    df_combined.to_csv("/Users/sparshagarwal/Desktop/NCSA/Dataframes/Normal/Weather_all_years_" + variable + ".csv", index=False)

Calculating the values by taking random sample years among all the years

In [64]:
# Number of years to be sampled per county
n_sample_years = 5

In [65]:
# For every index variable: load its yearly values, average them (and the IR)
# per county over at most n_sample_years randomly chosen years after the
# incident year, and export the result.
for variable in variables:
    # The folder of the CSV depends on the family the variable belongs to
    if variable=="filtered_aggregated_precipitation":
        data_path="/Users/sparshagarwal/Downloads/urban_county_indices/aggregate/"+ variable + ".csv"
    elif variable in ("dry_days_filtered", "max_consecutive_dry_days"):
        data_path="/Users/sparshagarwal/Downloads/urban_county_indices/dry_days/"+ variable + ".csv"
    elif variable in ("gini_index_summer", "gini_index", "gini_weekly"):
        data_path="/Users/sparshagarwal/Downloads/urban_county_indices/gini/"+ variable + ".csv"
    elif variable in ("theil_index_summer", "theil_index", "theil_weekly"):
        data_path="/Users/sparshagarwal/Downloads/urban_county_indices/theil/"+ variable + ".csv"

    # Attach the variable's data to a fresh copy of the WNV dataframe
    data=pd.read_csv(data_path)
    df_mod=df.copy().join(data.set_index('GEOID'), on='GEOID')

    # Socio-economic and weather region class of every county
    sc=[socio_regions[state] for state in df_mod["STUSPS"]]
    we=[weather_regions[state] for state in df_mod["STUSPS"]]
    df_mod["Socio_econ_class"]=sc
    df_mod["Weather_class"]=we

    # Per-county averages over the sampled years, collected in parallel lists
    geo_id=[]
    mean_variable=[]
    mean_ir=[]
    w_cl=[]
    s_cl=[]
    for row_pos in range(len(df_mod)):
        county=df_mod.iloc[row_pos]
        inc_year=county["Incident_year"]  # 1st year of WNV incidence

        # Candidate years; the generator is re-seeded for every county, then
        # at most n_sample_years of the candidates are drawn
        sample_years=list(range(inc_year+1, years[-1]+1))
        random.seed(10)
        if(len(sample_years)>n_sample_years):
            sample_years=random.sample(sample_years,n_sample_years)

        # Yearly values; the variable's columns are named by the year alone
        summer_variable=[county[str(yr)] for yr in sample_years]
        summer_ir=[county["NI_IR_" + str(yr)] for yr in sample_years]

        # Counties whose incident year is the last study year have no later year
        if inc_year==years[-1]:
            continue

        # Mean of the variable and of the IR across the sampled years
        mean_variable.append(np.mean(summer_variable))
        mean_ir.append(np.mean(summer_ir))
        geo_id.append(county["GEOID"])
        w_cl.append(county["Weather_class"])
        s_cl.append(county["Socio_econ_class"])

    # Dataframe of the averages across all the sample years
    df_combined=pd.DataFrame()
    for column, values in (
        ("GEOID", geo_id),
        ("IR", mean_ir),
        (variable, mean_variable),
        ("Weather_class", w_cl),
        ("Socio_econ_class", s_cl),
    ):
        df_combined[column]=values

    # Keep only counties with a value for the variable and a non-zero IR
    df_combined=df_combined[df_combined[variable].notnull()]
    df_combined=df_combined[df_combined["IR"]!=0]

    # Export the data
    df_combined.to_csv("/Users/sparshagarwal/Desktop/NCSA/Dataframes/Normal/Weather_sample_years_" + variable + ".csv", index=False)

Calculating the values by taking the median-year values

In [66]:
# For every index variable: load its yearly values, find per county the year
# holding the median value after the incident year, store the variable and the
# IR of that year, and export the result.
for variable in variables:
    # The folder of the CSV depends on the family the variable belongs to
    if variable=="filtered_aggregated_precipitation":
        data_path="/Users/sparshagarwal/Downloads/urban_county_indices/aggregate/"+ variable + ".csv"
    elif variable in ("dry_days_filtered", "max_consecutive_dry_days"):
        data_path="/Users/sparshagarwal/Downloads/urban_county_indices/dry_days/"+ variable + ".csv"
    elif variable in ("gini_index_summer", "gini_index", "gini_weekly"):
        data_path="/Users/sparshagarwal/Downloads/urban_county_indices/gini/"+ variable + ".csv"
    elif variable in ("theil_index_summer", "theil_index", "theil_weekly"):
        data_path="/Users/sparshagarwal/Downloads/urban_county_indices/theil/"+ variable + ".csv"

    # Attach the variable's data to a fresh copy of the WNV dataframe
    data=pd.read_csv(data_path)
    df_mod=df.copy().join(data.set_index('GEOID'), on='GEOID')

    # Socio-economic and weather region class of every county
    sc=[socio_regions[state] for state in df_mod["STUSPS"]]
    we=[weather_regions[state] for state in df_mod["STUSPS"]]
    df_mod["Socio_econ_class"]=sc
    df_mod["Weather_class"]=we

    # Per-county median-year values, collected in parallel lists
    geo_id=[]
    mean_variable=[]
    mean_ir=[]
    w_cl=[]
    s_cl=[]
    for row_pos in range(len(df_mod)):
        county=df_mod.iloc[row_pos]
        inc_year=county["Incident_year"]  # 1st year of WNV incidence

        # Yearly values; the variable's columns are named by the year alone
        later_years=range(inc_year+1, years[-1]+1)
        summer_variable=[county[str(yr)] for yr in later_years]
        summer_ir=[county["NI_IR_" + str(yr)] for yr in later_years]

        # Counties whose incident year is the last study year have no later year
        if inc_year==years[-1]:
            continue

        # Position of the median year; for an even number of years this is the
        # upper of the two middle values
        l=summer_variable
        index=np.argsort(l)[len(l)//2]
        mean_variable.append(summer_variable[index])
        mean_ir.append(summer_ir[index])     # IR of the median year of the variable
        geo_id.append(county["GEOID"])
        w_cl.append(county["Weather_class"])
        s_cl.append(county["Socio_econ_class"])

    # Dataframe of the median-year values
    df_combined=pd.DataFrame()
    for column, values in (
        ("GEOID", geo_id),
        ("IR", mean_ir),
        (variable, mean_variable),
        ("Weather_class", w_cl),
        ("Socio_econ_class", s_cl),
    ):
        df_combined[column]=values

    # Keep only counties with a value for the variable and a non-zero IR
    df_combined=df_combined[df_combined[variable].notnull()]
    df_combined=df_combined[df_combined["IR"]!=0]

    # Export the data
    df_combined.to_csv("/Users/sparshagarwal/Desktop/NCSA/Dataframes/Normal/Weather_median_years_" + variable + ".csv", index=False)

This section deals with processing of socio-economic variables.

Calculating the values by taking average across all the years

In [67]:
# Socio-economic variables handled in this section; each name is also the
# column read from the socio-economic CSV files in the cells below.
variables = [
    "Resident_population_White_alone_percent",
    "Median_Household_Income",
    "Poverty_percent_of_people",
]

In [68]:
# For every socio-economic variable: reshape its yearly data to one column per
# year, average it (and the IR) per county over all available years after the
# incident year, and export the result.
# NOTE: this cell rebinds the notebook-wide `years` to the (shorter) range of
# years available for the variable being processed.
for variable in variables:
    if(variable=="Poverty_percent_of_people" or variable=="Median_Household_Income"):
        data_path="/Users/sparshagarwal/Downloads/WMV_data/Socioeconomics/se_data.csv"
        years=range(1999, 2010)
    if(variable=="Resident_population_White_alone_percent"):
        data_path="/Users/sparshagarwal/Downloads/WMV_data/Socioeconomics/race_data.csv"
        years=range(2000, 2010)
    # Adding data for the variable
    df_mod=df.copy()
    data=pd.read_csv(data_path)
    temp_df=pd.DataFrame()
    # Creating the dataframe of the socio-economic variable: one column per year
    for i in years:
        data_f=data[data["YEAR"]==i]
        geo_id=[]
        var_value=[]
        # NOTE(review): the first row of every year's slice is skipped (the
        # range starts at 1) — presumably a non-county summary row; confirm.
        for j in range(1,len(data_f)):
            record=data_f.iloc[j]
            geo_id.append(record["STCOU"])
            var_value.append(record[variable])
        # NOTE(review): GEOID is overwritten on every pass, so all years are
        # assumed to list the same counties in the same order — confirm.
        temp_df["GEOID"]=geo_id
        temp_df[variable + "_" + str(i)]=var_value
    # The main dataframe comprising everything
    df_mod=df_mod.join(temp_df.set_index('GEOID'), on='GEOID')

    # Adding the socio-economic and weather region classification columns
    sc=[socio_regions[state] for state in df_mod["STUSPS"]]
    we=[weather_regions[state] for state in df_mod["STUSPS"]]
    df_mod["Socio_econ_class"]=sc
    df_mod["Weather_class"]=we

    # Per-county averages, collected in parallel lists so that a simplified
    # dataframe can be made.
    geo_id=[]
    mean_variable=[]
    mean_ir=[]
    w_cl=[]
    s_cl=[]
    for i in range(len(df_mod)):
        county=df_mod.iloc[i]
        inc_year=county["Incident_year"]  # 1st year of WNV incidence

        summer_variable=[]
        summer_ir=[]
        for j in range(inc_year+1, years[-1]+1):
            # Storing the variable value and the IR for a certain year.
            summer_variable.append(county[variable + "_" + str(j)])
            summer_ir.append(county["NI_IR_" + str(j)])

        # Incident years run up to 2015 while `years` ends in 2009 here, so the
        # guard must be "<": with "!=" counties introduced after the last
        # available year were averaged over empty lists (nan plus a
        # RuntimeWarning) and only removed later by the null filter.
        if(inc_year<years[-1]):
            # Calculating the mean of the variable across all the years
            mean_variable.append(np.mean(summer_variable))
            mean_ir.append(np.mean(summer_ir))
            geo_id.append(county["GEOID"])
            w_cl.append(county["Weather_class"])
            s_cl.append(county["Socio_econ_class"])

    # Dataframe of the averages across all the years
    df_combined=pd.DataFrame()
    df_combined["GEOID"]=geo_id
    df_combined["IR"]=mean_ir
    df_combined[variable]=mean_variable
    df_combined["Weather_class"]=w_cl
    df_combined["Socio_econ_class"]=s_cl

    df_combined=df_combined[df_combined[variable].isnull()==False]
    df_combined=df_combined[df_combined["IR"]!=0]   # Removing entries with IR=0

    # To export data
    df_combined.to_csv("/Users/sparshagarwal/Desktop/NCSA/Dataframes/Normal/Socio_all_years_" + variable + ".csv", index=False)

Calculating the values by taking random sample years among all the years

In [69]:
# Number of years to be sampled per county
n_sample_years = 5

In [70]:
# For every socio-economic variable: reshape its yearly data to one column per
# year, average it (and the IR) per county over at most n_sample_years randomly
# chosen years after the incident year, and export the result.
# NOTE: this cell rebinds the notebook-wide `years` to the (shorter) range of
# years available for the variable being processed.
for variable in variables:
    if(variable=="Poverty_percent_of_people" or variable=="Median_Household_Income"):
        data_path="/Users/sparshagarwal/Downloads/WMV_data/Socioeconomics/se_data.csv"
        years=range(1999, 2010)
    if(variable=="Resident_population_White_alone_percent"):
        data_path="/Users/sparshagarwal/Downloads/WMV_data/Socioeconomics/race_data.csv"
        years=range(2000, 2010)
    # Adding data for the variable
    df_mod=df.copy()
    data=pd.read_csv(data_path)
    temp_df=pd.DataFrame()
    # Creating the dataframe of the socio-economic variable: one column per year
    for i in years:
        data_f=data[data["YEAR"]==i]
        geo_id=[]
        var_value=[]
        # NOTE(review): the first row of every year's slice is skipped (the
        # range starts at 1) — presumably a non-county summary row; confirm.
        for j in range(1,len(data_f)):
            record=data_f.iloc[j]
            geo_id.append(record["STCOU"])
            var_value.append(record[variable])
        # NOTE(review): GEOID is overwritten on every pass, so all years are
        # assumed to list the same counties in the same order — confirm.
        temp_df["GEOID"]=geo_id
        temp_df[variable + "_" + str(i)]=var_value
    # The main dataframe comprising everything
    df_mod=df_mod.join(temp_df.set_index('GEOID'), on='GEOID')

    # Adding the socio-economic and weather region classification columns
    sc=[socio_regions[state] for state in df_mod["STUSPS"]]
    we=[weather_regions[state] for state in df_mod["STUSPS"]]
    df_mod["Socio_econ_class"]=sc
    df_mod["Weather_class"]=we

    # Per-county averages over the sampled years, collected in parallel lists
    # so that a simplified dataframe can be made.
    geo_id=[]
    mean_variable=[]
    mean_ir=[]
    w_cl=[]
    s_cl=[]
    for i in range(len(df_mod)):
        county=df_mod.iloc[i]
        inc_year=county["Incident_year"]  # 1st year of WNV incidence

        # Candidate years; the generator is re-seeded for every county, then
        # at most n_sample_years of the candidates are drawn
        sample_years=list(range(inc_year+1, years[-1]+1))
        random.seed(10)
        if(len(sample_years)>n_sample_years):
            sample_years=random.sample(sample_years,n_sample_years)

        summer_variable=[]
        summer_ir=[]
        for j in sample_years:
            # Storing the variable value and the IR for a certain year.
            summer_variable.append(county[variable + "_" + str(j)])
            summer_ir.append(county["NI_IR_" + str(j)])

        # Incident years run up to 2015 while `years` ends in 2009 here, so the
        # guard must be "<": with "!=" counties introduced after the last
        # available year were averaged over empty lists (nan plus a
        # RuntimeWarning) and only removed later by the null filter.
        if(inc_year<years[-1]):
            # Calculating the mean of the variable across the sampled years
            mean_variable.append(np.mean(summer_variable))
            mean_ir.append(np.mean(summer_ir))
            geo_id.append(county["GEOID"])
            w_cl.append(county["Weather_class"])
            s_cl.append(county["Socio_econ_class"])

    # Dataframe of the averages across the sampled years
    df_combined=pd.DataFrame()
    df_combined["GEOID"]=geo_id
    df_combined["IR"]=mean_ir
    df_combined[variable]=mean_variable
    df_combined["Weather_class"]=w_cl
    df_combined["Socio_econ_class"]=s_cl

    df_combined=df_combined[df_combined[variable].isnull()==False]
    df_combined=df_combined[df_combined["IR"]!=0]   # Removing entries with IR=0

    # To export data
    df_combined.to_csv("/Users/sparshagarwal/Desktop/NCSA/Dataframes/Normal/Socio_sample_years_" + variable + ".csv", index=False)

Calculating the values by taking the median-year values

In [71]:
# For every socio-economic variable: reshape its yearly data to one column per
# year, find per county the year holding the median value after the incident
# year, store the variable and the IR of that year, and export the result.
# NOTE: this cell rebinds the notebook-wide `years` to the (shorter) range of
# years available for the variable being processed.
for variable in variables:
    if variable in ("Poverty_percent_of_people", "Median_Household_Income"):
        data_path="/Users/sparshagarwal/Downloads/WMV_data/Socioeconomics/se_data.csv"
        years=range(1999, 2010)
    if variable=="Resident_population_White_alone_percent":
        data_path="/Users/sparshagarwal/Downloads/WMV_data/Socioeconomics/race_data.csv"
        years=range(2000, 2010)

    # Load the variable's data and reshape it to one column per year
    df_mod=df.copy()
    data=pd.read_csv(data_path)
    temp_df=pd.DataFrame()
    for yr in years:
        data_f=data[data["YEAR"]==yr]
        geo_id=[]
        var_value=[]
        # NOTE(review): the first row of every year's slice is skipped (the
        # range starts at 1) — presumably a non-county summary row; confirm.
        for pos in range(1,len(data_f)):
            record=data_f.iloc[pos]
            geo_id.append(record["STCOU"])
            var_value.append(record[variable])
        # NOTE(review): GEOID is overwritten on every pass, so all years are
        # assumed to list the same counties in the same order — confirm.
        temp_df["GEOID"]=geo_id
        temp_df[variable + "_" + str(yr)]=var_value
    # The main dataframe comprising everything
    df_mod=df_mod.join(temp_df.set_index('GEOID'), on='GEOID')

    # Socio-economic and weather region class of every county
    sc=[socio_regions[state] for state in df_mod["STUSPS"]]
    we=[weather_regions[state] for state in df_mod["STUSPS"]]
    df_mod["Socio_econ_class"]=sc
    df_mod["Weather_class"]=we

    # Per-county median-year values, collected in parallel lists
    geo_id=[]
    mean_variable=[]
    mean_ir=[]
    w_cl=[]
    s_cl=[]
    for row_pos in range(len(df_mod)):
        county=df_mod.iloc[row_pos]
        inc_year=county["Incident_year"]  # 1st year of WNV incidence

        later_years=range(inc_year+1, years[-1]+1)
        summer_variable=[county[variable + "_" + str(yr)] for yr in later_years]
        summer_ir=[county["NI_IR_" + str(yr)] for yr in later_years]

        # Nothing to rank unless at least one available year follows the
        # incident year
        if inc_year>=years[-1]:
            continue

        # Position of the median year; for an even number of years this is the
        # upper of the two middle values
        l=summer_variable
        index=np.argsort(l)[len(l)//2]
        mean_variable.append(summer_variable[index])
        mean_ir.append(summer_ir[index])     # IR of the median year of the variable
        geo_id.append(county["GEOID"])
        w_cl.append(county["Weather_class"])
        s_cl.append(county["Socio_econ_class"])

    # Dataframe of the median-year values
    df_combined=pd.DataFrame()
    for column, values in (
        ("GEOID", geo_id),
        ("IR", mean_ir),
        (variable, mean_variable),
        ("Weather_class", w_cl),
        ("Socio_econ_class", s_cl),
    ):
        df_combined[column]=values

    # Keep only counties with a value for the variable and a non-zero IR
    df_combined=df_combined[df_combined[variable].notnull()]
    df_combined=df_combined[df_combined["IR"]!=0]

    # Export the data
    df_combined.to_csv("/Users/sparshagarwal/Desktop/NCSA/Dataframes/Normal/Socio_median_years_" + variable + ".csv", index=False)

This section generates the dataframes for the chi-square calculation for every variable, for the entire USA.

In [72]:
# Directory that holds the "Normal" dataframes (the per-variable CSV files read back below)
folder_path = "/Users/sparshagarwal/Desktop/NCSA/Dataframes/Normal"

In [73]:
# os.listdir() returns every directory entry in arbitrary order, hidden files
# (e.g. macOS ".DS_Store") included. Keep only the CSV exports, which are the
# only entries pd.read_csv can process, and sort them so that runs are reproducible.
files = sorted(name for name in os.listdir(folder_path) if name.lower().endswith(".csv"))

In [74]:
# Builds, for every exported "Normal" dataframe, the High/Low classes used by the
# chi-square test, with the nation-wide averages as cut-offs.
for file in files:
    # A directory listing can contain non-CSV entries (e.g. macOS ".DS_Store");
    # those cannot be classified, so skip them instead of crashing in read_csv.
    if not file.lower().endswith(".csv"):
        continue

    data_path = os.path.join(folder_path, file)
    data = pd.read_csv(data_path)
    variable = data.columns[2]    # Variable for chi-square test (third column of the frame)

    # Averages of IR and of the variable over the whole file, used as cut-offs.
    # (The file is read once; the averages come from the same frame that is classified.)
    avg_ir = np.mean(data["IR"])
    avg_var = np.mean(data[variable])

    # "H" when the value is at or above the average, otherwise "L".
    # A NaN value compares False and therefore ends up in "L".
    data["Class_IR"] = np.where(data["IR"] >= avg_ir, "H", "L")
    data["Class_var"] = np.where(data[variable] >= avg_var, "H", "L")

    # To export data: same file name with the extension replaced by "_chi.csv"
    data.to_csv("/Users/sparshagarwal/Desktop/NCSA/Dataframes/Chi/USA/" + os.path.splitext(file)[0] + "_chi.csv", index=False)

This section generates the dataframes for the chi-square calculation for every variable, zone-wise (each county is compared with the averages of its own weather/climate zone).

In [75]:
# Builds, for every exported "Normal" dataframe, the High/Low classes used by the
# chi-square test, with the averages of each row's own weather/climate zone as cut-offs.
for file in files:
    # A directory listing can contain non-CSV entries (e.g. macOS ".DS_Store");
    # those cannot be classified, so skip them instead of crashing in read_csv.
    if not file.lower().endswith(".csv"):
        continue

    data_path = os.path.join(folder_path, file)
    data = pd.read_csv(data_path)
    variable = data.columns[2]    # Variable for chi-square test (third column of the frame)

    dic_ir = {}   # To store average IR for each individual zone
    dic_var = {}  # To store average variable value for each individual zone
    for zone in range(1, 10):    # Iterating over weather/climate zones 1-9
        df_temp = data[data["Weather_class"] == zone]
        # Average of IR and the variable within the zone (NaN when the zone has no rows)
        dic_ir[zone] = np.mean(df_temp["IR"])
        dic_var[zone] = np.mean(df_temp[variable])

    # Per-row cut-offs looked up from the row's zone. The lookup is strict on purpose:
    # a missing zone or one outside 1-9 raises instead of being labelled silently.
    zones = [int(z) for z in data["Weather_class"]]
    ir_cutoff = np.array([dic_ir[z] for z in zones], dtype=float)
    var_cutoff = np.array([dic_var[z] for z in zones], dtype=float)

    # "H" when the value is at or above its zone average, otherwise "L".
    # A NaN value compares False and therefore ends up in "L".
    data["Class_IR"] = np.where(data["IR"] >= ir_cutoff, "H", "L")
    data["Class_var"] = np.where(data[variable] >= var_cutoff, "H", "L")

    # To export data: same file name with the extension replaced by "_chi.csv"
    data.to_csv("/Users/sparshagarwal/Desktop/NCSA/Dataframes/Chi/Zone_wise/" + os.path.splitext(file)[0] + "_chi.csv", index=False)