In [1]:
from pygam import LinearGAM, s, f

In [2]:
import dask.dataframe as ddf
import pandas as pd
import datetime
import functools
import seaborn as sns
import matplotlib.pyplot as plt

### Import suicide death data

In [3]:
years = [year for year in range(1960, 2020)]
years[-10:]

[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]

In [4]:
data = []
for year in years:
    suicide_year = pd.read_csv('/global/cfs/cdirs/m1532/Projects_MVP/geospatial/Suicide_Death_NCHS/suicide_patient_level_1960_2020/overall/overall_suicide_patient_level_' + str(year) + '.csv', dtype = {'year': int, 'county_residence': str, 'county_death': str, 'state_residence': str, 'state_death': str})
    suicide_year = suicide_year.loc[:, ~suicide_year.columns.str.contains('^Unnamed')]
    data.append(suicide_year)

In [5]:
suicide = pd.concat(data)
suicide.head()

Unnamed: 0,year,county_residence,state_residence,state_death,county_death,month,sex,race,age,age_range,death_cause,date
0,1960,23001,23,23,23001,Dec,Male,White,7,age 25-64,nonfirearm_suicide,
1,1960,48201,48,48,48201,Dec,Male,Black,10,65 and older,nonfirearm_suicide,
2,1960,6037,6,6,6037,Apr,Female,White,8,age 25-64,nonfirearm_suicide,
3,1960,48441,48,48,48441,Nov,Male,White,7,age 25-64,nonfirearm_suicide,
4,1960,34013,34,34,34013,Feb,Male,White,8,age 25-64,nonfirearm_suicide,


In [6]:
suicide = suicide.drop(['county_residence', 'state_residence'], axis = 1)
suicide = suicide.rename(columns = {'state_death': 'statefips', 'county_death': 'fips'})
suicide.head()

Unnamed: 0,year,statefips,fips,month,sex,race,age,age_range,death_cause,date
0,1960,23,23001,Dec,Male,White,7,age 25-64,nonfirearm_suicide,
1,1960,48,48201,Dec,Male,Black,10,65 and older,nonfirearm_suicide,
2,1960,6,6037,Apr,Female,White,8,age 25-64,nonfirearm_suicide,
3,1960,48,48441,Nov,Male,White,7,age 25-64,nonfirearm_suicide,
4,1960,34,34013,Feb,Male,White,8,age 25-64,nonfirearm_suicide,


In [7]:
suicide

Unnamed: 0,year,statefips,fips,month,sex,race,age,age_range,death_cause,date
0,1960,23,23001,Dec,Male,White,7,age 25-64,nonfirearm_suicide,
1,1960,48,48201,Dec,Male,Black,10,65 and older,nonfirearm_suicide,
2,1960,06,06037,Apr,Female,White,8,age 25-64,nonfirearm_suicide,
3,1960,48,48441,Nov,Male,White,7,age 25-64,nonfirearm_suicide,
4,1960,34,34013,Feb,Male,White,8,age 25-64,nonfirearm_suicide,
...,...,...,...,...,...,...,...,...,...,...
47669,2019,36,36081,Dec,Male,Other,5,age 25-64,nonfirearm_suicide,
47670,2019,36,36081,Dec,Male,Black,5,age 25-64,nonfirearm_suicide,
47671,2019,36,36005,Dec,Female,Black,4,age 0-24,nonfirearm_suicide,
47672,2019,36,36005,Dec,Male,White,6,age 25-64,nonfirearm_suicide,


### Load population data

In [8]:
data_pop = []
for year in years:
    pop_year = pd.read_csv('/global/cfs/cdirs/m1532/Projects_MVP/geospatial/temp_bins_suicide/Population/population_monthly/population_monthly_' + str(year) + '.csv', dtype = {'year': int, 'fips': str})
    pop_year = pop_year.loc[:, ~pop_year.columns.str.contains('^Unnamed')]
    data_pop.append(pop_year)

In [9]:
pop = pd.concat(data_pop)
pop.head()

Unnamed: 0,year,fips,pop,month
0,1960,1001,18686,Jan
1,1960,1001,18691,Feb
2,1960,1001,18696,Mar
3,1960,1001,18701,Apr
4,1960,1001,18705,May


In [10]:
month_replace = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10,
                'Nov': 11, 'Dec': 12}
pop['month'] = pop['month'].replace(month_replace)
pop['month'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [11]:
pop.head()

Unnamed: 0,year,fips,pop,month
0,1960,1001,18686,1
1,1960,1001,18691,2
2,1960,1001,18696,3
3,1960,1001,18701,4
4,1960,1001,18705,5


In [12]:
pop_year = pop[pop['month'] == 12]
pop_year.head()

Unnamed: 0,year,fips,pop,month
11,1960,1001,18739,12
23,1960,1003,49088,12
35,1960,1005,24700,12
47,1960,1007,14357,12
59,1960,1009,25449,12


In [13]:
def findState(fips):
    return fips[:2]

In [14]:
pop_year['statefips'] = pop_year['fips'].apply(findState)
pop_year.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pop_year['statefips'] = pop_year['fips'].apply(findState)


Unnamed: 0,year,fips,pop,month,statefips
11,1960,1001,18739,12,1
23,1960,1003,49088,12,1
35,1960,1005,24700,12,1
47,1960,1007,14357,12,1
59,1960,1009,25449,12,1


In [15]:
pop_state = pop_year.drop(['fips', 'month'], axis = 1)
pop_state = pop_state.groupby(['year', 'statefips']).sum()
pop_state = pop_state.reset_index()
pop_state.head()

Unnamed: 0,year,statefips,pop
0,1960,1,3266740
1,1960,4,1302161
2,1960,5,1786272
3,1960,6,15717204
4,1960,8,1753947


In [16]:
suicide_death = suicide.drop(['sex', 'race', 'age', 'age_range', 'date', 'fips', 'month'], axis = 1)
suicide_death = suicide_death.groupby(['year', 'statefips']).count()
suicide_death = suicide_death.reset_index()
suicide_death.head()

Unnamed: 0,year,statefips,death_cause
0,1960,0,32
1,1960,1,221
2,1960,4,111
3,1960,5,134
4,1960,6,1535


In [17]:
suicide_death['year'].unique()

array([1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970,
       1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981,
       1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992,
       1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
       2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014,
       2015, 2016, 2017, 2018, 2019])

In [18]:
suicide_death = suicide_death.rename(columns = {'death_cause': 'deaths'})

In [19]:
suicide_pop = suicide_death.merge(pop_state, on = ['year','statefips'], how = 'inner')
suicide_pop.head()

Unnamed: 0,year,statefips,deaths,pop
0,1960,1,221,3266740
1,1960,4,111,1302161
2,1960,5,134,1786272
3,1960,6,1535,15717204
4,1960,8,155,1753947


In [20]:
state = pd.read_excel('/global/cfs/cdirs/m1532/Projects_MVP/geospatial/temp_bins_suicide/test_paper/state_code.xlsx', engine='openpyxl', dtype = {'Code': int})
state.head()

Unnamed: 0,Code,State,Abbreviation,Alpha code
0,1,Alabama,Ala.,AL
1,2,Alaska,,AK
2,4,Arizona,Ariz.,AZ
3,5,Arkansas,Ark.,AR
4,6,California,Calif.,CA


In [21]:
def convertStateCode(code):
    return str(code).rjust(2, '0')

In [22]:
state['Code'] = state['Code'].apply(convertStateCode)
state = state.rename(columns = {'Code': 'statefips', 'State': 'state_name', 'Alpha code': 'state'})
state.head()

Unnamed: 0,statefips,state_name,Abbreviation,state
0,1,Alabama,Ala.,AL
1,2,Alaska,,AK
2,4,Arizona,Ariz.,AZ
3,5,Arkansas,Ark.,AR
4,6,California,Calif.,CA


In [23]:
west_states = ['Colorado', 'Wyoming', 'Montana', 'Idaho', 'Washington',
              'Oregon', 'Utah', 'Nevada', 'California', 'Alaska', 'Hawaii']
Midwest_states = ['Ohio', 'Indiana', 'Michigan', 'Illinois', 'Missouri', 
                  'Wisconsin', 'Minnesota', 'Iowa', 'Kansas', 'Nebraska', 'South Dakota', 'North Dakota']
Southwest_states = ['Texas', 'Oklahoma', 'New Mexico', 'Arizona']
Southeast_states = ['West Virginia', 'Virginia', 'Kentucky', 'Tennessee', 'North Carolina', 
                    'South Carolina', 'Georgia', 'Alabama', 'Mississippi', 'Arkansas', 'Louisiana', 'Florida']
Northeast_states = ['Maine', 'Massachusetts', 'Rhode Island', 'Connecticut', 'New Hampshire', 'Vermont', 
                    'New York', 'Pennsylvania', 'New Jersey', 'Delaware', 'Maryland']

In [24]:
#use a function to get region information
def get_region(state):
    if state in west_states:
        return 'West'
    elif state in Midwest_states:
        return 'Midwest'
    elif state in Southwest_states:
        return 'Southwest'
    elif state in Southeast_states:
        return 'Southeast'
    else:
        return 'Northeast'

In [25]:
suicide_pop = suicide_pop.merge(state[['statefips', 'state_name', 'state']], on = ['statefips'], how = 'inner')
suicide_pop.head()

Unnamed: 0,year,statefips,deaths,pop,state_name,state
0,1960,1,221,3266740,Alabama,AL
1,1961,1,278,3281534,Alabama,AL
2,1962,1,243,3296946,Alabama,AL
3,1963,1,252,3312978,Alabama,AL
4,1964,1,267,3329632,Alabama,AL


In [26]:
suicide_pop['region'] = suicide_pop['state_name'].apply(get_region)
suicide_pop.head()

Unnamed: 0,year,statefips,deaths,pop,state_name,state,region
0,1960,1,221,3266740,Alabama,AL,Southeast
1,1961,1,278,3281534,Alabama,AL,Southeast
2,1962,1,243,3296946,Alabama,AL,Southeast
3,1963,1,252,3312978,Alabama,AL,Southeast
4,1964,1,267,3329632,Alabama,AL,Southeast


In [27]:
suicide_pop['suicide_rate'] = (suicide_pop['deaths']/suicide_pop['pop'])*100000
suicide_pop.head()

Unnamed: 0,year,statefips,deaths,pop,state_name,state,region,suicide_rate
0,1960,1,221,3266740,Alabama,AL,Southeast,6.765154
1,1961,1,278,3281534,Alabama,AL,Southeast,8.471648
2,1962,1,243,3296946,Alabama,AL,Southeast,7.370457
3,1963,1,252,3312978,Alabama,AL,Southeast,7.60645
4,1964,1,267,3329632,Alabama,AL,Southeast,8.018904


In [28]:
suicide_pop['year'].unique()

array([1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970,
       1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981,
       1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992,
       1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
       2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014,
       2015, 2016, 2017, 2018, 2019])

### Load heatwave data

In [29]:
heatwave_count = pd.read_csv('/global/cfs/cdirs/m1532/Projects_MVP/geospatial/climate_heatwave/heatwave_details/PRISM_min_AT_p85/heatwave_count_monthly_1960_2020.csv', dtype = {'fips': str})
heatwave_count = heatwave_count.loc[:, ~heatwave_count.columns.str.contains('^Unnamed')]
heatwave_count.head()

Unnamed: 0,year,month,fips,heatwave_count
0,1960,1,1001,0
1,1960,1,1003,0
2,1960,1,1005,0
3,1960,1,1007,0
4,1960,1,1009,0


In [30]:
heatwave_count['month'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [31]:
heatwave_count['statefips'] = heatwave_count['fips'].apply(findState)
heatwave_count.head()

Unnamed: 0,year,month,fips,heatwave_count,statefips
0,1960,1,1001,0,1
1,1960,1,1003,0,1
2,1960,1,1005,0,1
3,1960,1,1007,0,1
4,1960,1,1009,0,1


In [32]:
heatwave_count['heatwave_count'].describe()

count    2.235600e+06
mean     2.037207e-01
std      6.048997e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      7.000000e+00
Name: heatwave_count, dtype: float64

In [33]:
heatwave_count['heatwave_count'].unique()

array([0, 1, 2, 3, 4, 5, 6, 7])

In [34]:
heatwave_year = heatwave_count.drop('month', axis = 1)
heatwave_year = heatwave_year.groupby(['year', 'statefips', 'fips']).sum()
heatwave_year = heatwave_year.reset_index()
heatwave_year.head()

Unnamed: 0,year,statefips,fips,heatwave_count
0,1960,1,1001,3
1,1960,1,1003,0
2,1960,1,1005,3
3,1960,1,1007,0
4,1960,1,1009,0


In [35]:
heatwave_state = heatwave_year.drop(['fips'], axis = 1)
heatwave_state = heatwave_state.groupby(['year','statefips']).median()
heatwave_state = heatwave_state.reset_index()
heatwave_state.head()

Unnamed: 0,year,statefips,heatwave_count
0,1960,1,0.0
1,1960,4,2.0
2,1960,5,0.0
3,1960,6,3.0
4,1960,8,1.0


In [36]:
heatwave_state['heatwave_count'].describe()

count    2940.000000
mean        2.343878
std         1.649372
min         0.000000
25%         1.000000
50%         2.000000
75%         3.000000
max         9.000000
Name: heatwave_count, dtype: float64

In [38]:
heatwave_state['heatwave_count'].nunique()

19

In [39]:
suicide_climate = suicide_pop.merge(heatwave_state, on = ['year', 'statefips'], how = 'left')
suicide_climate.head()

Unnamed: 0,year,statefips,deaths,pop,state_name,state,region,suicide_rate,heatwave_count
0,1960,1,221,3266740,Alabama,AL,Southeast,6.765154,0.0
1,1961,1,278,3281534,Alabama,AL,Southeast,8.471648,0.0
2,1962,1,243,3296946,Alabama,AL,Southeast,7.370457,1.0
3,1963,1,252,3312978,Alabama,AL,Southeast,7.60645,0.0
4,1964,1,267,3329632,Alabama,AL,Southeast,8.018904,0.0


In [40]:
suicide_climate['heatwave_count'].isna().sum()

76

In [41]:
suicide_climate['heatwave_count'].unique()

array([0. , 1. , 4. , 2. , 3. , 5. , 6. , 8. , 7. , 1.5, 2.5, 0.5, 3.5,
       4.5, 5.5, 9. , nan, 8.5, 6.5, 7.5])

In [42]:
suicide_climate['heatwave_count'].describe()

count    2940.000000
mean        2.343878
std         1.649372
min         0.000000
25%         1.000000
50%         2.000000
75%         3.000000
max         9.000000
Name: heatwave_count, dtype: float64

In [43]:
suicide_climate.to_csv('GAM_heatwave/yearly_GAM_heatwave_state_1960_2019.csv')