In [1]:
import pandas as pd, numpy as np

## Search Activity by Year from Google

In [2]:
# import Google trends data
df1 = pd.read_csv("combinedGeoMap.csv").drop(columns=["Unnamed: 0"])
df1.head()

In [3]:
# scale the whole table to fit a uniform 1-100 scale
df1_max = df1.drop(columns=["Region"]).max().max()

for col in df1.columns.values[1:]:
    df1[col] = (df1[col]/df1_max)*100

df1 = df1.sort_values(by=['Region']).round(1)
df1

Unnamed: 0,Region,Accident,Alzheimers,CLRD,Stroke,Diabetes,Cancer,Suicide,Flu,Heart,Kidney
24,Alabama,84.1,11.0,7.3,12.2,34.1,92.7,20.7,72.0,19.5,4.9
3,Alaska,68.3,12.2,4.9,12.2,31.7,86.6,20.7,69.5,15.9,3.7
7,Arizona,82.9,11.0,6.1,12.2,28.0,84.1,23.2,72.0,14.6,3.7
0,Arkansas,80.5,11.0,7.3,12.2,28.0,81.7,20.7,72.0,14.6,3.7
41,California,87.8,13.4,6.1,12.2,31.7,86.6,23.2,79.3,14.6,3.7
29,Colorado,87.8,12.2,4.9,12.2,28.0,89.0,25.6,75.6,15.9,3.7
38,Connecticut,96.3,13.4,7.3,12.2,31.7,92.7,20.7,80.5,17.1,4.9
23,Delaware,95.1,12.2,6.1,12.2,28.0,86.6,23.2,74.4,15.9,3.7
45,District of Columbia,84.1,13.4,6.1,12.2,31.7,95.1,23.2,80.5,17.1,4.9
17,Florida,90.2,11.0,6.1,9.8,28.0,92.7,20.7,67.1,13.4,3.7


In [4]:
df1.to_csv("Google_by_state.csv")

## Leading Causes of Death from the NCHS

In [5]:
df2 = pd.read_csv("NCHS_-_Leading_Causes_of_Death__United_States.csv")
df2.head()

Unnamed: 0,Year,113 Cause Name,Cause Name,State,Deaths,Age-adjusted Death Rate
0,2017,"Accidents (unintentional injuries) (V01-X59,Y8...",Unintentional injuries,United States,169936,49.4
1,2017,"Accidents (unintentional injuries) (V01-X59,Y8...",Unintentional injuries,Alabama,2703,53.8
2,2017,"Accidents (unintentional injuries) (V01-X59,Y8...",Unintentional injuries,Alaska,436,63.7
3,2017,"Accidents (unintentional injuries) (V01-X59,Y8...",Unintentional injuries,Arizona,4184,56.2
4,2017,"Accidents (unintentional injuries) (V01-X59,Y8...",Unintentional injuries,Arkansas,1625,51.8


In [6]:
print("Cause Names: ", df2["Cause Name"].unique())
print("\nYears: ", df2["Year"].unique())
print("\nStates: ", df2["State"].nunique()) # includes United States total + Washington D.C.

Cause Names:  ['Unintentional injuries' 'All causes' "Alzheimer's disease" 'Stroke'
 'CLRD' 'Diabetes' 'Heart disease' 'Influenza and pneumonia' 'Suicide'
 'Cancer' 'Kidney disease']

Years:  [2017 2007 2016 2015 2014 2013 2012 2011 2010 2009 2008 2006 2005 2004
 2003 2002 2001 2000 1999]

States:  52


In [7]:
# restrict to totaled United States numbers
df2 = df2[df2["State"]!="United States"]

# restrict to 2004 and onwards
df2 = df2[df2["Year"]>=2004]

# drop redundant cause column and state columns
df2 = df2.drop(columns=["113 Cause Name"])

# don't include "All Causes"
df2 = df2[df2["Cause Name"]!="All causes"]

df2.head(10)

Unnamed: 0,Year,Cause Name,State,Deaths,Age-adjusted Death Rate
1,2017,Unintentional injuries,Alabama,2703,53.8
2,2017,Unintentional injuries,Alaska,436,63.7
3,2017,Unintentional injuries,Arizona,4184,56.2
4,2017,Unintentional injuries,Arkansas,1625,51.8
5,2017,Unintentional injuries,California,13840,33.2
6,2017,Unintentional injuries,Colorado,3037,53.6
7,2017,Unintentional injuries,Connecticut,2078,53.2
8,2017,Unintentional injuries,Delaware,608,61.9
9,2017,Unintentional injuries,District of Columbia,427,61.0
10,2017,Unintentional injuries,Florida,13059,56.1


In [8]:
# average the age-adjusted death rate across all years for each state
df2_b = df2.groupby(by=["State","Cause Name"]).mean()[["Age-adjusted Death Rate"]].reset_index().round(1)
df2_b.head(20)

Unnamed: 0,State,Cause Name,Age-adjusted Death Rate
0,Alabama,Alzheimer's disease,34.0
1,Alabama,CLRD,53.7
2,Alabama,Cancer,188.9
3,Alabama,Diabetes,25.0
4,Alabama,Heart disease,240.0
5,Alabama,Influenza and pneumonia,19.3
6,Alabama,Kidney disease,20.5
7,Alabama,Stroke,54.0
8,Alabama,Suicide,13.7
9,Alabama,Unintentional injuries,51.5


In [9]:
# create pivot such that each cause name is its own column
df2 = df2_b.pivot(index="State",columns="Cause Name", values=["Age-adjusted Death Rate"]).reset_index()

df2.head()

Unnamed: 0_level_0,State,Age-adjusted Death Rate,Age-adjusted Death Rate,Age-adjusted Death Rate,Age-adjusted Death Rate,Age-adjusted Death Rate,Age-adjusted Death Rate,Age-adjusted Death Rate,Age-adjusted Death Rate,Age-adjusted Death Rate,Age-adjusted Death Rate
Cause Name,Unnamed: 1_level_1,Alzheimer's disease,CLRD,Cancer,Diabetes,Heart disease,Influenza and pneumonia,Kidney disease,Stroke,Suicide,Unintentional injuries
0,Alabama,34.0,53.7,188.9,25.0,240.0,19.3,20.5,54.0,13.7,51.5
1,Alaska,22.2,41.1,170.9,21.2,150.7,12.9,11.0,42.6,22.9,56.1
2,Arizona,33.6,43.9,150.8,21.6,155.7,13.7,7.2,33.3,17.1,49.9
3,Arkansas,30.3,56.4,192.8,26.6,228.0,22.6,20.6,52.7,16.1,49.3
4,California,29.8,36.5,153.9,21.4,167.0,17.9,8.1,40.2,10.1,30.3


In [10]:
# df2.to_csv("NCHS_draft.csv")

In [11]:
df2 = pd.read_csv("NCHS_draft.csv")
df2.head()

Unnamed: 0,State,Alzheimer's disease,CLRD,Cancer,Diabetes,Heart disease,Influenza and pneumonia,Kidney disease,Stroke,Suicide,Unintentional injuries
0,Alabama,34.0,53.7,188.9,25.0,240.0,19.3,20.5,54.0,13.7,51.5
1,Alaska,22.2,41.1,170.9,21.2,150.7,12.9,11.0,42.6,22.9,56.1
2,Arizona,33.6,43.9,150.8,21.6,155.7,13.7,7.2,33.3,17.1,49.9
3,Arkansas,30.3,56.4,192.8,26.6,228.0,22.6,20.6,52.7,16.1,49.3
4,California,29.8,36.5,153.9,21.4,167.0,17.9,8.1,40.2,10.1,30.3


In [12]:
# rename columns to match the Google data
df2.rename(columns={"Alzheimer's disease": "Alzheimers", 
                   "Heart disease": "Heart", 
                   "Influenza and pneumonia": "Flu",
                   "Kidney disease": "Kidney",
                   "Unintentional injuries": "Accident"
                  }, inplace=True)
df2.head()

Unnamed: 0,State,Alzheimers,CLRD,Cancer,Diabetes,Heart,Flu,Kidney,Stroke,Suicide,Accident
0,Alabama,34.0,53.7,188.9,25.0,240.0,19.3,20.5,54.0,13.7,51.5
1,Alaska,22.2,41.1,170.9,21.2,150.7,12.9,11.0,42.6,22.9,56.1
2,Arizona,33.6,43.9,150.8,21.6,155.7,13.7,7.2,33.3,17.1,49.9
3,Arkansas,30.3,56.4,192.8,26.6,228.0,22.6,20.6,52.7,16.1,49.3
4,California,29.8,36.5,153.9,21.4,167.0,17.9,8.1,40.2,10.1,30.3


In [13]:
# scale the whole table to fit a uniform 1-100 scale
df2_max = df2.drop(columns=["State"]).max().max() # 125

for col in df2.columns.values[1:]:
    df2[col] = (df2[col]/df2_max)*100

df2 = df2.round(1)
df2

Unnamed: 0,State,Alzheimers,CLRD,Cancer,Diabetes,Heart,Flu,Kidney,Stroke,Suicide,Accident
0,Alabama,13.2,20.9,73.6,9.7,93.5,7.5,8.0,21.0,5.3,20.1
1,Alaska,8.6,16.0,66.6,8.3,58.7,5.0,4.3,16.6,8.9,21.9
2,Arizona,13.1,17.1,58.7,8.4,60.7,5.3,2.8,13.0,6.7,19.4
3,Arkansas,11.8,22.0,75.1,10.4,88.8,8.8,8.0,20.5,6.3,19.2
4,California,11.6,14.2,60.0,8.3,65.1,7.0,3.2,15.7,3.9,11.8
5,Colorado,11.7,18.9,57.0,6.5,53.8,5.3,3.9,14.3,7.1,18.1
6,Connecticut,6.8,12.5,62.0,6.2,62.1,5.8,5.0,12.0,3.5,14.8
7,Delaware,8.3,16.6,70.6,8.2,72.1,5.6,6.2,15.9,4.5,16.2
8,District of Columbia,7.7,9.4,72.5,10.1,88.7,5.4,4.2,14.2,2.3,14.5
9,Florida,7.3,15.2,63.3,7.9,64.5,3.9,4.3,13.8,5.3,17.7


In [14]:
df2.to_csv("NCHS_by_state.csv")

## Difference between Google Search interest and actual mortality rates

In [15]:
import warnings
warnings.filterwarnings('ignore')

In [16]:
# create a final dataframe that combines the two
df_diff = df2[["State"]]

for col in df2.columns.values[1:]:
    df_diff[col] = df1[col] - df2[col]

df_diff

Unnamed: 0,State,Alzheimers,CLRD,Cancer,Diabetes,Heart,Flu,Kidney,Stroke,Suicide,Accident
0,Alabama,-2.2,-13.6,8.1,18.3,-78.9,64.5,-4.3,-8.8,15.4,60.4
1,Alaska,-0.1,-11.1,10.2,19.7,-46.5,59.6,-1.9,-6.8,16.7,56.1
2,Arizona,-2.1,-11.0,20.6,25.7,-44.8,70.3,0.9,-0.8,14.0,59.9
3,Arkansas,0.4,-17.1,11.5,21.3,-72.9,60.7,-4.3,-8.3,14.4,49.1
4,California,-0.6,-8.1,19.3,23.4,-49.2,68.6,0.5,-3.5,19.3,79.7
5,Colorado,0.5,-12.8,27.1,21.5,-37.9,64.2,-0.2,-2.1,13.6,57.5
6,Connecticut,4.2,-6.4,17.3,30.4,-47.5,60.1,-0.1,0.2,17.2,62.0
7,Delaware,2.7,-10.5,13.5,19.8,-57.5,66.4,-2.5,-3.7,18.7,66.7
8,District of Columbia,4.5,-2.1,14.1,17.9,-72.8,69.0,-0.5,-2.0,18.4,68.4
9,Florida,4.9,-9.1,7.4,23.8,-49.9,72.9,-0.6,-1.6,22.7,67.7


In [17]:
df_diff.to_csv("diff_by_state.csv")