# <span style="color:Maroon">Data Preparation - Part 2</span>

<span style="color:Green">__Overview:__ Create a county-level dataset with one row per county (FIPS code). Each column stores the mean mortality rate for one combination of cause, gender and year; the upper and lower bounds available in the source files are dropped in this part.</span>

In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import os as os

In [2]:
# Step up one level from the notebook's folder and remember that location;
# cwd is reused further down to build an absolute path to the state data.
os.chdir(os.pardir)
cwd = os.getcwd()

In [3]:
# Relative directories for each type of data.
# os.path.join picks the separator of the host OS, so the paths no longer
# depend on the Windows-only backslash.
country_category_year = os.path.join("Data", "country_category_year")
labels = os.path.join("Data", "labels")
state_category_year = os.path.join("Data", "state_category_year")
states_gender_year_cause = os.path.join("Data", "states_gender_year_cause")

In [4]:
# File names (or file-name fragments) for each type of data.
# file_coun and file_label are complete file names.
# The *1 / *2 pairs are a prefix and a suffix: the per-state, per-gender file
# name is built further down as
#     file_st_gn_yr_cs1 + <STATE> + file_st_gn_yr_cs2 + ".csv"
# file_st_ct_yr1 / file_st_ct_yr2 are not used in the cells shown here;
# presumably they are combined the same way (TODO confirm).
file_coun = "IHME_USA_COUNTY_MORTALITY_RATES_1980_2014_NATIONAL_Y2016M12D13.xlsx"
file_label = "IHME_USA_COUNTY_MORTALITY_RATES_1980_2014_CODEBOOK_Y2017M05D19.csv"
file_st_ct_yr1 = "IHME_USA_COUNTY_MORTALITY_RATES_1980_2014_"
file_st_ct_yr2 = "_Y2016M12D13"
file_st_gn_yr_cs1 = "IHME_USA_COUNTY_MORTALITY_RATES_1980_2014_"
file_st_gn_yr_cs2 = "_Y2017M05D19"

In [5]:
# Names of the 50 states plus the District of Columbia (51 entries), written
# in upper case with underscores because each name is inserted verbatim into
# the per-state CSV file name when the state files are read.
states = ["ALABAMA", "ALASKA", "ARIZONA", "ARKANSAS", "CALIFORNIA", "COLORADO", "CONNECTICUT", "DELAWARE", 
          "DISTRICT_OF_COLUMBIA", "FLORIDA", "GEORGIA", "HAWAII", "IDAHO", "ILLINOIS", "INDIANA", "IOWA", "KANSAS", 
          "KENTUCKY", "LOUISIANA", "MAINE", "MARYLAND", "MASSACHUSETTS", "MICHIGAN", "MINNESOTA", "MISSISSIPPI",
          "MISSOURI", "MONTANA", "NEBRASKA", "NEVADA", "NEW_HAMPSHIRE", "NEW_JERSEY", "NEW_MEXICO", "NEW_YORK",
          "NORTH_CAROLINA", "NORTH_DAKOTA", "OHIO", "OKLAHOMA", "OREGON", "PENNSYLVANIA", "RHODE_ISLAND", 
          "SOUTH_CAROLINA", "SOUTH_DAKOTA", "TENNESSEE", "TEXAS", "UTAH", "VERMONT", "VIRGINIA", "WASHINGTON", 
          "WEST_VIRGINIA", "WISCONSIN", "WYOMING"]

In [6]:
# Short labels for the cause categories.
# NOTE: several labels end with a space that is part of the string, so they
# must be matched exactly as written.
# This list is not referenced by the cells shown here; presumably the labels
# match (truncated) names in one of the source workbooks - TODO confirm.
categories = ["HIV-AIDS & tuberculosis", "Diarrhea, lower respiratory", "Neglected tropical diseases",
              "Maternal disorders", "Neonatal disorders", "Nutritional deficiencies", "Other communicable, maternal, ",
              "Neoplasms", "Cardiovascular diseases", "Chronic respiratory diseases", "Cirrhosis & other chronic ",
              "Digestive diseases", "Neurological disorders", "Mental & substance use ", "Diabetes, urogenital, blood",
              "Musculoskeletal disorders", "Other non-communicable ", "Transport injuries", "Unintentional injuries",
              "Self-harm & interpersonal ", "Forces of nature, war, & legal "]

In [7]:
# Country identifier, written in the same upper-case/underscore form as the
# state names. Not referenced by the cells shown here.
country = "UNITED_STATES"

## <span style="color:Maroon">Read the label file</span>

In [8]:
# Move into the labels folder so the codebook can be opened by file name.
os.chdir(labels)
# skiprows=1 skips the first line of the file, so the column names are taken
# from its second line.
labels_data = pd.read_csv(file_label,skiprows=1)
# Preview the first rows of the codebook.
labels_data.head()

Unnamed: 0,Label:,Location ID,Location name,FIPS,Cause ID,Cause name,Sex ID,Sex name,Year ID,Mortality rate,95% Uncertainty Interval - Upper Bound,95% Uncertainty Interval - Lower Bound
0,Value coding:,102,United States,,294.0,All causes,1.0,Male,1980.0,,,
1,,523,Alabama,1.0,295.0,"Communicable, maternal, neonatal, and nutritio...",2.0,Female,1981.0,,,
2,,524,Alaska,2.0,296.0,HIV/AIDS and tuberculosis,3.0,Both,1982.0,,,
3,,525,Arizona,4.0,301.0,"Diarrhea, lower respiratory, and other common ...",,,1983.0,,,
4,,526,Arkansas,5.0,344.0,Neglected tropical diseases and malaria,,,1984.0,,,


In [9]:
# Distinct values of the 'Cause name' column, in order of first appearance
# (dict keys keep insertion order) ...
distinct_causes = list(dict.fromkeys(labels_data['Cause name']))
# ... minus the final entry, which is discarded (presumably the missing-value
# placeholder left by the shorter columns of the codebook - TODO confirm).
cause_name = distinct_causes[:-1]

In [10]:
# Maps each three-letter acronym (key) to the full cause name it stands for
# (value). change_causename uses it to replace the full cause names found in
# the data with the acronyms.
# NOTE(review): the original note described this as mapping "excel tab name to
# acronym"; the direction in the code is acronym -> cause name.
category_dict = {"ACA":"All causes",
                 "CMN":"Communicable, maternal, neonatal, and nutritional diseases",
                 "HVT":"HIV/AIDS and tuberculosis",
                 "DLR":"Diarrhea, lower respiratory, and other common infectious diseases",
                 "NTD":"Neglected tropical diseases and malaria",
                 "MDA":"Maternal disorders",
                 "NDA":"Neonatal disorders",
                 "NEA":"Nutritional deficiencies",
                 "OTA":"Other communicable, maternal, neonatal, and nutritional diseases",
                 "OCD":"Non-communicable diseases",
                 "NOA":"Neoplasms",
                 "CDA":"Cardiovascular diseases",
                 "CRE":"Chronic respiratory diseases",
                 "COC":"Cirrhosis and other chronic liver diseases",
                 "DDA":"Digestive diseases", 
                 "NIA":"Neurological disorders",
                 "MSU":"Mental and substance use disorders",
                 "DUB":"Diabetes, urogenital, blood, and endocrine diseases",
                 "MIA":"Musculoskeletal disorders",
                 "ONC":"Other non-communicable diseases",
                 "IAA":"Injuries",
                 "TIA":"Transport injuries",
                 "UIA":"Unintentional injuries",
                 "SHI":"Self-harm and interpersonal violence",
                 "FON":"Forces of nature, war, and legal intervention"}

In [11]:
# Cause acronyms, listed in the same order as the keys of category_dict.
# change_causename walks this list to decide which names to replace.
category_key = ["ACA","CMN","HVT","DLR","NTD","MDA","NDA","NEA","OTA","OCD","NOA","CDA","CRE","COC","DDA","NIA","MSU",
                "DUB", "MIA","ONC","IAA","TIA","UIA","SHI","FON"]

In [12]:
# Sanity check: evaluates to True when no acronym occurs more than once.
len(set(category_key)) == len(category_key)

True

## <span style="color:Maroon">Read data at state level and break it down by gender</span>

In [13]:
# Switch to the folder holding the per-state files, addressed from the base
# directory saved in cwd. os.path.join supplies the separator for the host OS
# instead of a hard-coded backslash.
os.chdir(os.path.join(cwd, states_gender_year_cause))

In [14]:
def change_causename(df):
    """Replace full cause names in ``df`` with their three-letter acronyms.

    Every cell whose value is exactly equal to one of the cause names in
    ``category_dict`` (for the acronyms listed in ``category_key``) is
    replaced by the corresponding acronym. All other cells are left as is.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose cells may contain full cause names.

    Returns
    -------
    pandas.DataFrame
        A new frame with the substitutions applied; ``df`` is not modified.
    """
    # Build the name -> acronym mapping once so the frame is scanned in a
    # single replace call instead of one full pass per acronym.
    name_to_acronym = {category_dict[key]: key for key in category_key}
    return df.replace(name_to_acronym)

In [15]:
def newdata_transpose(df,fips):
    """Turn the long-format rows of one county into a single wide row.

    For every input row a column label ``<cause_name>_<sex_id>_<year_id>`` is
    built, and the row's ``mx`` value is stored under that label.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for a single county. Must contain the columns ``cause_name``,
        ``sex_id``, ``year_id`` and ``mx``. The frame is not modified.
    fips : hashable
        Label given to the single output row (the county's FIPS code).

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``fips`` with one column per input row; the
        column index is named ``"columns"``.
    """
    column_labels = (
        df["cause_name"]
        + "_" + df["sex_id"].astype(str)
        + "_" + df["year_id"].astype(str)
    )
    # assign() works on a copy, so the caller's frame does not gain an extra
    # 'columns' column as a side effect.
    long_form = df.assign(columns=column_labels)[["columns", "mx"]]
    wide = long_form.set_index("columns").transpose()
    # After the transpose the only row is labelled 'mx'; relabel it with the
    # county code.
    return wide.rename(index={"mx": fips})

In [16]:
# Start from an empty table; the per-county rows are gathered into outp_data
# in the next cell.
outp_data = pd.DataFrame()

In [17]:
# Build one wide row per county for every state, then stack all rows once.
# DataFrame.append was removed in pandas 2.0 and copied the whole table on
# every call; collecting the pieces and concatenating at the end avoids both.
county_rows = []
for state in states:
    data = pd.read_csv(f"{file_st_gn_yr_cs1}{state}{file_st_gn_yr_cs2}.csv")
    data = change_causename(data)
    # np.unique returns the distinct FIPS codes in ascending order.
    for fips in np.unique(data["FIPS"].to_numpy()):
        # Keep only this county's rows; the lower/upper columns are not used.
        county = data[data["FIPS"] == fips].drop(["lower", "upper"], axis=1)
        county_rows.append(newdata_transpose(county, fips))
# pd.concat raises on an empty list, so fall back to an empty table.
outp_data = pd.concat(county_rows) if county_rows else pd.DataFrame()

In [18]:
# Shape of the assembled table as (rows, columns).
outp_data.shape

(3193, 2590)

In [19]:
# Name the row index; each row label is the FIPS code that was passed to
# newdata_transpose for that county.
outp_data.index.name = "FIPS"

In [20]:
# Split the table row-wise into five nearly equal chunks so each one can be
# saved to its own file.
# The row positions are split rather than the DataFrame itself: calling
# np.array_split directly on a DataFrame relies on DataFrame.swapaxes, which
# is deprecated in recent pandas releases.
row_chunks = np.array_split(np.arange(len(outp_data)), 5)
outp_data_split = [outp_data.iloc[rows] for rows in row_chunks]
outp_data0, outp_data1, outp_data2, outp_data3, outp_data4 = outp_data_split

In [21]:
# Write every chunk to its own numbered CSV in the "Model data" folder one
# level above the current working directory.
parts = (outp_data0, outp_data1, outp_data2, outp_data3, outp_data4)
for part_no, part in enumerate(parts):
    part.to_csv(f"..//Model data//Mortality_yearly_category_gender_part{part_no}.csv")