In [2]:
def DeathRatePipeline(string_url): #string_url is the particular dataset identifier from NYC Open Data
    import sys
    !{sys.executable} -m pip install sodapy
    import pandas as pd
    from sodapy import Socrata
    import numpy as np
    import seaborn as sns
    client = Socrata("data.cityofnewyork.us", None)
    results = client.get(string_url, limit=2000) #change limit if dataset is larger than 2000 rows

    # Convert to pandas DataFrame
    df = pd.DataFrame.from_records(results)
    
    #Change sex strings to integers
    df.loc[df['sex'] == 'F', 'sex'] = 0
    df.loc[df['sex'] == 'M', 'sex'] = 1
    df.loc[df['sex'] == 'Female', 'sex'] = 0
    df.loc[df['sex'] == 'Male', 'sex'] = 1
    
    #Change race strings to integers
    df.loc[df['race_ethnicity'] == 'Asian and Pacific Islander', 'race_ethnicity'] = 0
    df.loc[df['race_ethnicity'] == 'Black Non-Hispanic', 'race_ethnicity'] = 1
    df.loc[df['race_ethnicity'] == 'Hispanic', 'race_ethnicity'] = 2
    df.loc[df['race_ethnicity'] == 'White Non-Hispanic', 'race_ethnicity'] = 3
    df.loc[df['race_ethnicity'] == 'Non-Hispanic Black', 'race_ethnicity'] = 4
    df.loc[df['race_ethnicity'] == 'Non-Hispanic White', 'race_ethnicity'] = 5
    
    #Now labeling the disease states with numbers with numbers for use in the GBM
    #Please adjust for different diseases.
    df1 = df #dataframe modified

    df1.loc[df['leading_cause'] == 'Accidents Except Drug Posioning (V01-X39, X43, X45-X59, Y85-Y86)', 'leading_cause'] = 0
    df1.loc[df['leading_cause'] == 'All Other Causes', 'leading_cause'] = 1
    df1.loc[df['leading_cause'] == "Alzheimer's Disease (G30)", 'leading_cause'] = 2
    df1.loc[df['leading_cause'] == 'Aortic Aneurysm and Dissection (I71)', 'leading_cause'] = 3
    df1.loc[df['leading_cause'] == 'Assault (Homicide: Y87.1, X85-Y09)', 'leading_cause'] = 4
    df1.loc[df['leading_cause'] == 'Atherosclerosis (I70)', 'leading_cause'] = 5
    df1.loc[df['leading_cause'] == 'Cerebrovascular Disease (Stroke: I60-I69)', 'leading_cause'] = 6
    df1.loc[df['leading_cause'] == 'Certain Conditions originating in the Perinatal Period (P00-P96)', 'leading_cause'] = 7
    df1.loc[df['leading_cause'] == 'Chronic Liver Disease and Cirrhosis (K70, K73)', 'leading_cause'] = 8
    df1.loc[df['leading_cause'] == 'Chronic Lower Respiratory Diseases (J40-J47)', 'leading_cause'] = 9
    df1.loc[df['leading_cause'] == 'Congenital Malformations, Deformations, and Chromosomal Abnormalities (Q00-Q99)', 'leading_cause'] = 10
    df1.loc[df['leading_cause'] == 'Diabetes Mellitus (E10-E14)', 'leading_cause'] = 11
    df1.loc[df['leading_cause'] == 'Diseases of Heart (I00-I09, I11, I13, I20-I51)', 'leading_cause'] = 12
    df1.loc[df['leading_cause'] == 'Essential Hypertension and Renal Diseases (I10, I12)', 'leading_cause'] = 13
    df1.loc[df['leading_cause'] == 'Human Immunodeficiency Virus Disease (HIV: B20-B24)', 'leading_cause'] = 14
    df1.loc[df['leading_cause'] == 'Influenza (Flu) and Pneumonia (J09-J18)', 'leading_cause'] = 15
    df1.loc[df['leading_cause'] == 'Insitu or Benign / Uncertain Neoplasms (D00-D48)', 'leading_cause'] = 16
    df1.loc[df['leading_cause'] == 'Intentional Self-Harm (Suicide: X60-X84, Y87.0)', 'leading_cause'] = 17
    df1.loc[df['leading_cause'] == 'Malignant Neoplasms (Cancer: C00-C97)', 'leading_cause'] = 18
    df1.loc[df['leading_cause'] == 'Mental and Behavioral Disorders due to Accidental Poisoning and Other Psychoactive Substance Use (F11-F16, F18-F19, X40-X42, X44)', 'leading_cause'] = 19
    df1.loc[df['leading_cause'] == 'Nephritis, Nephrotic Syndrome and Nephrisis (N00-N07, N17-N19, N25-N27)', 'leading_cause'] = 20
    df1.loc[df['leading_cause'] == 'Septicemia (A40-A41)', 'leading_cause'] = 21
    df1.loc[df['leading_cause'] == 'Viral Hepatitis (B15-B19)', 'leading_cause'] = 22
    df1.loc[df['leading_cause'] == "Parkinson's Disease (G20)", 'leading_cause'] = 23
    df1.loc[df['leading_cause'] == 'Tuberculosis (A16-A19)', 'leading_cause'] = 24
    df1.loc[df['leading_cause'] == 'Accidents Except Drug Poisoning (V01-X39, X43, X45-X59, Y85-Y86)', 'leading_cause'] = 0 #Repeat of first one, but has an error
    df1.loc[df['leading_cause'] == 'Peptic Ulcer (K25-K28)', 'leading_cause'] = 25
    df1.loc[df['leading_cause'] == 'Assault (Homicide: U01-U02, Y87.1, X85-Y09)', 'leading_cause'] = 26
    df1.loc[df['leading_cause'] == 'Chronic Liver Disease and Cirrhosis (K70, K73-K74)', 'leading_cause'] = 27
    df1.loc[df['leading_cause'] == 'Intentional Self-Harm (Suicide: U03, X60-X84, Y87.0)', 'leading_cause'] = 28
    df1.loc[df['leading_cause'] == 'Meningitis (G00, G03)', 'leading_cause'] = 29
    df1.loc[df['leading_cause'] == 'Mental and Behavioral Disorders due to Use of Alcohol (F10)', 'leading_cause'] = 30
    
    #Now getting rid of the race and ethnicities with unknown quantities for death and death rate
    df2 = df1.dropna() #dataframe modified
    df2 = df2[df2.race_ethnicity != 'Not Stated/Unknown']
    df2 = df2[df2.race_ethnicity != 'Other Race/ Ethnicity']
    
    df2.deaths= df2.deaths.astype(float)
    df2.age_adjusted_death_rate = df2.age_adjusted_death_rate.astype(float)
    df2.death_rate = df2.death_rate.astype(float)
    df2.leading_cause = df2.leading_cause.astype(float)
    df2.race_ethnicity = df2.race_ethnicity.astype(float)
    df2.sex = df2.sex.astype(float)
    df2.to_csv(r'C:\Users\wr225\Desktop\'NYC DEATH DATA.csv')
    
    return df, df1, df2

In [3]:
DeathRatePipeline('uvxr-2jwn')





(     age_adjusted_death_rate death_rate deaths leading_cause  \
 0                        7.6        6.2     32             0   
 1                        8.1        8.3     87             0   
 2                        7.1        6.1     71             0   
 3                          .          .      .             0   
 4                          .          .      .             0   
 5                        7.3       11.3    162             0   
 6                         13         11     53             0   
 7                       20.6       18.5    158             0   
 8                       17.4       14.1    154             0   
 9                          .          .      8             0   
 10                         .          .      5             0   
 11                      19.8       22.2    297             0   
 12                      44.6       35.3    182             1   
 13                     113.4      116.8   1230             1   
 14                      