In [1]:
import pandas as pd

# Each entry pairs a state's display name with its two-letter code
# (the 50 states plus the District of Columbia).
_state_code_pairs = [
    ("Alabama", "AL"), ("Alaska", "AK"), ("Arizona", "AZ"), ("Arkansas", "AR"),
    ("California", "CA"), ("Colorado", "CO"), ("Connecticut", "CT"), ("Delaware", "DE"),
    ("District Of Columbia", "DC"), ("Florida", "FL"), ("Georgia", "GA"), ("Hawaii", "HI"),
    ("Idaho", "ID"), ("Illinois", "IL"), ("Indiana", "IN"), ("Iowa", "IA"),
    ("Kansas", "KS"), ("Kentucky", "KY"), ("Louisiana", "LA"), ("Maine", "ME"),
    ("Maryland", "MD"), ("Massachusetts", "MA"), ("Michigan", "MI"), ("Minnesota", "MN"),
    ("Mississippi", "MS"), ("Missouri", "MO"), ("Montana", "MT"), ("Nebraska", "NE"),
    ("Nevada", "NV"), ("New Hampshire", "NH"), ("New Jersey", "NJ"), ("New Mexico", "NM"),
    ("New York", "NY"), ("North Carolina", "NC"), ("North Dakota", "ND"), ("Ohio", "OH"),
    ("Oklahoma", "OK"), ("Oregon", "OR"), ("Pennsylvania", "PA"), ("Rhode Island", "RI"),
    ("South Carolina", "SC"), ("South Dakota", "SD"), ("Tennessee", "TN"), ("Texas", "TX"),
    ("Utah", "UT"), ("Vermont", "VT"), ("Virginia", "VA"), ("Washington", "WA"),
    ("West Virginia", "WV"), ("Wisconsin", "WI"), ("Wyoming", "WY"),
]

# Column-oriented view of the pairs; `df` is the base table to which the
# following cells add one column per statistic.
states = {
    'State': [name for name, _ in _state_code_pairs],
    'Code': [code for _, code in _state_code_pairs],
}
df = pd.DataFrame(states)

# QuickFacts variable codes from the U.S. Census Bureau
# (https://www.census.gov/quickfacts), each mapped to the label used as its
# column name in the combined table.
# NOTE(review): the DIS010220 and HEA775220 labels say "under 25 years"; the
# QuickFacts definitions of these variables appear to read "under age 65
# years" -- confirm. "accomodation" is also misspelled. Both are left as-is
# here because the labels end up as column names in the saved CSV.
_quickfacts_labels = {
    'PST045221': 'Population Estimate',
    'SEX255220': 'Sex (Female persons, percent)',
    'RHI125220': 'Race (White Alone)',
    'RHI225220': 'Race (Black Alone)',
    'RHI425220': 'Race (Asian Alone)',
    'VET605220': 'Veterans',
    'HSG445220': 'Owner-occupied housing unit rate',
    'HSG495220': 'Median value of owner-occupied housing units',
    'HSG860220': 'Median gross rent',
    'HSD310220': 'Persons per household',
    'POP715220': 'Living in same house 1 year ago',
    'COM100220': 'Households with a computer',
    'INT100220': 'Households with a broadband internet subscription',
    'EDU635220': 'High school graduate or higher, percent above 25 years',
    'EDU685220': 'Bachelors degree or higher, percent above 25 years',
    'DIS010220': 'With a disability, percent under 25 years',
    'HEA775220': 'Without health insurance, percent under 25 years',
    'LFE041220': 'In civilian labour force, total, percent above 16 years',
    'LFE046220': 'In civilian labour force, female, percent above 16 years',
    'AFN120212': 'Total accomodation and food services sales ($1000) 2012',
    'HCN010212': 'Total health care and social assistance revenue ($1000) 2012',
    'MAN450212': 'Total manufacturers shipments ($1000) 2012',
    'LFE305220': 'Mean time to travel to work (minutes)',
    'INC110220': 'Median household income',
    'INC910220': 'Per capita income',
    'IPE120220': 'Persons in poverty, percent',
    'BZA010219': 'Total employer establishments',
    'BZA110219': 'Total employment',
    'BZA210219': 'Total annual payroll ($1000)',
    'SBO010212': 'Men-owned firms',
    'SBO020212': 'Women-owned firms',
    'SBO030212': 'Minority-owned firms',
    'SBO040212': 'Nonminority-owned firms',
    'LND110210': 'Land area in square miles',
}

# The retrieval loop reads each entry as [code, label], so expose the mapping
# as a list of two-item lists in insertion order.
datasets = [[code, label] for code, label in _quickfacts_labels.items()]

import requests
import json

# Fetch every QuickFacts variable listed in `datasets` and store it in `df`
# under its label, matching each returned geography to a row by its code.
QUICKFACTS_URL = "https://www.census.gov/quickfacts/geo/json/00/{code}"
# requests applies no timeout by default, so a stalled connection would hang
# the cell forever without this.
REQUEST_TIMEOUT_SECONDS = 30

for dataset in datasets:
    code, label = dataset
    print(f"Retrieving {code} - {label} dataset...")
    response = requests.get(QUICKFACTS_URL.format(code=code), timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail here with a clear HTTP error rather than later, while trying to
    # parse an error page as JSON.
    response.raise_for_status()
    data = response.json()
    state_dict = data['data']
    for entry in state_dict.values():
        # Entries whose shortName is not present in df['Code'] select no rows,
        # so they leave the existing values untouched.
        df.loc[df['Code'] == entry['shortName'], label] = entry['data'][code]['value']

Retrieving PST045221 - Population Estimate dataset...
Retrieving SEX255220 - Sex (Female persons, percent) dataset...
Retrieving RHI125220 - Race (White Alone) dataset...
Retrieving RHI225220 - Race (Black Alone) dataset...
Retrieving RHI425220 - Race (Asian Alone) dataset...
Retrieving VET605220 - Veterans dataset...
Retrieving HSG445220 - Owner-occupied housing unit rate dataset...
Retrieving HSG495220 - Median value of owner-occupied housing units dataset...
Retrieving HSG860220 - Median gross rent dataset...
Retrieving HSD310220 - Persons per household dataset...
Retrieving POP715220 - Living in same house 1 year ago dataset...
Retrieving COM100220 - Households with a computer dataset...
Retrieving INT100220 - Households with a broadband internet subscription dataset...
Retrieving EDU635220 - High school graduate or higher, percent above 25 years dataset...
Retrieving EDU685220 - Bachelors degree or higher, percent above 25 years dataset...
Retrieving DIS010220 - With a disability,

In [2]:
# Arrests-by-state table (2019) from the U.S. FBI Uniform Crime Reporting program:
# https://ucr.fbi.gov/crime-in-the-u.s/2019/crime-in-the-u.s.-2019/downloads/download-printable-files
arrests_csv_path = "data/Table_69_Arrest_by_State_2019.csv"

# reset_index() keeps a 0..n-1 row index and stores the previous index in a
# new 'index' column.
csv_data = pd.read_csv(arrests_csv_path).reset_index()

# Offense categories to copy from the arrests table, one per line. Each name
# is used verbatim as a column key of the CSV (so irregular spacing such as
# "Larceny- theft" matters), and " arrests" is appended to form the column
# name in df.
# NOTE(review): the recorded cell output prints "Drug  abuse violations" with
# two spaces while the entry below has one -- verify against the CSV header.
crimes = """
Total all classes
Violent crime
Property crime
Murder and nonnegligent manslaughter
Rape
Robbery
Aggravated assault
Burglary
Larceny- theft
Motor vehicle theft
Arson
Other assaults
Forgery and counterfeiting
Fraud
Embezzlement
Stolen property
Vandalism
Weapons
Prostitution
Sex offenses (except rape and prostitution)
Drug abuse violations
Gambling
Offenses against the family and children
Driving under the influence
Liquor laws
Drunkenness
Disorderly conduct
Vagrancy
All other offenses (except traffic)
Suspicion
Curfew and loitering law violations
""".strip().splitlines()

# Copy arrest figures from the FBI table into df, one "<crime> arrests" column
# per category. A row with a non-null 'State' cell names a state, and the
# figure stored for it is read from the row directly below it (index + 1).
# NOTE(review): presumably the row below holds the state's all-ages total --
# confirm against the CSV layout. If the last CSV row carried a state name,
# the index + 1 lookup would raise a KeyError.

# Both of these are the same for every crime, so compute them once instead of
# re-scanning the whole table for each category.
state_rows = [
    (row_index, state_name)
    for row_index, state_name in csv_data['State'].items()
    if not pd.isnull(state_name)
]
# CSV state names are compared against the upper-cased names from df.
upper_state_names = df['State'].str.upper()

for crime in crimes:
    print(f"Adding {crime} arrests...")
    crime_values = csv_data[crime]
    for row_index, state_name in state_rows:
        df.loc[upper_state_names == state_name, crime + " arrests"] = crime_values[row_index + 1]

Adding Total all classes arrests...
Adding Violent crime arrests...
Adding Property crime arrests...
Adding Murder and nonnegligent manslaughter arrests...
Adding Rape arrests...
Adding Robbery arrests...
Adding Aggravated assault arrests...
Adding Burglary arrests...
Adding Larceny- theft arrests...
Adding Motor vehicle theft arrests...
Adding Arson arrests...
Adding Other assaults arrests...
Adding Forgery and counterfeiting arrests...
Adding Fraud arrests...
Adding Embezzlement arrests...
Adding Stolen property arrests...
Adding Vandalism arrests...
Adding Weapons arrests...
Adding Prostitution arrests...
Adding Sex offenses (except rape and prostitution) arrests...
Adding Drug  abuse violations arrests...
Adding Gambling arrests...
Adding Offenses against the family and children arrests...
Adding Driving under the influence arrests...
Adding Liquor laws arrests...
Adding Drunkenness arrests...
Adding Disorderly conduct arrests...
Adding Vagrancy arrests...
Adding All other offenses

In [3]:
# Data retrieved from The Cook Political Report (https://www.cookpolitical.com/2020-national-popular-vote-tracker)
csv_data = pd.read_csv("data/2020-national-popular-vote.csv")
csv_data = csv_data.reset_index()

# Match state names ignoring case and surrounding whitespace, in line with the
# upper-cased comparison used when merging the arrests table. The states table
# spells "District Of Columbia" with a capital "Of", which an exact comparison
# against the usual "District of Columbia" spelling would silently miss.
# NOTE(review): the CSV contents are not visible here -- confirm its spelling.
state_keys = df['State'].str.strip().str.casefold()

print("Adding electoral results...")
for index, row in csv_data.iterrows():
    # str() keeps a missing (NaN) name from raising; "nan" matches no state.
    csv_key = str(row['state']).strip().casefold()
    df.loc[state_keys == csv_key, "Electoral Result"] = row['called']


Adding electoral results...


In [4]:
# Write the combined table to disk; index=False leaves the row index out of
# the file.
output_path = 'collected_data.csv'
df.to_csv(output_path, index=False)
print("Successfully saved to csv")

Successfully saved to csv
