In [1]:
# Code borrowed from https://github.com/anisfeld/MachineLearning/blob/master/Diagnostic/ML%20HW0-1.ipynb
import numpy as np
import pandas as pd
import requests
from urllib.parse import quote_plus, urlencode

In [219]:
BASE_URL = "https://data.cityofchicago.org/resource/6zsd-86xi.json?$query="


#https://data.cityofchicago.org/resource/6zsd-86xi.json?$?iucr=110
        
def encode_query(dict, base_url=BASE_URL):
    '''
    Take dictionary with SQL statements parts and combine them into a web-safe string
    
    e.g.
    dict = {"SELECT": ["community_area",
                    "latitude",
                    "longitude",
                    "date"], 
            "WHERE": "iucr in ('2024','2025')",
            "LIMIT": "10"}
            
    returns: 'https://data.cityofchicago.org/resource/6zsd-86xi.json?$query=SELECT+community_area%2C+
              latitude%2C+longitude%2C+date+WHERE+iucr+in+%28%272024%27%2C%272025%27%29+LIMIT+10+'
    '''
    encoded = ""
    for key, val in dict.items():
        if isinstance(val, list):
            try:
                val = ", ".join(val)
            except:
                return "Lists must contain strings only"
        try:
            encoded += key + " " + val + " "
        except:
            return "Dictionary values must be strings or list of strings"
    return base_url + quote_plus(encoded)


def call_socrata_api(query_dict, base_url=BASE_URL, endpoint=None, verbose=False):
    '''
    Inputs: endpoint (string) (e.g.: "yama-9had.json?", optional if base_url doesn't include one) 
            query_dict (dictionary) (SoQL statements divided into constituent parts)
    '''
    
    # Ensure all data is downloaded in one API call (requires endpoint version 2.1)
    count_query = base_url + "SELECT%20count(*)"
    r = requests.get(count_query)
    query_dict["LIMIT"] = r.json()[0]["count"]
    
    query_url = encode_query(query_dict, base_url)

    if verbose:
        print("query url: ", query_url)
    
    return pd.read_json(query_url)


def build_query_dict(code_data, base_query, code_col="IUCR", soql_name="iucr", group_col="Type"):
    '''
    Put information about crime codes into usable format.
    
    code_data (pd.Dataframe) with two columns
    
    return: dict of form:
    {'Aggravated assault/battery': {'SELECT': ['id',
                                               'date',
                                               'iucr',
                                               'latitude',
                                               'longitude',
                                               'ward',
                                               'community_area',
                                               'district'],
                                    'WHERE': "iucr in ('141A', '141B')"},
    ...}
    '''
    soql_dict = {}
    types = code_data.groupby([group_col])
    for key,val in types.groups.items():
        soql_dict[key] = base_query.copy()
        codes = chicago_codes(code_data.ix[val,code_col])
        where = soql_name + " in "  + str(tuple(codes))
        soql_dict[key]['WHERE'] = where
    return soql_dict


def chicago_codes(col):
    return [str(x) if len(str(x)) == 4 else '0' + str(x) for x in col]


def make_crime_data_frame(query_dict, verbose=True):
    '''
    Idiosyncratic command that collects crime data and cleans up pandas dataframe 
    '''
    df = pd.DataFrame()
    for key, query in query_dict.items():
        temp_df = call_socrata_api(query, verbose=True)
        temp_df[key] = 1
        df = pd.concat([df,temp_df], axis=0)
    return df


In [246]:
# Import IUCR codes with names.
code_data = pd.read_excel('Crime definitions.xlsx')
code_data.fillna(0, inplace=True)
crime_types = list(query_dict.keys())

Violent = ['Aggravated assault/battery', 'Criminal sexual assault', 'Homicide', 'Robbery']
Property = ['Arson','Burglary', 'Motor vehicle theft','Larceny']


# Build mapping from name to IUCR code
base_query = {"SELECT": ["id",
                        "date",
                        "iucr",
                        "latitude",
                        "longitude",
                        "ward",
                        "community_area",
                        "district"]}

query_dict = build_query_dict(code_data, base_query)

# pull crime data from Socrata
#crime_data = make_crime_data_frame(query_dict)

#crime_data=crime_data.reset_index(drop=True)
crime_data.ix[:,crime_types] = crime_data.ix[:,crime_types ].fillna(0)

crime_data["Violent"] = crime_data[Violent].sum(1)
crime_data["Property"] = crime_data[Property].sum(1)
crime_data["Index"] = crime_data[["Violent","Property"]].sum(1)


public = code_data[["Public"]]
public["IUCR"] = pd.DataFrame(chicago_codes(code_data["IUCR"]))

crime_data = pd.merge(crime_data,public, left_on="iucr", right_on="IUCR")
crime_data.drop(["IUCR"], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [252]:
crime_data.to_csv("Chicago_Crimes.csv")

In [None]:
def get_com_areas(df, right_on="area_numbe"):
    '''
    downloads community areas information from data portal and appends to data frame
    '''
    try:
        community_areas=pd.read_json("https://data.cityofchicago.org/resource/igwz-8jzy.json")
    except:
        return "download failed"
    
    com_areas = community_areas.ix[:,["area_numbe","community"]]
    df = pd.merge(df, com_areas, left_on="community_area", right_on=right_on)
    return df

g = code_dict.groupby(['Type'])
gr = g.groups['Aggravated assault/battery']
str(tuple([str(x) for x in code_dict.ix[gr,"IUCR"]]))
g.groups