# Data Extraction Notebook

This notebook extracts the Chicago crime dataset and loads it into a pandas DataFrame. Because the full dataset is very large, we only download a sampled subset of it. The notebook produces pickled files of the resulting DataFrames.


In [None]:
!pip install sodapy==2.1.0
import pandas as pd
from sodapy import Socrata
import numpy as np
from tqdm import tqdm

Collecting sodapy==2.1.0
  Downloading sodapy-2.1.0-py2.py3-none-any.whl.metadata (15 kB)
Downloading sodapy-2.1.0-py2.py3-none-any.whl (14 kB)
Installing collected packages: sodapy
Successfully installed sodapy-2.1.0


### Utility functions

In [None]:
def get_part_data(client, idx, limit=500, dataset_id="ijzp-q8t2"):
  '''
  Fetch one page of records from the dataset and return it as a DataFrame.

  The dataset is huge, so instead of downloading it whole, callers request
  small pages starting at different offsets in order to collect samples
  spread over the data.

  Parameters
  ----------
  client : object
      Anything exposing ``get(dataset_id, offset=..., limit=...)`` that
      returns an iterable of records (in this notebook, a ``Socrata`` client).
  idx : int
      Row offset at which the requested page starts.
  limit : int, default 500
      Maximum number of rows requested for the page.
  dataset_id : str, default "ijzp-q8t2"
      Identifier of the dataset to query; the default is the dataset this
      notebook downloads.

  Returns
  -------
  pandas.DataFrame
      One row per record returned by the client (empty when the client
      returns no records).
  '''
  records = client.get(dataset_id, offset=idx, limit=limit)
  return pd.DataFrame.from_records(records)


### Download the data

In [None]:
# Download the sample: one page of rows at each of 100 evenly spaced
# offsets between 0 and 1,000,000 (inclusive), with a 60-second timeout.
client = Socrata("data.cityofchicago.org", None, timeout=60)
offsets = np.linspace(0, 1000000, num=100, dtype=np.int64)

# One DataFrame per requested page; tqdm shows download progress.
data = [get_part_data(client, offset) for offset in tqdm(offsets)]

# Stack all pages into a single frame with a fresh 0..n-1 index, and keep
# only the columns up to and including 'location'.
df = pd.concat(data, ignore_index=True).loc[:, :'location']

100%|██████████| 100/100 [04:54<00:00,  2.94s/it]


### Data preprocessing

In [None]:
# Convert both timestamp columns ('date' and 'updated_on') to pandas
# datetimes, each column being parsed independently.
df[['date', 'updated_on']] = df[['date', 'updated_on']].apply(pd.to_datetime)

In [None]:
# Discard every row that has a missing value in at least one column,
# then display the cleaned frame as the cell output.
df = df.dropna(axis=0, how='any')
df

Unnamed: 0,id,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,...,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location
0,13654954,JH497594,2024-11-01 00:00:00,043XX W MONTROSE AVE,0710,THEFT,THEFT FROM MOTOR VEHICLE,PARKING LOT / GARAGE (NON RESIDENTIAL),False,False,...,39,16,06,1146714,1928893,2024,2024-11-08 15:41:24,41.960858577,-87.735994826,"{'latitude': '41.960858577', 'longitude': '-87..."
1,13651154,JH492982,2024-11-01 00:00:00,032XX W 55TH ST,0320,ROBBERY,STRONG ARM - NO WEAPON,STREET,False,False,...,14,63,03,1155548,1867964,2024,2024-11-08 15:41:24,41.793489634,-87.70515582,"{'latitude': '41.793489634', 'longitude': '-87..."
2,13650227,JH491940,2024-11-01 00:00:00,089XX S LAFLIN ST,0560,ASSAULT,SIMPLE,RESIDENCE,False,True,...,21,73,08A,1167901,1845454,2024,2024-11-08 15:41:24,41.731462584,-87.660503907,"{'latitude': '41.731462584', 'longitude': '-87..."
3,13650446,JH491862,2024-11-01 00:00:00,019XX S STATE ST,1320,CRIMINAL DAMAGE,TO VEHICLE,RESIDENCE - GARAGE,False,False,...,3,33,14,1176627,1890987,2024,2024-11-08 15:41:24,41.856218005,-87.627167159,"{'latitude': '41.856218005', 'longitude': '-87..."
4,13649598,JH490987,2024-11-01 00:00:00,034XX W 65TH PL,0486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,...,14,66,08B,1154658,1860975,2024,2024-11-08 15:41:24,41.77432858,-87.708605612,"{'latitude': '41.77432858', 'longitude': '-87...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,12149537,JD348376,2020-08-28 16:30:00,035XX W 26TH ST,0860,THEFT,RETAIL THEFT,DRUG STORE,False,False,...,22,30,06,1153305,1886502,2020,2020-09-04 15:40:59,41.844405072,-87.712889949,"{'latitude': '41.844405072', 'longitude': '-87..."
49996,12152992,JD351499,2020-08-28 16:30:00,079XX S INGLESIDE AVE,0850,THEFT,ATTEMPT THEFT,RESIDENCE,False,False,...,8,44,06,1183939,1852562,2020,2020-09-04 15:40:59,41.750608654,-87.601529598,"{'latitude': '41.750608654', 'longitude': '-87..."
49997,12150270,JD349275,2020-08-28 16:30:00,053XX W BLOOMINGDALE AVE,0820,THEFT,$500 AND UNDER,VEHICLE NON-COMMERCIAL,False,False,...,37,25,06,1140633,1911454,2020,2020-09-04 15:40:59,41.913118361,-87.758781697,"{'latitude': '41.913118361', 'longitude': '-87..."
49998,12149588,JD348492,2020-08-28 16:30:00,0000X W ERIE ST,0820,THEFT,$500 AND UNDER,STREET,False,False,...,42,8,06,1176097,1904775,2020,2020-09-04 15:40:59,41.894065054,-87.628697034,"{'latitude': '41.894065054', 'longitude': '-87..."


In [None]:
# Draw a reproducible random subset (fixed seed) for quick debugging runs.
# Note: df itself is left unchanged, so the debug rows are still part of df.
# The sample size is capped at len(df) so that a small download (fewer than
# 1000 rows left after cleaning) does not make sample() raise ValueError.
debug = df.sample(n=min(1000, len(df)), random_state=42)

In [None]:
# Persist both frames in pickle format in the working directory:
# the full cleaned sample and the smaller debugging subset.
pd.to_pickle(df, 'CrimeData')
pd.to_pickle(debug, 'Debug')