# Cleaning up messy data

## Install

In [1]:
# Range of syft releases this notebook is written against.
SYFT_VERSION = ">=0.8.1b0,<0.9"
# Requirement specifier wrapped in double quotes so it can be interpolated
# into the (commented-out) pip command below.
package_string = '"syft{}"'.format(SYFT_VERSION)
# !pip install {package_string} -f https://whls.blob.core.windows.net/unstable/index.html -q

In [2]:
import syft as sy
# Check the installed syft against the SYFT_VERSION requirement; the outcome is
# printed in the cell output.
sy.requires(SYFT_VERSION)



✅ The installed version of syft==0.8.1b2 matches the requirement >=0.8.1b0 and the requirement <0.9


In [7]:
# Launch the node used throughout this notebook.
# NOTE(review): reset=True presumably starts from an empty store — confirm against the syft docs.
node = sy.orchestra.launch(name="pandas-test-domain-1", reset=True)

SQLite Store Path:
!open file:///var/folders/q1/ryq93kwj055dlbpngxv1c7z40000gn/T/281e55cb3425360e464682cfb4672fcb.sqlite



# Data owner: Upload data

In [8]:
# Log in with the data-owner account; this client is used below to upload the
# dataset and to register the data scientist's user.
root_domain_client = node.login(email="info@openmined.org", password="changethis")

## Load data

In [5]:
# The usual preamble
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from syft.service.project.project import Project
from syft.util.util import autocache, PANDAS_DATA

# Plot styling: ggplot theme, wide figures, sans-serif fonts.
plt.style.use('ggplot')
plt.rcParams.update({
    'figure.figsize': (15, 5),
    'font.family': 'sans-serif',
})

# Widen the DataFrame display so that many columns can be shown side by side.
pd.set_option('display.width', 5000)
pd.set_option('display.max_columns', 60)

One of the main problems with messy data is: how do you know if it's messy or not?

We're going to use the NYC 311 service request dataset again here, since it's big and a bit unwieldy.

## Create mock data

In [6]:
# Load the 311 service-request CSV with every column read as text.
# NOTE(review): autocache presumably downloads the file once and returns a local path — confirm in syft.util.util.
service_requests = pd.read_csv(autocache(f"{PANDAS_DATA}/311-service-requests.csv"), dtype='unicode')

In [7]:
# Preview the first 100 rows of the real data.
service_requests.head(100)

Unnamed: 0,Unique Key,Created Date,Closed Date,Agency,Agency Name,Complaint Type,Descriptor,Location Type,Incident Zip,Incident Address,Street Name,Cross Street 1,Cross Street 2,Intersection Street 1,Intersection Street 2,Address Type,City,Landmark,Facility Type,Status,Due Date,Resolution Action Updated Date,Community Board,Borough,X Coordinate (State Plane),Y Coordinate (State Plane),Park Facility Name,Park Borough,School Name,School Number,School Region,School Code,School Phone Number,School Address,School City,School State,School Zip,School Not Found,School or Citywide Complaint,Vehicle Type,Taxi Company Borough,Taxi Pick Up Location,Bridge Highway Name,Bridge Highway Direction,Road Ramp,Bridge Highway Segment,Garage Lot Name,Ferry Direction,Ferry Terminal Name,Latitude,Longitude,Location
0,26589651,10/31/2013 02:08:41 AM,,NYPD,New York City Police Department,Noise - Street/Sidewalk,Loud Talking,Street/Sidewalk,11432,90-03 169 STREET,169 STREET,90 AVENUE,91 AVENUE,,,ADDRESS,JAMAICA,,Precinct,Assigned,10/31/2013 10:08:41 AM,10/31/2013 02:35:17 AM,12 QUEENS,QUEENS,1042027,197389,Unspecified,QUEENS,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,N,,,,,,,,,,,,40.70827532593202,-73.79160395779721,"(40.70827532593202, -73.79160395779721)"
1,26593698,10/31/2013 02:01:04 AM,,NYPD,New York City Police Department,Illegal Parking,Commercial Overnight Parking,Street/Sidewalk,11378,58 AVENUE,58 AVENUE,58 PLACE,59 STREET,,,BLOCKFACE,MASPETH,,Precinct,Open,10/31/2013 10:01:04 AM,,05 QUEENS,QUEENS,1009349,201984,Unspecified,QUEENS,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,N,,,,,,,,,,,,40.721040535628305,-73.90945306791765,"(40.721040535628305, -73.90945306791765)"
2,26594139,10/31/2013 02:00:24 AM,10/31/2013 02:40:32 AM,NYPD,New York City Police Department,Noise - Commercial,Loud Music/Party,Club/Bar/Restaurant,10032,4060 BROADWAY,BROADWAY,WEST 171 STREET,WEST 172 STREET,,,ADDRESS,NEW YORK,,Precinct,Closed,10/31/2013 10:00:24 AM,10/31/2013 02:39:42 AM,12 MANHATTAN,MANHATTAN,1001088,246531,Unspecified,MANHATTAN,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,N,,,,,,,,,,,,40.84332975466513,-73.93914371913482,"(40.84332975466513, -73.93914371913482)"
3,26595721,10/31/2013 01:56:23 AM,10/31/2013 02:21:48 AM,NYPD,New York City Police Department,Noise - Vehicle,Car/Truck Horn,Street/Sidewalk,10023,WEST 72 STREET,WEST 72 STREET,COLUMBUS AVENUE,AMSTERDAM AVENUE,,,BLOCKFACE,NEW YORK,,Precinct,Closed,10/31/2013 09:56:23 AM,10/31/2013 02:21:10 AM,07 MANHATTAN,MANHATTAN,989730,222727,Unspecified,MANHATTAN,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,N,,,,,,,,,,,,40.7780087446372,-73.98021349023975,"(40.7780087446372, -73.98021349023975)"
4,26590930,10/31/2013 01:53:44 AM,,DOHMH,Department of Health and Mental Hygiene,Rodent,Condition Attracting Rodents,Vacant Lot,10027,WEST 124 STREET,WEST 124 STREET,LENOX AVENUE,ADAM CLAYTON POWELL JR BOULEVARD,,,BLOCKFACE,NEW YORK,,,Pending,11/30/2013 01:53:44 AM,10/31/2013 01:59:54 AM,10 MANHATTAN,MANHATTAN,998815,233545,Unspecified,MANHATTAN,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,N,,,,,,,,,,,,40.80769092704951,-73.94738703491433,"(40.80769092704951, -73.94738703491433)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,26592691,10/30/2013 11:36:44 PM,10/31/2013 01:05:44 AM,NYPD,New York City Police Department,Noise - Commercial,Loud Music/Party,Store/Commercial,10005,55 WALL STREET,WALL STREET,WILLIAM STREET,HANOVER STREET,,,ADDRESS,NEW YORK,,Precinct,Closed,10/31/2013 07:36:44 AM,10/31/2013 01:05:44 AM,01 MANHATTAN,MANHATTAN,981709,196561,Unspecified,MANHATTAN,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,N,,,,,,,,,,,,40.70619093472265,-74.00916484564914,"(40.70619093472265, -74.00916484564914)"
96,26593487,10/30/2013 11:34:49 PM,,NYPD,New York City Police Department,Panhandling,,Street/Sidewalk,11418,135-16 JAMAICA AVENUE,JAMAICA AVENUE,METROPOLITAN AVENUE,VAN WYCK EXPRESSWAY,,,ADDRESS,RICHMOND HILL,,Precinct,Open,10/31/2013 07:34:49 AM,,09 QUEENS,QUEENS,1034900,195238,Unspecified,QUEENS,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,N,,,,,,,,,,,,40.70241501845674,-73.81732645592662,"(40.70241501845674, -73.81732645592662)"
97,26595761,10/30/2013 11:33:36 PM,,NYPD,New York City Police Department,Noise - Vehicle,Car/Truck Horn,Street/Sidewalk,10458,2995 BOTANICAL SQUARE,BOTANICAL SQUARE,BEND,,,,ADDRESS,BRONX,,Precinct,Assigned,10/31/2013 07:33:36 AM,10/31/2013 01:14:48 AM,07 BRONX,BRONX,1016994,255432,Unspecified,BRONX,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,N,,,,,,,,,,,,40.86771566456841,-73.88161246880681,"(40.86771566456841, -73.88161246880681)"
98,26595305,10/30/2013 11:32:25 PM,10/31/2013 01:43:13 AM,NYPD,New York City Police Department,Noise - Commercial,Loud Music/Party,Club/Bar/Restaurant,10003,,,,,SECOND AVENUE,EAST 14 STREET,INTERSECTION,NEW YORK,,Precinct,Closed,10/31/2013 07:32:25 AM,10/31/2013 01:43:13 AM,03 MANHATTAN,MANHATTAN,988424,206091,Unspecified,MANHATTAN,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,N,,,,,,,,,,,,40.73234787144493,-73.98493936163591,"(40.73234787144493, -73.98493936163591)"


In [8]:
# Flag the rows whose zip code contains a dash (ZIP+4 style values such as
# 77092-2016) and show them.
# `na=False` makes missing zip codes count as "no match" directly, so the mask
# is a plain boolean Series without a separate fillna(False) pass on an
# object-dtype result; `regex=False` matches the dash as a literal character.
rows_with_dashes = service_requests['Incident Zip'].str.contains('-', regex=False, na=False)
service_requests[rows_with_dashes]

Unnamed: 0,Unique Key,Created Date,Closed Date,Agency,Agency Name,Complaint Type,Descriptor,Location Type,Incident Zip,Incident Address,Street Name,Cross Street 1,Cross Street 2,Intersection Street 1,Intersection Street 2,Address Type,City,Landmark,Facility Type,Status,Due Date,Resolution Action Updated Date,Community Board,Borough,X Coordinate (State Plane),Y Coordinate (State Plane),Park Facility Name,Park Borough,School Name,School Number,School Region,School Code,School Phone Number,School Address,School City,School State,School Zip,School Not Found,School or Citywide Complaint,Vehicle Type,Taxi Company Borough,Taxi Pick Up Location,Bridge Highway Name,Bridge Highway Direction,Road Ramp,Bridge Highway Segment,Garage Lot Name,Ferry Direction,Ferry Terminal Name,Latitude,Longitude,Location
29136,26550551,10/24/2013 06:16:34 PM,,DCA,Department of Consumer Affairs,Consumer Complaint,False Advertising,,77092-2016,2700 EAST SELTICE WAY,EAST SELTICE WAY,,,,,,HOUSTON,,,Assigned,11/13/2013 11:15:20 AM,10/29/2013 11:16:16 AM,0 Unspecified,Unspecified,,,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,N,,,,,,,,,,,,,,
30939,26548831,10/24/2013 09:35:10 AM,,DCA,Department of Consumer Affairs,Consumer Complaint,Harassment,,55164-0737,P.O. BOX 64437,64437,,,,,,ST. PAUL,,,Assigned,11/13/2013 02:30:21 PM,10/29/2013 02:31:06 PM,0 Unspecified,Unspecified,,,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,N,,,,,,,,,,,,,,
70539,26488417,10/15/2013 03:40:33 PM,,TLC,Taxi and Limousine Commission,Taxi Complaint,Driver Complaint,Street,11549-3650,365 HOFSTRA UNIVERSITY,HOFSTRA UNIVERSITY,,,,,,HEMSTEAD,,,Assigned,11/30/2013 01:20:33 PM,10/16/2013 01:21:39 PM,0 Unspecified,Unspecified,,,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,N,,,,La Guardia Airport,,,,,,,,,,
85821,26468296,10/10/2013 12:36:43 PM,10/26/2013 01:07:07 AM,DCA,Department of Consumer Affairs,Consumer Complaint,Debt Not Owed,,29616-0759,PO BOX 25759,BOX 25759,,,,,,GREENVILLE,,,Closed,10/26/2013 09:20:28 AM,10/26/2013 01:07:07 AM,0 Unspecified,Unspecified,,,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,N,,,,,,,,,,,,,,
89304,26461137,10/09/2013 05:23:46 PM,10/25/2013 01:06:41 AM,DCA,Department of Consumer Affairs,Consumer Complaint,Harassment,,35209-3114,600 BEACON PKWY,BEACON PKWY,,,,,,BIRMINGHAM,,,Closed,10/25/2013 02:43:42 PM,10/25/2013 01:06:41 AM,0 Unspecified,Unspecified,,,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,N,,,,,,,,,,,,,,


In [9]:
import random
def get_unique_key():
    """Return a random integer between 0 and 1000000 (both inclusive)."""
    lowest, highest = 0, 1000000
    return random.randint(lowest, highest)
    
def get_mock_location():
    """Return a random float drawn uniformly between -90 and 90."""
    lower_bound, upper_bound = -90, 90
    return random.uniform(lower_bound, upper_bound)

def get_zip_code():
    """Return a random mock zip code as a string.

    A five-digit number between 10000 and 11000 (inclusive) is drawn; numbers
    above 10990 get a '-1234' suffix so that the mock also contains a few
    dashed, longer-than-five-character zip codes.
    """
    number = random.randint(10000, 11000)
    base = str(number)
    return (base + '-1234') if number > 10990 else base

def get_mock_row(i):
    """Build one mock record by calling every generator in `mock_functions` once.

    `i` is accepted for the caller's convenience but is not used.
    Returns a dict mapping each key of `mock_functions` to a freshly
    generated value, in the mapping's iteration order.
    """
    return {column: make_value() for column, make_value in mock_functions.items()}
    

In [10]:
# Column name -> zero-argument callable producing one mock value for that column.
# Columns without an entry here stay empty in the mock frame.
# make mock as close to the original data as possible!!
# TODO: Make it the same as the OG dataframe
mock_functions = {
    "Unique Key": lambda: get_unique_key(),
    "Longitude": lambda: get_mock_location(),
    "Latitude": lambda: get_mock_location(),
    "Incident Zip": lambda: get_zip_code(),
    "City": lambda: random.choice(["BROOKLYN", "NEW YORK", "BRONX"]),
}

In [11]:
# Seed the generator so that re-running this cell rebuilds exactly the same
# mock rows; the mock is uploaded further down and should not change between
# runs of the notebook.
random.seed(0)
# One generated record per real row. `columns=` keeps the real column order and
# leaves every column that has no generator empty.
mock = pd.DataFrame(data=[get_mock_row(i) for i in range(len(service_requests))],
                    columns=service_requests.columns)

In [12]:
# Display the mock frame; only the generated columns are populated.
mock

Unnamed: 0,Unique Key,Created Date,Closed Date,Agency,Agency Name,Complaint Type,Descriptor,Location Type,Incident Zip,Incident Address,Street Name,Cross Street 1,Cross Street 2,Intersection Street 1,Intersection Street 2,Address Type,City,Landmark,Facility Type,Status,Due Date,Resolution Action Updated Date,Community Board,Borough,X Coordinate (State Plane),Y Coordinate (State Plane),Park Facility Name,Park Borough,School Name,School Number,School Region,School Code,School Phone Number,School Address,School City,School State,School Zip,School Not Found,School or Citywide Complaint,Vehicle Type,Taxi Company Borough,Taxi Pick Up Location,Bridge Highway Name,Bridge Highway Direction,Road Ramp,Bridge Highway Segment,Garage Lot Name,Ferry Direction,Ferry Terminal Name,Latitude,Longitude,Location
0,612181,,,,,,,,10187,,,,,,,,BROOKLYN,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,30.774987,8.291166,
1,713406,,,,,,,,10014,,,,,,,,BRONX,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-34.040373,-64.558004,
2,925242,,,,,,,,10298,,,,,,,,BRONX,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,37.426159,11.651187,
3,364755,,,,,,,,10528,,,,,,,,BRONX,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-51.514745,-80.884992,
4,607116,,,,,,,,10324,,,,,,,,BROOKLYN,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,14.539656,36.299214,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111064,653130,,,,,,,,10843,,,,,,,,BRONX,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-64.393793,18.498081,
111065,761659,,,,,,,,10983,,,,,,,,NEW YORK,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-59.850236,66.601919,
111066,866194,,,,,,,,10226,,,,,,,,NEW YORK,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-46.691337,-62.861331,
111067,471956,,,,,,,,10094,,,,,,,,BRONX,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,44.987801,86.983212,


Upload the data

In [13]:
# Pair the first 1000 real rows with the first 1000 mock rows in one asset,
# wrap it in a dataset named "test", and upload it with the data-owner client.
service_requests_asset = sy.Asset(
    name="service_requests",
    data=service_requests[:1000],
    mock=mock[:1000],  # TODO: remove :1000
    mock_is_real=False,
)
dataset = sy.Dataset(name="test", asset_list=[service_requests_asset])
root_domain_client.upload_dataset(dataset)


  0%|                                                                                                                 | 0/1 [00:00<?, ?it/s]

Uploading: service_requests


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.43it/s]


## Create user account

In [14]:
# Register the data scientist's account using the data-owner client.
user = root_domain_client.register(
    name="Jane Doe",
    email="jane@caltech.edu",
    password="abc123",
    institution="Caltech",
    website="https://www.caltech.edu/",
)
# todo: give user data scientist role
# Take a client from the node and log in with the account registered above.
guest_domain_client = node.client
guest_client = guest_domain_client.login(email="jane@caltech.edu", password="abc123")

# Data scientist: create result pointer

In [15]:
import numpy as np
import pandas as pd

## Summary
By the end of this chapter, we're going to have cleaned up the messiest column of the NYC 311 service request data: the zip codes.

We'll do this by looking at the unique values on the mock data, normalizing the missing values, truncating the long zip codes, and then requesting the result computed on the real data.

## Get mocks

In [16]:
# Take the first dataset listed by the guest client.
ds = guest_domain_client.datasets[0]

In [17]:
# Take the first asset of that dataset.
asset = ds.assets[0]

In [18]:
# Use the asset's mock as the working frame for the analysis below.
requests = asset.mock

## How do we know if it's messy?
We're going to look at a few columns here. I know already that there are some problems with the zip code, so let's look at that first.

To get a sense for whether a column has problems, I usually use `.unique()` to look at all its values. If it's a numeric column, I'll instead plot a histogram to get a sense of the distribution.

When we look at the unique values in "Incident Zip", it quickly becomes clear that this is a mess.

Some of the problems:

- Some have been parsed as strings, and some as floats
- There are `nans`
- Some of the zip codes are `29616-0759` or `83`
- There are some N/A values that pandas didn't recognize, like 'N/A' and 'NO CLUE'

What we can do:

- Normalize 'N/A' and 'NO CLUE' into regular nan values
- Look at what's up with the 83, and decide what to do
- Make everything strings

In [19]:
# Collect the distinct values of the zip-code column.
res = requests['Incident Zip'].unique()

## Fixing the nan values and string/float confusion
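With plain pandas we could pass a `na_values` option to `pd.read_csv` to clean this up a little bit, and also specify that the type of Incident Zip is a string, not a float. Here the data is already loaded, so we use `replace` to turn those values into nan instead.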



In [20]:
# Placeholder strings that should be treated as missing values.
na_values = ['NO CLUE', 'N/A', '0']
# `replace` returns a new object instead of modifying `requests` in place, so
# the result has to be bound back to the name for the clean-up to take effect.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
requests = requests.replace(na_values, np.nan)

In [21]:
# Look at the distinct zip values again.
requests['Incident Zip'].unique()

```python
TwinPointer(Mock)
```
array(['10187', '10014', '10298', '10528', '10324', '10959', '10159',
       '10138', '10325', '10970', '10981', '10831', '10319', '10483',
       '10441', '10702', '10432', '10394', '10294', '10920', '10488',
       '10404', '10360', '10860', '10879', '10639', '10418', '10695',
       '10344', '10349', '10853', '10040', '10815', '10782', '10204',
       '10978', '10156', '10323', '10503', '10993-1234', '10111', '10192',
       '10834', '10160', '10828', '10688', '10217', '10821', '10601',
       '10415', '10720', '10002', '10850', '10163', '10072', '10809',
       '10651', '10210', '10917', '10400', '10963', '10188', '10759',
       '10698', '10157', '10456', '10789', '10578', '10727', '10357',
       '10335', '10284', '10501', '10849', '10632', '10717', '10048',
       '10128', '10100', '10760', '10842', '10031', '10161', '10362',
       '10579', '10952', '10774', '10140', '10615', '10663', '10374',
       '10248', '10862', '10522', '10826', '10307', '10666', '10486',
       '10213', '10045', '10024', '10490', '10260', '10758', '10009',
       '10437', '10724', '10127', '10840', '10179', '10942', '10570',
       '10245', '10358', '10985', '10571', '10391', '10949', '10209',
       '10103', '10798', '10136', '10205', '10254', '10454', '10900',
       '10293', '10538', '10020', '10870', '10577', '10274', '10612',
       '10057', '10564', '10223', '10468', '10243', '10029', '10112',
       '10405', '10511', '10478', '10672', '10288', '10008', '10343',
       '10761', '10595', '10932', '10015', '10457', '10401', '10304',
       '10962', '10530', '10667', '10276', '10172', '10650', '10957',
       '10076', '10499', '10818', '10675', '10773', '10010', '10536',
       '10308', '10477', '10593', '10805', '10290', '10073', '10683',
       '10584', '10956', '10509', '10830', '10332', '10895', '10547',
       '10068', '10731', '10766', '10212', '10251', '10421', '10214',
       '10855', '10525', '10425', '10649', '10130', '10514', '10676',
       '10476', '10987', '10018', '10677', '10775', '10563', '10256',
       '10052', '10125', '10928', '10747', '10975', '10620', '10613',
       '10451', '10034', '10435', '10817', '10459', '10640', '10158',
       '10510', '10883', '10691', '10318', '10822', '10641', '10096',
       '10097', '10930', '10329', '10891', '10386', '10909', '10094',
       '10389', '10095', '10345', '10168', '10728', '10542', '10630',
       '10811', '10854', '10462', '10408', '10267', '10306', '10783',
       '10969', '10750', '10913', '10999-1234', '10028', '10504', '10635',
       '10361', '10861', '10935', '10491', '10154', '10836', '10631',
       '10916', '10060', '10662', '10433', '10786', '10105', '10769',
       '10321', '10565', '10642', '10876', '10356', '10685', '10692',
       '10527', '10832', '10597', '10670', '10838', '10574', '10039',
       '10380', '10897', '10047', '10926', '10881', '10575', '10768',
       '10277', '10938', '10548', '10941', '10541', '10955', '10065',
       '10077', '10244', '10152', '10022', '10221', '10041', '10770',
       '10481', '10971', '10544', '10412', '10823', '10327', '10229',
       '10203', '10801', '10053', '10953', '10364', '10516', '10313',
       '10684', '10373', '10121', '10122', '10189', '10363', '10201',
       '10586', '10545', '10931', '10986', '10079', '10198', '10185',
       '10634', '10153', '10417', '10439', '10280', '10977', '10512',
       '10665', '10589', '10703', '10493', '10264', '10007', '10474',
       '10236', '10258', '10894', '10464', '10576', '10873', '10467',
       '10606', '10240', '10673', '10336', '10449', '10502', '10194',
       '10756', '10958', '10812', '10893', '10133', '10604', '10119',
       '10165', '10908', '10741', '10475', '10596', '10609', '10484',
       '10948', '10102', '10540', '10107', '10301', '10696', '10933',
       '10113', '10118', '10814', '10513', '10929', '10950', '10585',
       '10393', '10278', '10027', '10699', '10139', '10084', '10771',
       '10594', '10151', '10687', '10208', '10471', '10989', '10443',
       '10737', '10810', '10376', '10902', '10847', '10535', '10372',
       '10690', '10285', '10656', '10108', '10657', '10230', '10653',
       '10069', '10252', '10384', '10019', '10954', '10452', '10636',
       '10381', '10763', '10104', '10302', '10779', '10369', '10967',
       '10216', '10036', '10800', '10061', '10865', '10469', '10961',
       '10115', '10592', '10337', '10992-1234', '10725', '10824', '10721',
       '10033', '10972', '10752', '10875', '10070', '10500', '10368',
       '10000', '10558', '10035', '10973', '10005', '10839', '10863',
       '10534', '10371', '10569', '10261', '10142', '10424', '10643',
       '10524', '10042', '10162', '10694', '10572', '10282', '10370',
       '10003', '10716', '10473', '10231', '10552', '10866', '10537',
       '10743', '10375', '10506', '10644', '10178', '10200', '10550',
       '10682', '10461', '10623', '10947', '10887', '10268', '10398',
       '10411', '10580', '10560', '10054', '10492', '10898', '10453',
       '10359', '10149', '10788', '10182', '10046', '10723', '10735',
       '10745', '10629', '10679', '10520', '10567', '10175', '10924',
       '10796', '10101', '10827', '10396', '10582', '10781', '10792',
       '10995-1234', '10382', '10171', '10813', '10546', '10890', '10056',
       '10338', '10864', '10341', '10808', '10174', '10674', '10934',
       '10117', '10736', '10884', '10013', '10247', '10222', '10845',
       '10790', '10310', '10166', '10803', '10399', '10272', '10614',
       '10968', '10445', '10787', '10017', '10246', '10976', '10026',
       '10638', '10438', '10062', '10718', '10523', '10334', '10465',
       '10098', '10074', '10531', '10994-1234', '10255', '10668', '10859',
       '10561', '10515', '10543', '10555', '10844', '10106', '10434',
       '10712', '10785', '10498', '10271', '10012', '10797', '10025',
       '10479', '10705', '10882', '10450', '10120', '10700', '10199',
       '10144', '10109', '10607', '10211', '10190', '10446', '10867',
       '10420', '10868', '10146', '10980', '10224', '10219', '10776',
       '10556', '10092', '10419', '10395', '10602', '10856', '10846',
       '11000-1234', '10746', '10880', '10099', '10237', '10090', '10044',
       '10872', '10116', '10599', '10126', '10082', '10383', '10693',
       '10124', '10387', '10937', '10088', '10804', '10300', '10043',
       '10317', '10207', '10299', '10233', '10726', '10974', '10480',
       '10554', '10497', '10049'], dtype=object)

## What's up with the dashes?

In [22]:
# Mask of rows whose zip contains a dash; missing values are filled with False.
rows_with_dashes = requests['Incident Zip'].str.contains('-').fillna(False)
# Number of rows selected by the mask.
len(requests[rows_with_dashes])

13

In [23]:
# Show the rows whose zip contains a dash.
requests[rows_with_dashes]

Unnamed: 0,Unique Key,Created Date,Closed Date,Agency,Agency Name,Complaint Type,Descriptor,Location Type,Incident Zip,Incident Address,Street Name,Cross Street 1,Cross Street 2,Intersection Street 1,Intersection Street 2,Address Type,City,Landmark,Facility Type,Status,Due Date,Resolution Action Updated Date,Community Board,Borough,X Coordinate (State Plane),Y Coordinate (State Plane),Park Facility Name,Park Borough,School Name,School Number,School Region,School Code,School Phone Number,School Address,School City,School State,School Zip,School Not Found,School or Citywide Complaint,Vehicle Type,Taxi Company Borough,Taxi Pick Up Location,Bridge Highway Name,Bridge Highway Direction,Road Ramp,Bridge Highway Segment,Garage Lot Name,Ferry Direction,Ferry Terminal Name,Latitude,Longitude,Location
40,345752,,,,,,,,10993-1234,,,,,,,,BRONX,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,52.692885,-11.267392,
289,910830,,,,,,,,10999-1234,,,,,,,,BRONX,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-64.673181,81.083536,
566,684397,,,,,,,,10993-1234,,,,,,,,NEW YORK,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,25.761696,-33.512858,
574,104186,,,,,,,,10992-1234,,,,,,,,BRONX,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-30.131025,31.949946,
620,640808,,,,,,,,10992-1234,,,,,,,,NEW YORK,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-32.906008,-5.679819,
704,534236,,,,,,,,10992-1234,,,,,,,,BROOKLYN,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-10.585313,24.252235,
711,570694,,,,,,,,10995-1234,,,,,,,,NEW YORK,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,55.632304,82.598033,
751,715653,,,,,,,,10992-1234,,,,,,,,BROOKLYN,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,60.123099,4.840987,
791,454375,,,,,,,,10995-1234,,,,,,,,NEW YORK,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-4.585399,-48.382215,
814,938498,,,,,,,,10994-1234,,,,,,,,BRONX,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.012553,-59.859291,


I thought these were missing data and originally deleted them like this:

`requests['Incident Zip'][rows_with_dashes] = np.nan`

But then my friend Dave pointed out that 9-digit zip codes are normal. Let's look at all the zip codes with more than 5 digits, make sure they're okay, and then truncate them.

In [24]:
# Mask of zip codes longer than five characters.
long_zip_codes = requests['Incident Zip'].str.len() > 5
# Distinct values among those long zip codes.
requests['Incident Zip'][long_zip_codes].unique()

```python
TwinPointer(Mock)
```
array(['10993-1234', '10999-1234', '10992-1234', '10995-1234',
       '10994-1234', '11000-1234'], dtype=object)

Those all look okay to truncate to me.

In [25]:
# Keep only the first five characters of every zip code.
requests['Incident Zip'] = requests['Incident Zip'].str.slice(0, 5)

Done.

Earlier I thought 00083 was a broken zip code, but it turns out Central Park's zip code is 00083! Shows what I know. I'm still concerned about the 00000 zip codes, though: let's look at that.

In [26]:
# Select the rows whose zip code is exactly '00000'.
requests[requests['Incident Zip'] == '00000'] 

Unnamed: 0,Unique Key,Created Date,Closed Date,Agency,Agency Name,Complaint Type,Descriptor,Location Type,Incident Zip,Incident Address,Street Name,Cross Street 1,Cross Street 2,Intersection Street 1,Intersection Street 2,Address Type,City,Landmark,Facility Type,Status,Due Date,Resolution Action Updated Date,Community Board,Borough,X Coordinate (State Plane),Y Coordinate (State Plane),Park Facility Name,Park Borough,School Name,School Number,School Region,School Code,School Phone Number,School Address,School City,School State,School Zip,School Not Found,School or Citywide Complaint,Vehicle Type,Taxi Company Borough,Taxi Pick Up Location,Bridge Highway Name,Bridge Highway Direction,Road Ramp,Bridge Highway Segment,Garage Lot Name,Ferry Direction,Ferry Terminal Name,Latitude,Longitude,Location


This looks bad to me. Let's set these to nan.

**note**: the cell below does not use `requests.loc[zero_zips, 'Incident Zip'] = np.nan`. Instead it takes the column out of the frame, updates it, and assigns it back. This is because of a limitation of the current version of syft: if we have a pointer and we do `ptr.a`, we actually get a copy of `a`. If we then update `a`, we are actually updating the copy, and not the `ptr`. This is a problem for `DataFrame.loc[x]`.

In [27]:
# Mask of rows whose zip code is exactly '00000'.
zero_zips = requests['Incident Zip'] == '00000'
# this is currently not possible
# requests.loc[zero_zips, 'Incident Zip'] = np.nan

# this is the workaround: take the column out, set the masked entries to nan on
# that column, then assign the whole column back to the frame
incident_zips = requests['Incident Zip']
incident_zips[zero_zips] = np.nan

requests['Incident Zip'] = incident_zips

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  local_func(*original_args, **original_kwargs)


Great. Let's see where we are now:

**This is supposed to error**

In [28]:
# this is supposed to error
# (the output below reports the in-place sort failing with
# "'<' not supported between instances of 'NoneType' and 'str'")
unique_zips = requests['Incident Zip'].unique()
unique_zips.sort()
unique_zips

send_action_side_effect failed with Got back unexpected response : Failed executing action ActionObject NumpyArrayObject[6b0..e].sort(,), result is an error: Failed executing action ActionObject NumpyArrayObject[6b0..e].sort(,), result is an error: '<' not supported between instances of 'NoneType' and 'str'
 Traceback (most recent call last):
  File "/Users/koen/workspace/PySyft/packages/syft/src/syft/service/action/action_object.py", line 330, in send_action_side_effect
    raise RuntimeError(f"Got back unexpected response : {action_result}")
RuntimeError: Got back unexpected response : Failed executing action ActionObject NumpyArrayObject[6b0..e].sort(,), result is an error: Failed executing action ActionObject NumpyArrayObject[6b0..e].sort(,), result is an error: '<' not supported between instances of 'NoneType' and 'str'



```python
TwinPointer(Mock)
```
array(['10000', '10002', '10003', '10005', '10007', '10008', '10009',
       '10010', '10012', '10013', '10014', '10015', '10017', '10018',
       '10019', '10020', '10022', '10024', '10025', '10026', '10027',
       '10028', '10029', '10031', '10033', '10034', '10035', '10036',
       '10039', '10040', '10041', '10042', '10043', '10044', '10045',
       '10046', '10047', '10048', '10049', '10052', '10053', '10054',
       '10056', '10057', '10060', '10061', '10062', '10065', '10068',
       '10069', '10070', '10072', '10073', '10074', '10076', '10077',
       '10079', '10082', '10084', '10088', '10090', '10092', '10094',
       '10095', '10096', '10097', '10098', '10099', '10100', '10101',
       '10102', '10103', '10104', '10105', '10106', '10107', '10108',
       '10109', '10111', '10112', '10113', '10115', '10116', '10117',
       '10118', '10119', '10120', '10121', '10122', '10124', '10125',
       '10126', '10127', '10128', '10130', '10133', '10136', '10138',
       '10139', '10140', '10142', '10144', '10146', '10149', '10151',
       '10152', '10153', '10154', '10156', '10157', '10158', '10159',
       '10160', '10161', '10162', '10163', '10165', '10166', '10168',
       '10171', '10172', '10174', '10175', '10178', '10179', '10182',
       '10185', '10187', '10188', '10189', '10190', '10192', '10194',
       '10198', '10199', '10200', '10201', '10203', '10204', '10205',
       '10207', '10208', '10209', '10210', '10211', '10212', '10213',
       '10214', '10216', '10217', '10219', '10221', '10222', '10223',
       '10224', '10229', '10230', '10231', '10233', '10236', '10237',
       '10240', '10243', '10244', '10245', '10246', '10247', '10248',
       '10251', '10252', '10254', '10255', '10256', '10258', '10260',
       '10261', '10264', '10267', '10268', '10271', '10272', '10274',
       '10276', '10277', '10278', '10280', '10282', '10284', '10285',
       '10288', '10290', '10293', '10294', '10298', '10299', '10300',
       '10301', '10302', '10304', '10306', '10307', '10308', '10310',
       '10313', '10317', '10318', '10319', '10321', '10323', '10324',
       '10325', '10327', '10329', '10332', '10334', '10335', '10336',
       '10337', '10338', '10341', '10343', '10344', '10345', '10349',
       '10356', '10357', '10358', '10359', '10360', '10361', '10362',
       '10363', '10364', '10368', '10369', '10370', '10371', '10372',
       '10373', '10374', '10375', '10376', '10380', '10381', '10382',
       '10383', '10384', '10386', '10387', '10389', '10391', '10393',
       '10394', '10395', '10396', '10398', '10399', '10400', '10401',
       '10404', '10405', '10408', '10411', '10412', '10415', '10417',
       '10418', '10419', '10420', '10421', '10424', '10425', '10432',
       '10433', '10434', '10435', '10437', '10438', '10439', '10441',
       '10443', '10445', '10446', '10449', '10450', '10451', '10452',
       '10453', '10454', '10456', '10457', '10459', '10461', '10462',
       '10464', '10465', '10467', '10468', '10469', '10471', '10473',
       '10474', '10475', '10476', '10477', '10478', '10479', '10480',
       '10481', '10483', '10484', '10486', '10488', '10490', '10491',
       '10492', '10493', '10497', '10498', '10499', '10500', '10501',
       '10502', '10503', '10504', '10506', '10509', '10510', '10511',
       '10512', '10513', '10514', '10515', '10516', '10520', '10522',
       '10523', '10524', '10525', '10527', '10528', '10530', '10531',
       '10534', '10535', '10536', '10537', '10538', '10540', '10541',
       '10542', '10543', '10544', '10545', '10546', '10547', '10548',
       '10550', '10552', '10554', '10555', '10556', '10558', '10560',
       '10561', '10563', '10564', '10565', '10567', '10569', '10570',
       '10571', '10572', '10574', '10575', '10576', '10577', '10578',
       '10579', '10580', '10582', '10584', '10585', '10586', '10589',
       '10592', '10593', '10594', '10595', '10596', '10597', '10599',
       '10601', '10602', '10604', '10606', '10607', '10609', '10612',
       '10613', '10614', '10615', '10620', '10623', '10629', '10630',
       '10631', '10632', '10634', '10635', '10636', '10638', '10639',
       '10640', '10641', '10642', '10643', '10644', '10649', '10650',
       '10651', '10653', '10656', '10657', '10662', '10663', '10665',
       '10666', '10667', '10668', '10670', '10672', '10673', '10674',
       '10675', '10676', '10677', '10679', '10682', '10683', '10684',
       '10685', '10687', '10688', '10690', '10691', '10692', '10693',
       '10694', '10695', '10696', '10698', '10699', '10700', '10702',
       '10703', '10705', '10712', '10716', '10717', '10718', '10720',
       '10721', '10723', '10724', '10725', '10726', '10727', '10728',
       '10731', '10735', '10736', '10737', '10741', '10743', '10745',
       '10746', '10747', '10750', '10752', '10756', '10758', '10759',
       '10760', '10761', '10763', '10766', '10768', '10769', '10770',
       '10771', '10773', '10774', '10775', '10776', '10779', '10781',
       '10782', '10783', '10785', '10786', '10787', '10788', '10789',
       '10790', '10792', '10796', '10797', '10798', '10800', '10801',
       '10803', '10804', '10805', '10808', '10809', '10810', '10811',
       '10812', '10813', '10814', '10815', '10817', '10818', '10821',
       '10822', '10823', '10824', '10826', '10827', '10828', '10830',
       '10831', '10832', '10834', '10836', '10838', '10839', '10840',
       '10842', '10844', '10845', '10846', '10847', '10849', '10850',
       '10853', '10854', '10855', '10856', '10859', '10860', '10861',
       '10862', '10863', '10864', '10865', '10866', '10867', '10868',
       '10870', '10872', '10873', '10875', '10876', '10879', '10880',
       '10881', '10882', '10883', '10884', '10887', '10890', '10891',
       '10893', '10894', '10895', '10897', '10898', '10900', '10902',
       '10908', '10909', '10913', '10916', '10917', '10920', '10924',
       '10926', '10928', '10929', '10930', '10931', '10932', '10933',
       '10934', '10935', '10937', '10938', '10941', '10942', '10947',
       '10948', '10949', '10950', '10952', '10953', '10954', '10955',
       '10956', '10957', '10958', '10959', '10961', '10962', '10963',
       '10967', '10968', '10969', '10970', '10971', '10972', '10973',
       '10974', '10975', '10976', '10977', '10978', '10980', '10981',
       '10985', '10986', '10987', '10989', '10992', '10993', '10994',
       '10995', '10999', '11000'], dtype=object)

Amazing! This is much cleaner. There's something a bit weird here, though -- I looked up 77056 on Google maps, and that's in Texas.

Let's take a closer look:

In [29]:
# Work on the zip-code column on its own.
zips = requests['Incident Zip']
# Let's say the zips starting with '0' and '1' are okay, for now. (this isn't actually true -- 13221 is in Syracuse, and why?)
is_close = zips.str.startswith('0') | zips.str.startswith('1')
# There are a bunch of NaNs, but we're not interested in them right now, so we'll say they're False
# is_far: the zip is present and does not start with '0' or '1'.
is_far = ~(is_close) & zips.notnull()

In [30]:
# Show the zip codes flagged as far away.
zips[is_far]

```python
TwinPointer(Mock)
```
Series([], Name: Incident Zip, dtype: object)

Okay, there really are requests coming from LA and Houston! Good to know. Filtering by zip code is probably a bad way to handle this -- we should really be looking at the city instead.



In [31]:
# Count requests per city, upper-casing the names first so case variants are merged.
requests['City'].str.upper().value_counts()

```python
TwinPointer(Mock)
```
BROOKLYN    356
NEW YORK    331
BRONX       313
Name: City, dtype: int64

It looks like these are legitimate complaints, so we'll just leave them alone.

## Putting it together

In [32]:
# Submit a request for `unique_zips` through the guest client; the output below
# shows the created request with status PENDING.
unique_zips.request(guest_client)

```python
class Request:
  id: str = 85105c1062804b20b7bfc374c8f84762
  requesting_user_verify_key: str = e8fe9a4423dac689c793b029e9ceed704042ba405e362d40cd774adcf9be7fa2
  approving_user_verify_key: str = None
  request_time: str = 2023-05-25 10:54:12
  approval_time: str = None
  status: str = RequestStatus.PENDING
  node_uid: str = 281e55cb3425360e464682cfb4672fcb
  request_hash: str = "e0894b30a3c62e469f3b1b6bc1327dba5703cf8d3235734b9a3b51ca438b3de5"
  changes: str = [syft.service.request.request.ActionStoreChange]

```

Request code execution

# Data owner: approve request

In [33]:
# Log in again with the data-owner account to handle the request.
root_domain_client = node.login(email="info@openmined.org", password="changethis")

In [34]:
# Approve the first request in the request list.
root_domain_client.api.services.request[0].approve()

# Data scientist: fetch result

In [35]:
# Log in again with the data scientist's account.
guest_client = guest_domain_client.login(email="jane@caltech.edu", password="abc123")

In [36]:
# Fetch the value behind `unique_zips` through the guest client.
real_result = unique_zips.get_from(guest_client)

In [37]:
# Display the fetched result.
real_result

array(['11432', '11378', '10032', '10023', '10027', '11372', '11419',
       '11417', '10011', '11225', '11218', '10003', '10029', '10466',
       '11219', '10025', '10310', '11236', None, '10033', '11216',
       '10016', '10305', '10312', '10026', '10309', '10036', '11433',
       '11235', '11213', '11379', '11101', '10014', '11231', '11234',
       '10457', '10459', '10465', '11207', '10002', '10034', '11233',
       '10453', '10456', '10469', '11374', '11221', '11421', '11215',
       '10007', '10019', '11205', '11418', '11369', '11249', '10005',
       '10009', '11211', '11412', '10458', '11229', '10065', '10030',
       '11222', '10024', '10013', '11420', '11365', '10012', '11214',
       '11212', '10022', '11232', '11040', '11226', '10281', '11102',
       '11208', '10001', '10472', '11414', '11223', '10040', '11220',
       '11373', '11203', '11691', '11356', '10017', '10452', '10280',
       '11217', '10031', '11201', '11358', '10128', '11423', '10039',
       '10010', '11209'

In [38]:
# NOTE(review): presumably shuts down the node launched at the top of the notebook — confirm against the syft docs.
node.land()