# Cleaning up messy data

## Install

In [None]:
# Version range of syft this notebook is written against; it is also passed to
# sy.requires() in the next cell.
SYFT_VERSION = ">=0.8.2.b0,<0.9"
# Double-quoted pip requirement string, interpolated into the install command below.
package_string = f'"syft{SYFT_VERSION}"'
# Uncomment to install a matching syft build into the kernel's environment.
# %pip install {package_string} -f https://whls.blob.core.windows.net/unstable/index.html -q

In [None]:
import syft as sy
sy.requires(SYFT_VERSION)

In [None]:
node = sy.orchestra.launch(name="pandas-test-domain-7", port=9087, reset=True)

# Data owner: Upload data

In [None]:
root_domain_client = node.login(email="info@openmined.org", password="changethis")

## Load data

In [None]:
# The usual preamble
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from syft.service.project.project import Project
from syft.util.util import autocache, PANDAS_DATA

# Plot styling: ggplot theme, with wide figures and a sans-serif font
plt.style.use('ggplot')
plt.rcParams.update({
    'figure.figsize': (15, 5),
    'font.family': 'sans-serif',
})

# Let pandas print wide frames: lines up to 5000 characters, up to 60 columns
pd.set_option('display.max_columns', 60)
pd.set_option('display.width', 5000)

One of the main problems with messy data is: how do you know if it's messy or not?

We're going to use the NYC 311 service request dataset again here, since it's big and a bit unwieldy.

## Create mock data

In [None]:
# Load the NYC 311 service-requests CSV; the path/URL is passed through syft's
# autocache helper first. dtype='unicode' is given so pandas does not infer
# numeric dtypes — presumably every column is read as text; TODO confirm.
service_requests = pd.read_csv(autocache(f"{PANDAS_DATA}/311-service-requests.csv"), dtype='unicode')

In [None]:
assert len(service_requests) == 111069

In [None]:
service_requests.head(100)

In [None]:
# Flag the rows whose zip code contains a dash (ZIP+4 style values such as '10001-1234').
# na=False makes missing zip codes count as "no dash" and yields a plain boolean mask
# directly, instead of patching an object-dtype result afterwards with fillna().
rows_with_dashes = service_requests['Incident Zip'].str.contains('-', na=False)
service_requests[rows_with_dashes]

In [None]:
import random
def get_unique_key():
    """Draw a random integer in [0, 1000000] to use as a mock 'Unique Key'."""
    lowest, highest = 0, 1000000
    return random.randint(lowest, highest)
    
def get_mock_location():
    """Draw a random float coordinate between -90 and 90."""
    bound = 90
    return random.uniform(-bound, bound)

def get_zip_code():
    """Return a mock zip code as a string.

    A number is drawn from [10000, 11000]. Draws above 10990 get a '-1234'
    suffix so that the mock data also contains dashed, nine-digit style zip
    codes; all other draws are returned as plain five-digit strings.
    """
    # Named zip_number rather than `zip` so the builtin zip() is not shadowed.
    zip_number = random.randint(10000, 11000)
    if zip_number > 10990:
        return f"{zip_number}-1234"
    return str(zip_number)

def get_mock_row(i):
    """Build one mock record by calling every generator in ``mock_functions``.

    The row index ``i`` is accepted but not used; each value comes from calling
    the zero-argument generator registered under that column name.
    """
    return {column: make_value() for column, make_value in mock_functions.items()}
    

In [None]:
service_requests.columns

In [None]:
# make mock as close to the original data as possible!!
# TODO: Make it the same as the OG dataframe
# Maps a column name to a zero-argument callable that produces one mock value for
# that column; get_mock_row() calls each of them once per generated row.
# NOTE(review): 'Longitude' is drawn from the same [-90, 90] range as 'Latitude'.
mock_functions = {
    "Unique Key": lambda: get_unique_key(),
    'Longitude': lambda: random.uniform(-90, 90),
    'Latitude': lambda: random.uniform(-90, 90),
    'Incident Zip': lambda: get_zip_code(),
    'City': lambda: random.choice(["BROOKLYN", "NEW YORK", "BRONX"])
}

In [None]:
# Generate one mock row per real row. Only the keys returned by get_mock_row() carry
# values; passing columns=service_requests.columns gives the mock the same column
# layout as the real frame, and pandas fills the columns no row supplies with NaN.
mock = pd.DataFrame(data=[get_mock_row(i) for i in range(len(service_requests))],
                    columns=service_requests.columns)
    


In [None]:
mock

Upload the data

In [None]:
# Bundle the real frame and its mock counterpart into a single-asset dataset and
# upload it with the data owner's client. mock_is_real=False is passed because the
# mock was generated randomly above rather than copied from the real data.
dataset = sy.Dataset(
    name="test",
    asset_list=[
        sy.Asset(
            name="service_requests",  # plain string: the f-prefix had no placeholder
            data=service_requests,
            mock=mock,
            mock_is_real=False,
        )
    ],
)
root_domain_client.upload_dataset(dataset)


## Create user account

In [None]:
# Register an account for the data scientist, using the data owner's (root) client.
user = root_domain_client.register(name="Jane Doe", email="jane@caltech.edu",
                            password="abc123", institution="Caltech", website="https://www.caltech.edu/")
# todo: give user data scientist role
# Take the node's client and log in with the credentials registered above.
# NOTE(review): both guest_domain_client and guest_client are used later in this
# notebook — confirm whether login() also authenticates guest_domain_client in place.
guest_domain_client = node.client
guest_client = guest_domain_client.login(email="jane@caltech.edu", password="abc123")

# Data scientist: create syft_function

In [None]:
import numpy as np
import pandas as pd

## Summary
By the end of this chapter, we're going to have cleaned up the messy `Incident Zip` column of the NYC 311 service request data.

We'll do this by exploring and fixing the problems on the mock data first, and then combining all the fixes into a single function that we request to run on the real data.

## Get mocks

In [None]:
ds = guest_domain_client.datasets[0]

In [None]:
asset = ds.assets[0]

In [None]:
requests = asset.mock

## How do we know if it's messy?
We're going to look at a few columns here. I know already that there are some problems with the zip code, so let's look at that first.

To get a sense for whether a column has problems, I usually use `.unique()` to look at all its values. If it's a numeric column, I'll instead plot a histogram to get a sense of the distribution.

When we look at the unique values in "Incident Zip", it quickly becomes clear that this is a mess.

Some of the problems:

- Some have been parsed as strings, and some as floats
- There are `nans`
- Some of the zip codes are `29616-0759` or `83`
- There are some N/A values that pandas didn't recognize, like 'N/A' and 'NO CLUE'

What we can do:

- Normalize 'N/A' and 'NO CLUE' into regular nan values
- Look at what's up with the 83, and decide what to do
- Make everything strings

In [None]:
requests['Incident Zip'].unique()

## Fixing the nan values and string/float confusion
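When loading the file ourselves, we could pass a `na_values` option to `pd.read_csv` to clean this up a little bit, and also specify that the type of `Incident Zip` is a string, not a float. Here the data is already loaded, so instead we replace those placeholder values with `nan` directly.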



In [None]:
# Placeholder strings that should be treated as missing values.
na_values = ['NO CLUE', 'N/A', '0']
# DataFrame.replace returns a new frame, so the result has to be assigned back;
# otherwise the placeholders silently stay in the data. np.nan is used because the
# np.NaN alias no longer exists in NumPy 2.0.
requests = requests.replace(na_values, np.nan)

In [None]:
requests['Incident Zip'].unique()

## What's up with the dashes?

In [None]:
# Flag the rows whose zip code contains a dash. na=False makes missing zip codes
# count as "no dash" and yields a plain boolean mask directly, instead of patching
# an object-dtype result afterwards with fillna().
rows_with_dashes = requests['Incident Zip'].str.contains('-', na=False)
len(requests[rows_with_dashes])

In [None]:
requests[rows_with_dashes]

I thought these were missing data and originally deleted them like this:

`requests['Incident Zip'][rows_with_dashes] = np.nan`

But then my friend Dave pointed out that 9-digit zip codes are normal. Let's look at all the zip codes with more than 5 digits, make sure they're okay, and then truncate them.

In [None]:
# Mask of the zip codes that are longer than the usual five characters
long_zip_codes = requests['Incident Zip'].str.len().gt(5)
# Distinct values among those long zip codes, selected in one .loc step
requests.loc[long_zip_codes, 'Incident Zip'].unique()

Those all look okay to truncate to me.

In [None]:
# Keep only the first five characters of every zip code
requests['Incident Zip'] = requests['Incident Zip'].str[:5]

Done.

Earlier I thought 00083 was a broken zip code, but it turns out Central Park's zip code is 00083! Shows what I know. I'm still concerned about the 00000 zip codes, though: let's look at that.

In [None]:
requests[requests['Incident Zip'] == '00000'] 

This looks bad to me. Let's set these to nan.

In [None]:
# Boolean mask of the rows whose zip code is the '00000' placeholder
zero_zips = requests['Incident Zip'] == '00000'
# Assign through .loc with (row mask, column) so the write lands in `requests` itself
requests.loc[zero_zips, 'Incident Zip'] = np.nan

Great. Let's see where we are now:

In [None]:
# Missing zip codes are dropped before sorting: NaN is a float and cannot be ordered
# against the string zip codes, so sorting the raw unique values can raise TypeError.
unique_zips = requests['Incident Zip'].dropna().unique()
unique_zips.sort()
unique_zips

Amazing! This is much cleaner. There's something a bit weird here, though -- I looked up 77056 on Google maps, and that's in Texas.

Let's take a closer look:

In [None]:
zips = requests['Incident Zip']
# Treat zip codes starting with '0' or '1' as "close" for now. This is only a rough
# heuristic: for example 13221 starts with '1' but is in Syracuse.
is_close = zips.str.startswith('0') | zips.str.startswith('1')
# Missing zip codes are not of interest here, so they are explicitly excluded from is_far
is_far = ~(is_close) & zips.notnull()

In [None]:
zips[is_far]

Okay, there really are requests coming from LA and Houston! Good to know. Filtering by zip code is probably a bad way to handle this -- we should really be looking at the city instead.



In [None]:
requests['City'].str.upper().value_counts()

It looks like these are legitimate complaints, so we'll just leave them alone.

## Putting it together

Now we want to request the full code execution.

Let's put all that together, to prove how easy it is. 6 lines of magical pandas!

If you want to play around, try adding other cleaning steps to `fix_zip_codes`, or returning a different summary of the column.

In [None]:
@sy.syft_function(input_policy=sy.ExactMatch(df=ds.assets[0]),
                  output_policy=sy.SingleExecutionExactOutput())
def zip_codes(df):
    """Return the distinct cleaned 'Incident Zip' values of ``df`` as a list.

    Cleaning steps, mirroring the exploration done on the mock data:
      1. placeholder strings ('NO CLUE', 'N/A', '0') become NaN,
      2. every zip code is truncated to its first five characters,
      3. '00000' zip codes become NaN.

    The input frame is left untouched; only the extracted column is transformed.
    """
    # Imported inside the body, as in the original cell, so the function carries
    # its own dependency.
    import numpy as np

    na_values = ['NO CLUE', 'N/A', '0']

    def fix_zip_codes(zips):
        """Apply the three cleaning steps to a Series of zip codes; returns a new Series."""
        # Normalise the placeholder strings first (previously defined but never applied)
        zips = zips.replace(na_values, np.nan)

        # Truncate everything to length 5
        zips = zips.str.slice(0, 5)

        # Set 00000 zip codes to nan
        return zips.mask(zips == '00000', np.nan)

    cleaned = fix_zip_codes(df['Incident Zip'])
    # todo, we are adding list(result) here to fix serialization errors
    return list(cleaned.unique())

Create and submit project

In [None]:
new_project = sy.Project(
    name="Pandas Chapter 7",
    description="Hi, I would like to get some insights about the zip codes of the complaints",
    members=[guest_domain_client],
)
new_project

In [None]:
# Start the project and check that a Project instance came back before using it
project = new_project.start()
assert isinstance(project, Project)
project

In [None]:
project.create_code_request(zip_codes, guest_domain_client)

In [None]:
assert len(guest_domain_client.code.get_all())==1

In [None]:
assert len(project.events) == 1

In [None]:
assert isinstance(project.events[0], sy.service.project.project.ProjectRequest)

# Data owner: execute syft_function

In [None]:
from syft import NotificationStatus

In [None]:
domain_client = node.login(email="info@openmined.org", password="changethis")

# Get notifications

In [None]:
notifications = domain_client.notifications.get_all_unread()

In [None]:
notifications

In [None]:
# Pick the first unread notification whose linked object type is Project (or a
# subclass of it). The trailing [0] raises IndexError if there is no such notification.
project_notification = [x for x in notifications if issubclass(x.linked_obj.object_type, Project)][0]

In [None]:
# Walk from the notification to the request attached to the project's first event.
request = project_notification.link.events[0].request
# The object linked from the request's first change; it is inspected and its
# unsafe_function is taken in the following cells.
func = request.changes[0].link
# NOTE(review): `op` is not used again in the visible part of this notebook.
op = func.output_policy_type

In [None]:
func

In [None]:
# NOTE(review): this rebinds the notebook-level name `zip_codes`, which until now
# referred to the @sy.syft_function object defined by the data scientist, to
# func.unsafe_function. Re-running the earlier create_code_request cell after this
# one would pass this object instead.
zip_codes = func.unsafe_function

In [None]:
real_data = domain_client.datasets[0].assets[0].data

In [None]:
real_data

In [None]:
real_result = zip_codes(df=real_data)

In [None]:
real_result

In [None]:
# Approve the request by depositing the result computed on the real data, and stop
# here (showing the returned object) if the deposit did not return a SyftSuccess.
result = request.accept_by_depositing_result(real_result)
assert isinstance(result, sy.SyftSuccess), result

In [None]:
result

# Data scientist: fetch result

In [None]:
asset = guest_client.datasets[0].assets[0]

In [None]:
guest_client.code[0].status

In [None]:
# Call the submitted function through the data scientist's client, passing the asset
# as `df`, then call .get() on the returned object to obtain the value.
# NOTE(review): `real_result` reuses the name assigned in the data owner's cells above.
result_ptr = guest_client.code.zip_codes(df=asset)
real_result = result_ptr.get()

In [None]:
real_result

In [None]:
node.land()