# Selecting data & finding the most common complaint type

## Install

In [None]:
# Version range of syft that this notebook is written against.
SYFT_VERSION = ">=0.8.2.b0,<0.9"
# The same range as a double-quoted requirement string for the pip line below.
package_string = '"syft' + SYFT_VERSION + '"'
# %pip install {package_string} -q

In [None]:
# syft absolute
import syft as sy

# Hand the required version range to syft's own requirement check.
sy.requires(SYFT_VERSION)

In [None]:
# Launch the node used throughout this notebook, named "pandas-test-domain-2"
# on port 9082.
# NOTE(review): reset=True presumably discards any earlier state — confirm.
node = sy.orchestra.launch(name="pandas-test-domain-2", port=9082, reset=True)

# Data Owner: Upload data

In [None]:
# Log in to the node with the data owner's credentials.
domain_client = node.login(email="info@openmined.org", password="changethis")

In [None]:
# stdlib
import random
from random import randint

# third party
import matplotlib.pyplot as plt
import pandas as pd

# syft absolute
from syft.serde.mock import CachedFaker
from syft.service.project.project import Project
from syft.util.util import PANDAS_DATA

%matplotlib inline

In [None]:
# Use the ggplot look for every figure and make figures 15 x 5 by default.
plt.style.use("ggplot")
plt.rc("figure", figsize=(15, 5))

# Raise pandas' display limits so that wide frames show many columns.
for option_name, option_value in (
    ("display.width", 5000),
    ("display.max_columns", 60),
):
    pd.set_option(option_name, option_value)

## Load data

We're going to use a new dataset here, to demonstrate how to deal with larger datasets. This is a subset of the 311 service requests from NYC Open Data.

In [None]:
# The file has columns with mixed types, so every column is read as text
# (dtype="unicode") to keep pandas from tripping over them.
csv_source = sy.autocache(f"{PANDAS_DATA}/311-service-requests.csv")
complaints = pd.read_csv(csv_source, dtype="unicode")

If you load this file without specifying `dtype`, then depending on your pandas version you might see a warning like "DtypeWarning: Columns (8) have mixed types". This means that pandas has encountered a problem reading in our data. In this case it almost certainly means that the file has columns where some of the entries are strings and some are integers.

For now we sidestep the problem by reading every column as a string and hope we don't run into trouble, but in the long run we'd need to investigate this warning.

In [None]:
# Display the dataframe that was just loaded.
complaints

## Create Mock data

Let's create the mock data for the complaint dataset.

In [None]:
# Single faker instance shared by the mock-value generators in this notebook.
fake = CachedFaker()

In [None]:
# Mock-value generators keyed by exact column name.  Each callable takes one
# argument, which it ignores, and returns a single fake value for the column.
fake_functions = {
    "Unique Key": lambda _: randint(1, 1000000),
    "Location": lambda _: (fake.coordinate(), fake.coordinate()),
    "Agency": lambda _: random.choice(["NYPD", "DOHMH", "DPR"]),
    "X Coordinate (State Plane)": lambda _: randint(1, 1000000),
    "Y Coordinate (State Plane)": lambda _: randint(1, 1000000),
    # Plain category labels.  The last one used to contain a stray pair of
    # embedded quote characters, so it never equalled "Animal in a Park".
    "Complaint Type": lambda _: random.choice(
        ["Illegal Parking", "Noise - Street/Sidewalk", "Animal in a Park"]
    ),
    "Descriptor": lambda _: random.choice(
        [
            "Branch or Limb Has Fallen Down",
            "Branches Damaged",
            "Broken Fence",
            "Broken Glass",
        ]
    ),
    "School Number": lambda _: random.choice(
        [
            "B073",
            "B077",
            "B079",
            "B080-01",
            "B087",
            "B099",
            "B100",
            "B102",
            "B109",
            "B111",
        ]
    ),
    "Bridge Highway Segment": lambda _: random.choice(
        [
            "Grand Central Pkwy (Exit 1 E-W)",
            "Grand Central Pkwy (Exit 10) - 69th Rd-Jewel Ave (Exit 11)",
            "GrandCentral Pkwy/VanWyck Expwy/College Point Blvd (Exit 22 A-E)",
            "Hamilton Ave (Exit 2A) - Gowanus Expwy (I-278) (Exit 1)",
            "Harding Ave (Exit 9) - Throgs Neck Br",
        ]
    ),
}

In [None]:
# Mock-value generators keyed by a trigger word.  Each callable takes one
# argument, which it ignores, and returns the result of calling the named
# method on `fake`; `fake` is looked up when the callable runs.
fake_triggers = {
    trigger: (lambda _, method_name=method_name: getattr(fake, method_name)())
    for trigger, method_name in (
        ("Street", "street_name"),
        ("Date", "date_time"),
        ("Long", "coordinate"),
        ("Lat", "coordinate"),
        ("Address", "address"),
        ("Name", "name"),
        ("City", "city"),
        ("Zip", "zipcode"),
    )
}

In [None]:
# Build one list of mock values per column of `complaints`, each as long as
# the dataframe itself.
mock_data = {}
n_rows = len(complaints)
for col in complaints.columns:
    if col in fake_functions:
        # Exact column-name match: use the hand-written generator.
        mock_func = fake_functions[col]
    else:
        distinct_values = list(set(complaints[col]))
        if len(distinct_values) < 100:
            # Few distinct values: sample from the values the column holds.
            # `values` is bound as a default so the lambda keeps this
            # column's list rather than whatever the name points to later.
            mock_func = lambda x, values=distinct_values: random.choice(values)  # noqa: E731
        else:
            # Start from an explicit "no value" generator.  Without it, a
            # column that matches no trigger would silently reuse the
            # previous column's generator (or raise NameError if it is the
            # first column).
            mock_func = lambda x: None  # noqa: E731
            # No early exit: when several triggers occur in the column name,
            # the last one in `fake_triggers` wins, as before.
            for trigger, trigger_func in fake_triggers.items():
                if trigger in col:
                    mock_func = trigger_func
    mock_data[col] = [mock_func(None) for _ in range(n_rows)]

In [None]:
# Turn the per-column lists of mock values into a dataframe.
mock = pd.DataFrame(data=mock_data)

In [None]:
# Show the first rows of the mock dataframe.
mock.head()

In [None]:
# One asset holds both the real dataframe and its mock counterpart; it is
# flagged with mock_is_real=False.
complaints_asset = sy.Asset(
    name="complaints", data=complaints, mock=mock, mock_is_real=False
)
# Wrap the asset in a dataset named "test" and upload it via the owner client.
dataset = sy.Dataset(name="test", asset_list=[complaints_asset])
domain_client.upload_dataset(dataset)

## Create data scientist

In [None]:
# Credentials for the data scientist account, named once so that the
# registration and the login below cannot drift apart.
scientist_email = "jane@caltech.edu"
scientist_password = "abc123"

user = domain_client.register(
    name="Jane Doe",
    email=scientist_email,
    password=scientist_password,
    password_verify=scientist_password,
    institution="Caltech",
    website="https://www.caltech.edu/",
)

# todo: give user data scientist role

# Log the new account in through the node's client.
guest_domain_client = node.client
guest_client = guest_domain_client.login(
    email=scientist_email, password=scientist_password
)

# Data scientist: create syft_function

## Download mock and submit project

### Get mock

In [None]:
# Take the client exposed by the node.
guest_domain_client = node.client

In [None]:
# guest_domain_client = worker.guest_client
# Log in with the data scientist's email and password.
guest_client = guest_domain_client.login(email="jane@caltech.edu", password="abc123")

In [None]:
# Take the first dataset listed by the client.
ds = guest_domain_client.datasets[0]

In [None]:
# Look up the "complaints" asset by name.
asset = ds.assets["complaints"]

In [None]:
# From here on `complaints` is rebound to the asset's mock data.
complaints = asset.mock

### What's even in it? (the summary)


When you print a large dataframe, it will only show you the first few rows.

If you don't see this, don't panic! The default behavior for large dataframes changed between pandas 0.12 and 0.13. Previous to 0.13 it would show you a summary of the dataframe. This includes all the columns, and how many non-null values there are in each column.

In [None]:
# Display the (mock) dataframe.
complaints

### Selecting columns and rows

To select a column, we index with the name of the column, like this:

In [None]:
# Select a single column by name.
complaints["Complaint Type"]

To get the first 5 rows of a dataframe, we can use a slice: df[:5].

This is a great way to get a sense for what kind of information is in the dataframe -- take a minute to look at the contents and get a feel for this dataset.



In [None]:
# Slice off the first five rows.
complaints[:5]

We can combine these to get the first 5 rows of a column:

In [None]:
# Column first, then the first five entries of it.
complaints["Complaint Type"][:5]

and it doesn't matter which direction we do it in:

In [None]:
# First five rows first, then the column.
complaints[:5]["Complaint Type"]

### Selecting multiple columns

What if we just want to know the complaint type and the borough, but not the rest of the information? Pandas makes it really easy to select a subset of the columns: just index with a list of the columns you want.

In [None]:
# Index with a list of names to keep just those columns.
complaints[["Complaint Type", "Borough"]]

That showed us a summary. To look at just the first 10 rows, we can add a slice:



In [None]:
# The same two columns, limited to the first ten rows.
complaints[["Complaint Type", "Borough"]][:10]

### What's the most common complaint type?

This is a really easy question to answer! There's a .value_counts() method that we can use:

In [None]:
# Count the occurrences of each complaint type.
complaints["Complaint Type"].value_counts()

If we just wanted the top 10 most common complaints, we can do this:

In [None]:
# Keep the counts in a variable so they can be reused for the plot, then show
# the first ten entries.
complaint_counts = complaints["Complaint Type"].value_counts()
complaint_counts[:10]

In [None]:
# Draw the first ten counts as a bar chart.
complaint_counts[:10].plot(kind="bar")

## Request real result

Now that we have finished our analysis on the mock data, we can request that the same code be executed on the real data.

In [None]:
@sy.syft_function(
    input_policy=sy.ExactMatch(df=asset.pointer),
    output_policy=sy.SingleExecutionExactOutput(),
)
def get_counts(df):
    """Return the first ten entries of the "Complaint Type" value counts of `df`."""
    return df["Complaint Type"].value_counts()[:10]

Create and submit project

In [None]:
# Text shown as the project's description.
project_description = (
    "Hi, I would like to plot the histogram of the complaint types."
)
# Create the project with the data scientist's client as its only member.
new_project = sy.Project(
    name="Pandas chapter 2",
    description=project_description,
    members=[guest_client],
)
new_project

In [None]:
# Send the project and check that a Project object came back.
project = new_project.send()
assert isinstance(project, Project)
project

In [None]:
# Attach a code request for get_counts to the project, using the data
# scientist's client.
project.create_code_request(get_counts, guest_client)

In [None]:
# Exactly one piece of code should be listed for the data scientist.
assert len(guest_client.code.get_all()) == 1

In [None]:
# The project should hold exactly one event.
assert len(project.events) == 1

In [None]:
# ...and that event should be a ProjectRequest.
assert isinstance(project.events[0], sy.service.project.project.ProjectRequest)

# Data owner: execute function

In [None]:
# Log in again with the data owner's credentials.
domain_client = node.client.login(email="info@openmined.org", password="changethis")

# Get notifications

In [None]:
# Fetch the data owner's unread notifications.
notifications = domain_client.notifications.get_all_unread()

In [None]:
# Display the unread notifications.
notifications

In [None]:
# Keep the notifications whose linked object type is a Project (sub)class,
# then take the first of them.
project_notifications = [
    note for note in notifications if issubclass(note.linked_obj.object_type, Project)
]
project_notification = project_notifications[0]

In [None]:
# Follow the notification's link to the project, take the request of its first
# event, and pull out the code attached to that request.
request = project_notification.link.events[0].request
func = request.code

In [None]:
# Display the submitted code for review.
func

In [None]:
# NOTE(review): `unsafe_function` presumably exposes the submitted code as a
# plain callable for the data owner to run — confirm.
get_counts_user_func = func.unsafe_function

In [None]:
# Take the `data` (not the mock) of the first asset of the first dataset.
real_data = domain_client.datasets[0].assets[0].data

In [None]:
# Run the submitted function on the real data.
real_result = get_counts_user_func(df=real_data)

In [None]:
# Peek at the first three entries of the result.
real_result[:3]

In [None]:
# Accept the request by depositing the computed result, and check that the
# call reported success.
result = request.accept_by_depositing_result(real_result)
assert isinstance(result, sy.SyftSuccess)

# Data scientist: fetch result

In [None]:
# As the data scientist, take the first asset of the first dataset.
asset = guest_client.datasets[0].assets[0]

In [None]:
# Show the status of the first piece of submitted code.
guest_client.code[0].status

In [None]:
# Call get_counts through the client with the asset as input, fetch the value
# behind the returned pointer, and plot it as a bar chart.
result_ptr = guest_client.code.get_counts(df=asset)
real_result = result_ptr.get()
real_result.plot(kind="bar")

In [None]:
# NOTE(review): presumably shuts down the node launched at the top — confirm.
node.land()