# Reading from a CSV

## Install

In [None]:
# Version range of syft this notebook is written against.
SYFT_VERSION = ">=0.9,<1.0.0"
# Quoted requirement string (i.e. "syft>=0.9,<1.0.0") for the pip command below.
package_string = f'"syft{SYFT_VERSION}"'
# Uncomment the next line to install a matching syft release from the notebook.
# %pip install {package_string} -q

In [None]:
# syft absolute
import syft as sy

sy.requires(SYFT_VERSION)

In [None]:
server = sy.orchestra.launch(name="pandas-test-datasite-1", port=7081, reset=True)

# Data owner: Upload data

In [None]:
root_datasite_client = server.login(email="info@openmined.org", password="changethis")

## Load data

In [None]:
# stdlib
from datetime import timedelta

# third party
from dateutil.parser import parse
import pandas as pd
from pandas._libs.tslibs.timestamps import Timestamp

# syft absolute
from syft import autocache
from syft.util.util import PANDAS_DATA

In [None]:
data_path = autocache(f"{PANDAS_DATA}/bikes.csv")

In [None]:
broken_df = pd.read_csv(data_path, encoding="ISO-8859-1")

In [None]:
broken_df

In [None]:
assert len(broken_df) == 310

In [None]:
# Re-read the same file with explicit options: ";" as the field separator,
# latin1 decoding, and the "Date" column parsed day-first and used as the index.
fixed_df = pd.read_csv(
    data_path,
    sep=";",
    encoding="latin1",
    parse_dates=["Date"],
    dayfirst=True,
    index_col="Date",
)
# Show the first three rows as the cell output.
fixed_df[:3]

In [None]:
assert len(fixed_df) == 310

In [None]:
# stdlib
from random import randint

# third party
import numpy as np


def get_mock_int(i, var=10):
    """Return a random integer scattered around a cosine-shaped baseline.

    The baseline is ``100 + int(cos(i / 10) * 50)``. The result is drawn with
    ``randint`` (both ends inclusive) from ``baseline - var`` to
    ``baseline + var``, with each bound raised to at least 1.
    """
    swing = int(np.cos(i / 10) * 50)
    center = 100 + swing
    lower = max(center - var, 1)
    upper = max(1, center + var)
    return randint(lower, upper)


def get_mock_date(i):
    """Return the ISO-formatted date (``YYYY-MM-DD``) ``i`` days after 2005-06-01.

    Args:
        i: Day offset from the start date; may be negative.

    Returns:
        The resulting date as a string.
    """
    # stdlib
    from datetime import date  # local import keeps this cell self-contained

    # The start date is a constant, so build it directly instead of parsing
    # the string "Jun 1 2005" on every call.
    return str(date(2005, 6, 1) + timedelta(days=i))


# some randomness: seven pairs of integers, each drawn from 1..40.
# get_mock_row passes pair j to get_mock_int as (i + r[j][0], r[j][1]),
# i.e. as a per-column offset and spread.
r = [(randint(1, 40), randint(1, 40)) for _ in range(7)]


def get_mock_row(i):
    """Build one mock row: a random count per pair in ``r`` plus two ``None`` gaps.

    Args:
        i: Row number; shifted by each pair's offset before being passed to
            ``get_mock_int``.

    Returns:
        A list with one mock integer per ``(offset, var)`` pair in ``r``, with
        ``None`` inserted as the second element and appended as the last one
        (nine elements for the seven pairs defined in this notebook).
    """
    row = [get_mock_int(i + offset, var) for offset, var in r]
    # Leave the second and the last column empty (NaN in the DataFrame).
    # The last one is appended explicitly rather than inserted at an index
    # past the end of the list.
    row.insert(1, None)
    row.append(None)
    return row

In [None]:
# Build a mock frame with the same number of rows and the same columns as
# fixed_df: consecutive mock dates as the index, one generated row per date.
n_rows = len(fixed_df)
mock_index = [Timestamp(get_mock_date(day)) for day in range(n_rows)]
mock_rows = [get_mock_row(day) for day in range(n_rows)]
mock = pd.DataFrame(data=mock_rows, index=mock_index, columns=fixed_df.columns)

Upload the data

In [None]:
# Package the real frame and its mock as a single "bikes" asset in a dataset
# named "test"; mock_is_real=False marks the mock as not being the real data.
dataset = sy.Dataset(
    name="test",
    asset_list=[sy.Asset(name="bikes", data=fixed_df, mock=mock, mock_is_real=False)],
)
# Upload the dataset through the data owner's client.
root_datasite_client.upload_dataset(dataset)

## Create user account

In [None]:
user = root_datasite_client.register(
    name="Jane Doe",
    email="jane@caltech.edu",
    password="abc123",
    password_verify="abc123",
    institution="Caltech",
    website="https://www.caltech.edu/",
)

In [None]:
# todo: give user data scientist role

In [None]:
guest_datasite_client = server.client

In [None]:
guest_client = guest_datasite_client.login(email="jane@caltech.edu", password="abc123")

# Data scientist: request execution

## Download mock and submit a syft_function

### Get mock

In [None]:
guest_datasite_client = server.client
guest_client = guest_datasite_client.login(email="jane@caltech.edu", password="abc123")

In [None]:
ds = guest_datasite_client.datasets[0]

In [None]:
asset = ds.assets["bikes"]

In [None]:
df = asset.mock

### Selecting a column

When you read a CSV, you get a kind of object called a DataFrame, which is made up of rows and columns. You get columns out of a DataFrame the same way you get elements out of a dictionary.

Here's an example:

In [None]:
df["Berri 1"]

### Plotting a column

Just add .plot() to the end! How could it be easier? =)

We can see that, unsurprisingly, not many people are biking in January, February, and March.

In [None]:
df["Berri 1"].plot()

We can also plot all the columns just as easily. We'll make it a little bigger, too. You can see that it's more squished together, but all the bike paths behave basically the same -- if it's a bad day for cyclists, it's a bad day everywhere.

In [None]:
df.plot(figsize=(15, 10))

### Putting that all together

Here's the code we needed to write to draw that graph, all together:



In [None]:
res = df["Berri 1"].plot()

In [None]:
@sy.syft_function(
    input_policy=sy.ExactMatch(df=asset),
    output_policy=sy.SingleExecutionExactOutput(),
)
def get_column(df):
    """Return the "Berri 1" column of ``df``."""
    berri_counts = df["Berri 1"]
    return berri_counts

Create and submit project

In [None]:
new_project = sy.Project(
    name="My pandas project 1",
    description="Hi, I would like to plot the Berri 1 column.",
    members=[guest_client],
)
new_project

In [None]:
project = new_project.send()
assert isinstance(project, sy.service.project.project.Project)
project

In [None]:
project.create_code_request(get_column, guest_client)

In [None]:
assert len(guest_client.code.get_all()) == 1

In [None]:
assert len(project.events) == 1

In [None]:
assert isinstance(project.events[0], sy.service.project.project.ProjectRequest)

# Data owner: approve request

## Get notifications

In [None]:
datasite_client = server.client.login(email="info@openmined.org", password="changethis")

In [None]:
notifications = datasite_client.notifications.get_all_unread()

In [None]:
notifications

In [None]:
# syft absolute
from syft.service.project.project import Project

In [None]:
# Pick the first unread notification whose linked object type is a Project
# (raises IndexError if there is none), then display it as the cell output.
project_notification = [
    x for x in notifications if issubclass(x.linked_obj.object_type, Project)
][0]
project_notification

### Review and approve request

In [None]:
request = project_notification.link.events[0].request

In [None]:
func = request.code

In [None]:
func

In [None]:
get_col_user_function = func.run

In [None]:
real_data = datasite_client.datasets[0].assets[0].data

In [None]:
real_result = get_col_user_function(df=real_data)

In [None]:
real_result[:3]

In [None]:
result = request.approve()
assert isinstance(result, sy.SyftSuccess)

# Data scientist: compute result

In [None]:
asset = guest_client.datasets[0].assets[0]

In [None]:
guest_client.code[0].status

In [None]:
result_ptr = guest_client.code.get_column(df=asset)

In [None]:
real_result = result_ptr.get()
real_result.plot()