# Reading from a CSV

## Install

In [None]:
# Version specifier for the syft releases this notebook is written against.
SYFT_VERSION = ">=0.8.1b0,<0.9"
# Wrap the requirement in double quotes so the shell started by `!pip` does not
# treat the `>` and `<` of the specifier as redirections.
package_string = f'"syft{SYFT_VERSION}"'
# `-f` (pip's --find-links) adds the given index page as an extra place to look for wheels.
!pip install {package_string} -f https://whls.blob.core.windows.net/unstable/index.html

In [None]:
import syft as sy
# Hand the version specifier to syft's own requirement check.
# NOTE(review): what happens on a mismatch (warning vs. error) is defined by
# `sy.requires` — confirm against the syft docs.
sy.requires(SYFT_VERSION)

In [None]:
# Start the node used throughout this notebook, named "pandas-test-domain-1".
# NOTE(review): port="auto" presumably picks a free port and reset=True
# presumably discards state left over from an earlier run — confirm.
node = sy.orchestra.launch(name="pandas-test-domain-1", port="auto", reset=True)

# Data owner: Upload data

In [None]:
# Data owner session: this client uploads the dataset and registers the user below.
root_domain_client = node.login(email="info@openmined.org", password="changethis")

## Load data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import timedelta
from dateutil.parser import parse
from pandas._libs.tslibs.timestamps import Timestamp
from syft import autocache
from syft.util.util import PANDAS_DATA

In [None]:
# Location of bikes.csv under syft's PANDAS_DATA prefix.
# NOTE(review): `autocache` presumably downloads the file once and returns a
# local path — confirm.
data_path = autocache(f"{PANDAS_DATA}/bikes.csv")

In [None]:
# Naive read with pandas' default separator. The file is ';'-separated (see the
# sep=';' read further down), so this frame is not split into proper columns —
# hence the name.
broken_df = pd.read_csv(data_path, encoding = "ISO-8859-1")

In [None]:
# Display the frame produced by the naive read.
broken_df

In [None]:
# Read the same file again with the options it needs:
#   sep=';'              - fields are separated by semicolons
#   encoding='latin1'    - decode the file as Latin-1
#   parse_dates=['Date'] - convert the 'Date' column to datetimes
#   dayfirst=True        - interpret ambiguous dates as day/month
#   index_col='Date'     - use the parsed dates as the row index
fixed_df = pd.read_csv(data_path, sep=';', encoding='latin1', parse_dates=['Date'], dayfirst=True, index_col='Date')
# Show the first three rows.
fixed_df[:3]

In [None]:
import numpy as np
from random import randint

def get_mock_int(i, var=10):
    """Return a random positive integer that follows a slow cosine wave.

    The centre of the wave at position ``i`` is ``100 + int(cos(i / 10) * 50)``.
    The result is drawn uniformly from ``centre - var`` to ``centre + var``
    (both inclusive), with each bound clamped to be at least 1.
    """
    centre = int(np.cos(i / 10) * 50) + 100
    lower = max(1, centre - var)
    upper = max(1, centre + var)
    return randint(lower, upper)

def get_mock_date(i):
    """Return the ISO date string (``YYYY-MM-DD``) ``i`` days after 2005-06-01.

    The start date is built directly as a ``date`` rather than by parsing the
    constant text 'Jun 1 2005' on every call; the returned strings are the same.
    """
    # Local import: only `timedelta` is imported from datetime at file level.
    from datetime import date

    return str(date(2005, 6, 1) + timedelta(days=i))

# Per-column randomness: one (index offset, variance) pair for each of the 7
# columns that carry values. Drawn once so that every row uses the same pairs.
r = [(randint(1, 40), randint(1, 40)) for _ in range(7)]

def get_mock_row(i):
    """Build one mock row for position ``i``.

    Returns a list of 9 entries: 7 values from ``get_mock_int`` (one per pair
    in ``r``) with ``None`` placed at index 1 and at the end, i.e. in the 2nd
    and the 9th (last) column.
    """
    row = [get_mock_int(i + offset, var) for offset, var in r]
    # Leave the 2nd and the 9th (last) columns empty. append() puts the trailing
    # None in place explicitly instead of relying on insert() clamping an
    # out-of-range index to the end of the list.
    row.insert(1, None)
    row.append(None)
    return row
    

In [None]:
# Synthetic stand-in for fixed_df: same number of rows and the same columns,
# with generated dates as the index and generated values in every row.
mock = pd.DataFrame(
    data=[get_mock_row(day) for day in range(len(fixed_df))],
    index=[Timestamp(get_mock_date(day)) for day in range(len(fixed_df))],
    columns=fixed_df.columns,
)

Upload the data

In [None]:
# Bundle the real frame and its synthetic mock into a single asset named
# "bikes"; mock_is_real=False marks the mock as not being the real data.
dataset = sy.Dataset(name="test",
                     asset_list=[sy.Asset(name="bikes", data=fixed_df, mock=mock, mock_is_real=False)])
# Upload through the data owner's client.
root_domain_client.upload_dataset(dataset)


## Create user account

In [None]:
# Register the account that the data scientist logs in with further down.
user = root_domain_client.register(name="Jane Doe", email="jane@caltech.edu",
                            password="abc123", institution="Caltech", website="https://www.caltech.edu/")

In [None]:
# todo: give user data scientist role

In [None]:
# Get a client for the node; it is logged in as the new user in the next cell.
guest_domain_client = node.client

In [None]:
# Log in with the account registered for Jane Doe.
guest_client = guest_domain_client.login(email="jane@caltech.edu", password="abc123")

# Data scientist: request execution

## Download mock and submit a syft_function

### Get mock

In [None]:
# Repeats the client creation and login from the end of the data owner section
# (presumably so the data scientist part can be run on its own).
guest_domain_client = node.client
guest_client = guest_domain_client.login(email="jane@caltech.edu", password="abc123")

In [None]:
# First dataset visible through the client (this notebook uploads only "test").
# NOTE(review): this goes through `guest_domain_client`, while the final section
# uses the `guest_client` returned by login() — confirm both refer to the same
# logged-in session in the targeted syft version.
ds = guest_domain_client.datasets[0]

In [None]:
# Look the asset up by the name it was uploaded under.
asset = ds.assets["bikes"]

In [None]:
# Explore the asset's mock; everything in this section runs on it.
df = asset.mock

### Selecting a column

When you read a CSV, you get a kind of object called a DataFrame, which is made up of rows and columns. You get columns out of a DataFrame the same way you get elements out of a dictionary.

Here's an example:

In [None]:
# Select a single column by its label.
df['Berri 1']

### Plotting a column

Just add .plot() to the end! How could it be easier? =)

We can see that, unsurprisingly, not many people are biking in January, February, and March.

In [None]:
# Plot the selected column against the frame's index.
df['Berri 1'].plot()

We can also plot all the columns just as easily. We'll make it a little bigger, too. You can see that it's more squished together, but all the bike paths behave basically the same -- if it's a bad day for cyclists, it's a bad day everywhere.

In [None]:
# Plot every column on one figure; figsize is (width, height) in inches.
df.plot(figsize=(15, 10))

### Putting that all together

Here's the code we needed to write to draw that graph, all together:



In [None]:
# Column selection and plotting in one expression; the value returned by
# plot() is kept in `res`.
res = df['Berri 1'].plot()

In [None]:
# The function the data scientist asks to have run on the real data.
# NOTE(review): ExactMatch(df=asset.mock) presumably ties the `df` input to this
# asset, and SingleExecutionExactOutput presumably limits the code to one
# execution whose exact output is released — confirm against the syft docs.
@sy.syft_function(input_policy=sy.ExactMatch(df=asset.mock),
                  output_policy=sy.SingleExecutionExactOutput())
def get_column(df):
    """Return the 'Berri 1' column of ``df``."""
    return df['Berri 1']

Request code execution

In [None]:
# Submit get_column to the node's code service as a code-execution request.
req = guest_domain_client.api.services.code.request_code_execution(get_column)


In [None]:
# First entry of the client's code list — expected to be the get_column
# submission, the only one made in this notebook.
submitted_code = guest_domain_client.code[0]

In [None]:
# Sanity check: the code service returns something truthy (non-empty).
assert guest_domain_client.api.services.code.get_all()

Create and submit project

In [None]:
# Project that carries the code request, with a description for the data owner.
new_project = sy.Project(name="My pandas project 2",
                         description="Hi, I would like to plot the Berri 1 column.")

In [None]:
# Attach a request for EXECUTE permission on the submitted code.
new_project.add_request(obj=submitted_code, permission=sy.UserCodeStatus.EXECUTE)

In [None]:
# Send the project, including its request, to the node.
guest_domain_client.submit_project(new_project)

# Data owner: execute function

## Get messages

In [None]:
# Log in again with the data owner account to review the submission.
domain_client = node.client.login(email="info@openmined.org", password="changethis")

In [None]:
# Fetch the messages whose status is UNDELIVERED.
messages = domain_client.api.services.messages.get_all_for_status(sy.MessageStatus.UNDELIVERED)

In [None]:
# Display the fetched messages.
messages

In [None]:
from syft.service.project.project import Project

In [None]:
# Keep the first message whose linked object type is a Project. The [0] raises
# IndexError if no message matches.
project_message = [x for x in messages if issubclass(x.linked_obj.object_type, Project)][0]
project_message

In [None]:
# First request of the project linked from the message.
request = project_message.link.requests[0]

In [None]:
# Object linked from the request's first change; it is used below as the
# submitted function (see `func.unsafe_function`).
func = request.changes[0].link
# NOTE(review): `op` is not used anywhere later in this notebook.
op = func.output_policy_type

In [None]:
# Display the submitted code object for review.
func

In [None]:
# Callable form of the submitted code.
# NOTE(review): the `unsafe_` prefix suggests it runs without policy checks —
# inspect the code before calling it.
get_col_user_function = func.unsafe_function

In [None]:
# Real (non-mock) data of the first asset in the first dataset.
real_data = domain_client.datasets[0].assets[0].data

In [None]:
# Run the data scientist's function on the real data.
real_result = get_col_user_function(df=real_data)

In [None]:
# Peek at the first three entries of the result.
real_result[:3]

In [None]:
# Accept the request by depositing the result computed on the real data.
result = request.accept_by_depositing_result(real_result)
# Fail loudly, and show what came back, if the deposit did not succeed.
assert isinstance(result, sy.SyftSuccess), result
# Kept as the last expression of the cell: a notebook only renders the final
# expression, so a bare `result` before the assert would not be shown.
result

# Data scientist: fetch result

In [None]:
# Data scientist side again: fetch the first asset of the first dataset.
asset = guest_client.datasets[0].assets[0]

In [None]:
# Display the status of the first code object held by the code service.
guest_client.api.services.code[0].status

In [None]:
# Call get_column through the code service, passing the asset as `df`.
# NOTE(review): presumably this returns the result the data owner deposited
# rather than re-running the code — confirm.
real_result = guest_client.api.services.code.get_column(df=asset)
# Plot the returned column.
real_result.plot()

In [None]:
# Finish by landing the node started at the top of the notebook.
# NOTE(review): presumably this shuts down what `sy.orchestra.launch` started — confirm.
node.land()