# Data owner: upload dataset

## Install

In [1]:
#!pip install syft

In [2]:
from syft import Worker
import syft as sy
from syft.core.node.new.util import autocache, PANDAS_DATA
worker = Worker.named("pandas-test-domain-8", processes=1, reset=True)
root_domain_client = worker.root_client



SQLite Store Path:
!open file:///var/folders/q1/ryq93kwj055dlbpngxv1c7z40000gn/T/7bca415d13ed1ec841f0d0aede098dbb.sqlite

> Starting Worker: test-domain-1 - 7bca415d13ed1ec841f0d0aede098dbb - NodeType.DOMAIN - [<class 'syft.core.node.new.user_service.UserService'>, <class 'syft.core.node.new.metadata_service.MetadataService'>, <class 'syft.core.node.new.action_service.ActionService'>, <class 'syft.core.node.new.test_service.TestService'>, <class 'syft.core.node.new.dataset_service.DatasetService'>, <class 'syft.core.node.new.user_code_service.UserCodeService'>, <class 'syft.core.node.new.request_service.RequestService'>, <class 'syft.core.node.new.data_subject_service.DataSubjectService'>, <class 'syft.core.node.new.network_service.NetworkService'>, <class 'syft.core.node.new.policy_service.PolicyService'>, <class 'syft.core.node.new.message_service.MessageService'>, <class 'syft.core.node.new.project_service.ProjectService'>, <class 'syft.core.node.new.data_subject_member_service.Data

## Load data

It's not obvious how to deal with Unix timestamps in pandas -- it took me quite a while to figure this out. The file we're using here is a popularity-contest file I found on my system at /var/log/popularity-contest.

In [3]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 3)
plt.rcParams['font.family'] = 'sans-serif'

In [4]:
popcon = pd.read_csv(autocache(f"{PANDAS_DATA}/popularity-contest"), sep=' ', )[:-1]
popcon.columns = ['atime', 'ctime', 'package-name', 'mru-program', 'tag']

In [5]:
set(popcon["tag"])

{'<OLD>', '<RECENT-CTIME>', nan}

## Create mock data

Lets create a mock dataset

In [6]:
# weather_types = set(weather_2012_final.Weather.to_list())

In [7]:
from random import randint
import random

In [8]:
def get_random_timestamp():
    return "135" + "".join([str(randint(0,9)) for i in range(6)])

In [9]:
def get_mock_date(i):
    return str(parse('Jun 1 2010') + timedelta(days=i))

def get_mock_row(i):
    res = dict()
    for k, function in mock_functions.items():
        res[k] = function()
    return res
    

In [10]:
mock_functions = {'atime': lambda: get_random_timestamp(),
                  'ctime': lambda: get_random_timestamp(),
                  'package-name': lambda: random.choice([
                      'libghc-stm-dev',
                      'libqtdee-dev',
                      'geoclue-ubuntu-geoip',
                      'libdesktop-agnostic0',
                      'ubuntu-extras-keyring',
                      'libbsd0',
                      'libxres-dev']),
                  'mru-program': lambda: random.choice([
                      '/usr/bin/opam',
                      '/usr/bin/onboard',
                      '/lib/init/upstart-job',
                      '/usr/bin/page',
                  ]),
                  'tag': lambda: random.choice(['<OLD>', '<RECENT-CTIME>', 'nan'])}


In [11]:
mock = pd.DataFrame(data=[get_mock_row(i) for i in range(len(popcon))],
                    columns=popcon.columns)
    


Upload the data

In [12]:
dataset = sy.Dataset(name="test", asset_list=[sy.Asset(name=f"weather", data=popcon,
                                                       mock=mock, mock_is_real=False)])
root_domain_client.upload_dataset(dataset)


  0%|                                                                                                                 | 0/1 [00:00<?, ?it/s]

Uploading: weather


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.59it/s]


In [13]:
popcon.head()

Unnamed: 0,atime,ctime,package-name,mru-program,tag
0,1387295797,1367633260,perl-base,/usr/bin/perl,
1,1387295796,1354370480,login,/bin/su,
2,1387295743,1354341275,libtalloc2,/usr/lib/x86_64-linux-gnu/libtalloc.so.2.0.7,
3,1387295743,1387224204,libwbclient0,/usr/lib/x86_64-linux-gnu/libwbclient.so.0,<RECENT-CTIME>
4,1387295742,1354341253,libselinux1,/lib/x86_64-linux-gnu/libselinux.so.1,


## Create user account

In [14]:
user = root_domain_client.register(name="Jane Doe", email="jane@caltech.edu",
                            password="abc123", institution="Caltech", website="https://www.caltech.edu/")

In [15]:
# todo: give user data scientist role

In [16]:
guest_domain_client = worker.guest_client

In [17]:
guest_client = guest_domain_client.login(email="jane@caltech.edu", password="abc123")

# Data scientist: create syft_function

In [18]:
import numpy as np
import pandas as pd

## Summary

It's not obvious how to deal with Unix timestamps in pandas -- it took me quite a while to figure this out. The file we're using here is a popularity-contest file I found on my system at `/var/log/popularity-contest`.

## Get mocks

In [19]:
guest_domain_client = worker.guest_client
guest_client = guest_domain_client.login(email="jane@caltech.edu", password="abc123")

In [20]:
ds = guest_domain_client.datasets[0]

In [21]:
asset = ds.assets[0]

In [22]:
popcon = asset.mock.syft_action_data

## Parsing Unix timestamps

The colums are the access time, created time, package name, recently used program, and a tag



In [23]:
popcon[:5]

Unnamed: 0,atime,ctime,package-name,mru-program,tag
0,135105421,135875263,geoclue-ubuntu-geoip,/usr/bin/page,<RECENT-CTIME>
1,135307893,135973810,libxres-dev,/lib/init/upstart-job,
2,135821470,135821245,libqtdee-dev,/lib/init/upstart-job,
3,135747832,135951540,libqtdee-dev,/lib/init/upstart-job,<RECENT-CTIME>
4,135170173,135232410,libghc-stm-dev,/usr/bin/onboard,<RECENT-CTIME>


The magical part about parsing timestamps in pandas is that numpy datetimes are already stored as Unix timestamps. So all we need to do is tell pandas that these integers are actually datetimes -- it doesn't need to do any conversion at all.

We need to convert these to ints to start:

In [24]:
popcon['atime'] = popcon['atime'].astype(int)
popcon['ctime'] = popcon['ctime'].astype(int)

Every numpy array and pandas series has a dtype -- this is usually `int64`, `float64`, or `object`. Some of the time types available are `datetime64[s]`, `datetime64[ms]`, and `datetime64[us]`. There are also `timedelta` types, similarly.

We can use the `pd.to_datetime` function to convert our integer timestamps into datetimes. This is a constant-time operation -- we're not actually changing any of the data, just how pandas thinks about it.

In [25]:
popcon['atime'] = pd.to_datetime(popcon['atime'], unit='s')
popcon['ctime'] = pd.to_datetime(popcon['ctime'], unit='s')

If we look at the dtype now, it's `<M8[ns]`. As far as I can tell `M8` is secret code for `datetime64`.

In [26]:
popcon['atime'].dtype

dtype('<M8[ns]')

So now we can look at our `atime` and `ctime` as dates!

In [27]:
popcon[:5]

Unnamed: 0,atime,ctime,package-name,mru-program,tag
0,1974-04-13 17:17:01,1974-04-22 15:07:43,geoclue-ubuntu-geoip,/usr/bin/page,<RECENT-CTIME>
1,1974-04-16 01:31:33,1974-04-23 18:30:10,libxres-dev,/lib/init/upstart-job,
2,1974-04-22 00:11:10,1974-04-22 00:07:25,libqtdee-dev,/lib/init/upstart-job,
3,1974-04-21 03:43:52,1974-04-23 12:19:00,libqtdee-dev,/lib/init/upstart-job,<RECENT-CTIME>
4,1974-04-14 11:16:13,1974-04-15 04:33:30,libghc-stm-dev,/usr/bin/onboard,<RECENT-CTIME>


Now suppose we want to look at all packages that aren't libraries.

First, I want to get rid of everything with timestamp 0. Notice how we can just use a string in this comparison, even though it's actually a timestamp on the inside? That is because pandas is amazing.


In [28]:
popcon = popcon[popcon['atime'] > '1970-01-01']

Now we can use pandas' magical string abilities to just look at rows where the package name doesn't contain 'lib'.

In [29]:
nonlibraries = popcon[~popcon['package-name'].str.contains('lib')]

In [30]:
nonlibraries.sort_values('ctime', ascending=False)[:10]

Unnamed: 0,atime,ctime,package-name,mru-program,tag
1047,1974-04-17 00:44:51,1974-04-24 01:45:30,geoclue-ubuntu-geoip,/usr/bin/onboard,<RECENT-CTIME>
2724,1974-04-20 00:34:15,1974-04-24 01:38:43,geoclue-ubuntu-geoip,/lib/init/upstart-job,
1128,1974-04-19 09:13:28,1974-04-24 01:35:39,ubuntu-extras-keyring,/lib/init/upstart-job,<RECENT-CTIME>
1307,1974-04-16 00:01:06,1974-04-24 00:43:35,geoclue-ubuntu-geoip,/usr/bin/onboard,<RECENT-CTIME>
459,1974-04-17 23:38:49,1974-04-24 00:08:44,ubuntu-extras-keyring,/lib/init/upstart-job,<OLD>
1314,1974-04-14 20:45:25,1974-04-23 23:11:22,ubuntu-extras-keyring,/usr/bin/opam,<RECENT-CTIME>
225,1974-04-14 18:26:18,1974-04-23 23:09:13,ubuntu-extras-keyring,/usr/bin/opam,
558,1974-04-23 21:11:13,1974-04-23 22:38:45,geoclue-ubuntu-geoip,/usr/bin/opam,
2637,1974-04-12 20:46:41,1974-04-23 22:16:21,ubuntu-extras-keyring,/lib/init/upstart-job,<RECENT-CTIME>
1593,1974-04-17 09:24:01,1974-04-23 21:25:44,geoclue-ubuntu-geoip,/usr/bin/opam,<RECENT-CTIME>


Okay, cool, it says that I I installed ddd recently. And postgresql! I remember installing those things. Neat.

The whole message here is that if you have a timestamp in seconds or milliseconds or nanoseconds, then you can just "cast" it to a `'datetime64[the-right-thing]'` and pandas/numpy will take care of the rest.

## Putting it together

Now we want to request the full code execution.

Let's put all that together, to prove how easy it is. 6 lines of magical pandas!

In [31]:
@sy.syft_function(input_policy=sy.ExactMatch(popcon=ds.assets[0]),
                  output_policy=sy.SingleExecutionExactOutput())
def find_recently_installed(popcon):
    import pandas as pd
    popcon['atime'] = popcon['atime'].astype(int)
    popcon['ctime'] = popcon['ctime'].astype(int)
    popcon['atime'] = pd.to_datetime(popcon['atime'], unit='s')
    popcon['ctime'] = pd.to_datetime(popcon['ctime'], unit='s')
    popcon = popcon[popcon['atime'] > '1970-01-01']
    nonlibraries = popcon[~popcon['package-name'].str.contains('lib')]
    return nonlibraries.sort_values('ctime', ascending=False)[:10]

Request code execution

In [32]:
req = guest_domain_client.api.services.code.request_code_execution(find_recently_installed)

In [33]:
submitted_code = guest_domain_client.code[0]

In [34]:
assert guest_domain_client.api.services.code.get_all()

Create and submit project

In [35]:
new_project = sy.Project(name="Pandas Chapter 8",
                         description="Hi, I would like to get some insights about the installed programs")

In [36]:
new_project.add_request(obj=submitted_code, permission=sy.UserCodeStatus.EXECUTE)

In [37]:
guest_domain_client.submit_project(new_project)

# Data owner: execute syft_function

In [38]:
from syft import Worker, MessageStatus
from syft.core.node.new.project import Project

In [42]:
domain_client = worker.guest_client.login(email="info@openmined.org", password="changethis")

# Get messages

In [45]:
messages = domain_client.api.services.messages.get_all_for_status(MessageStatus.UNDELIVERED)

In [46]:
messages

Unnamed: 0,type,id,subject,status,created_at,linked_obj
0,syft.core.node.new.messages.Message,1ae557171bef443fb7ed0808f0c0d9b6,Approval Request,MessageStatus.UNDELIVERED,2023-05-15 13:03:10,<<class 'syft.core.node.new.request.Request'>:...
1,syft.core.node.new.messages.Message,aeecbc5a8c384d4087c4a85ce3970f93,Project Approval,MessageStatus.UNDELIVERED,2023-05-15 13:03:13,<<class 'syft.core.node.new.project.Project'>:...


In [47]:
project_message = [x for x in messages if issubclass(x.linked_obj.object_type, Project)][0]

In [48]:
request = project_message.link.requests[0]
func = request.changes[0].link
op = func.output_policy_type

In [49]:
func

```python
class UserCode:
  id: str = 826dbb1485b7483193e08b296be1c214
  node_uid: str = 7bca415d13ed1ec841f0d0aede098dbb
  user_verify_key: str = 34f0420468ec17cd6dc6c9aa95ba96b0552344becc7b8b856955bee85f44ab9e
  raw_code: str = "@sy.syft_function(input_policy=sy.ExactMatch(popcon=ds.assets[0]),
                  output_policy=sy.SingleExecutionExactOutput())
def find_recently_installed(popcon):
    import pandas as pd
    popcon['atime'] = popcon['atime'].astype(int)
    popcon['ctime'] = popcon['ctime'].astype(int)
    popcon['atime'] = pd.to_datetime(popcon['atime'], unit='s')
    popcon['ctime'] = pd.to_datetime(popcon['ctime'], unit='s')
    popcon = popcon[popcon['atime'] > '1970-01-01']
    nonlibraries = popcon[~popcon['package-name'].str.contains('lib')]
    return nonlibraries.sort_values('ctime', ascending=False)[:10]
"
  input_policy_type: str = <class 'syft.core.node.new.policy.ExactMatch'>
  input_policy_init_kwargs: str = {NodeView(node_name='test-domain-1', verify_key=aec6ea4dfc049ceacaeeebc493167a88a200ddc367b1fa32da652444b635d21f): {'popcon': <UID: 94036f7797794ce4a77c57446f20cd37>}}
  input_policy_state: str = b''
  output_policy_type: str = <class 'syft.core.node.new.policy.OutputPolicyExecuteOnce'>
  output_policy_init_kwargs: str = {}
  output_policy_state: str = b''
  parsed_code: str = "

def user_func_find_recently_installed_34f0420468ec17cd6dc6c9aa95ba96b0552344becc7b8b856955bee85f44ab9e_76996a89952a0cd5f53e2e6da4ba9f2787211c4990d0fb1f69c5224b7af1b38c(popcon):

    def find_recently_installed(popcon):
        import pandas as pd
        popcon['atime'] = popcon['atime'].astype(int)
        popcon['ctime'] = popcon['ctime'].astype(int)
        popcon['atime'] = pd.to_datetime(popcon['atime'], unit='s')
        popcon['ctime'] = pd.to_datetime(popcon['ctime'], unit='s')
        popcon = popcon[(popcon['atime'] > '1970-01-01')]
        nonlibraries = popcon[(~ popcon['package-name'].str.contains('lib'))]
        return nonlibraries.sort_values('ctime', ascending=False)[:10]
    result = find_recently_installed(popcon=popcon)
    return result
"
  service_func_name: str = "find_recently_installed"
  unique_func_name: str = "user_func_find_recently_installed_34f0420468ec17cd6dc6c9aa95ba96b0552344becc7b8b856955bee85f44ab9e_76996a89952a0cd5f53e2e6da4ba9f2787211c4990d0fb1f69c5224b7af1b38c"
  user_unique_func_name: str = "user_func_find_recently_installed_34f0420468ec17cd6dc6c9aa95ba96b0552344becc7b8b856955bee85f44ab9e"
  code_hash: str = "76996a89952a0cd5f53e2e6da4ba9f2787211c4990d0fb1f69c5224b7af1b38c"
  signature: str = (popcon)
  status: str = {NodeView(node_name='test-domain-1', verify_key=aec6ea4dfc049ceacaeeebc493167a88a200ddc367b1fa32da652444b635d21f): <UserCodeStatus.SUBMITTED: 'submitted'>}
  input_kwargs: str = ['popcon']
  enclave_metadata: str = None

```

In [50]:
find_recently_installed = func.unsafe_function



In [57]:
real_data = domain_client.datasets[0].assets[0].data.syft_action_data

In [58]:
real_data

Unnamed: 0,atime,ctime,package-name,mru-program,tag
0,1387295797,1367633260,perl-base,/usr/bin/perl,
1,1387295796,1354370480,login,/bin/su,
2,1387295743,1354341275,libtalloc2,/usr/lib/x86_64-linux-gnu/libtalloc.so.2.0.7,
3,1387295743,1387224204,libwbclient0,/usr/lib/x86_64-linux-gnu/libwbclient.so.0,<RECENT-CTIME>
4,1387295742,1354341253,libselinux1,/lib/x86_64-linux-gnu/libselinux.so.1,
...,...,...,...,...,...
2892,0,0,libreadline-dev,<NOFILES>,
2893,0,0,notify-osd-icons,<NOFILES>,
2894,0,0,python-apt-common,<NOFILES>,
2895,0,0,libindicator-messages-status-provider1,<NOFILES>,


In [59]:
real_result = find_recently_installed(popcon=real_data)

In [60]:
result = request.accept_by_depositing_result(real_result)
result
assert isinstance(result, sy.SyftSuccess)

# Data Owner: fetch result

In [66]:
guest_client.api.services.code[0].status

{NodeView(node_name='test-domain-1', verify_key=aec6ea4dfc049ceacaeeebc493167a88a200ddc367b1fa32da652444b635d21f): <UserCodeStatus.EXECUTE: 'execute'>}

In [67]:
asset = guest_client.datasets[0].assets[0]

In [68]:
real_result = guest_client.api.services.code.find_recently_installed(popcon=asset)

In [69]:
real_result

Unnamed: 0,atime,ctime,package-name,mru-program,tag
57,2013-12-17 04:55:39,2013-12-17 04:55:42,ddd,/usr/bin/ddd,<RECENT-CTIME>
450,2013-12-16 20:03:20,2013-12-16 20:05:13,nodejs,/usr/bin/npm,<RECENT-CTIME>
454,2013-12-16 20:03:20,2013-12-16 20:05:04,switchboard-plug-keyboard,/usr/lib/plugs/pantheon/keyboard/options.txt,<RECENT-CTIME>
445,2013-12-16 20:03:20,2013-12-16 20:05:04,thunderbird-locale-en,/usr/lib/thunderbird-addons/extensions/langpac...,<RECENT-CTIME>
396,2013-12-16 20:08:27,2013-12-16 20:05:03,software-center,/usr/sbin/update-software-center,<RECENT-CTIME>
449,2013-12-16 20:03:20,2013-12-16 20:05:00,samba-common-bin,/usr/bin/net.samba3,<RECENT-CTIME>
397,2013-12-16 20:08:25,2013-12-16 20:04:59,postgresql-client-9.1,/usr/lib/postgresql/9.1/bin/psql,<RECENT-CTIME>
398,2013-12-16 20:08:23,2013-12-16 20:04:58,postgresql-9.1,/usr/lib/postgresql/9.1/bin/postmaster,<RECENT-CTIME>
452,2013-12-16 20:03:20,2013-12-16 20:04:55,php5-dev,/usr/include/php5/main/snprintf.h,<RECENT-CTIME>
440,2013-12-16 20:03:20,2013-12-16 20:04:54,php-pear,/usr/share/php/XML/Util.php,<RECENT-CTIME>
