# Find out on which weekday people bike the most with groupby and aggregate

In [None]:
# Version specifier for the syft package used by this notebook.
SYFT_VERSION = ">=0.8.1b0,<0.9"
# Wrap the requirement in double quotes so it reaches pip as a single
# argument (the specifier contains '<' and '>').
package_string = f'"syft{SYFT_VERSION}"'
# Quiet install (-q); -f adds an extra location where pip looks for wheels.
!pip install {package_string} -f https://whls.blob.core.windows.net/unstable/index.html -q

In [None]:
import syft as sy
# NOTE(review): presumably verifies that the installed syft satisfies
# SYFT_VERSION — confirm against the syft documentation.
sy.requires(SYFT_VERSION)

In [None]:
# Launch the node used throughout this notebook (reset=True is passed on launch).
node = sy.orchestra.launch(
    name="pandas-test-domain-4",
    port="8074",
    reset=True,
)

# Data owner: upload data

In [None]:
# Log in to the node with the data owner's credentials.
root_domain_client = node.login(email="info@openmined.org", password="changethis")

## Load data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import timedelta
from dateutil.parser import parse
from pandas._libs.tslibs.timestamps import Timestamp
from syft.service.project.project import Project
from syft.util.util import autocache, PANDAS_DATA

In [None]:
# Load the bike counts: semicolon-separated, latin1-encoded, with the
# day-first 'Date' column parsed as dates and used as the index.
df = pd.read_csv(
    autocache(f"{PANDAS_DATA}/bikes.csv"),
    sep=';',
    encoding='latin1',
    parse_dates=['Date'],
    dayfirst=True,
    index_col='Date',
)
# Preview the first three rows.
df.head(3)

## Create mock data

In [None]:
import numpy as np
from random import randint

def get_mock_int(i, var=10):
    """Return a random integer close to a cosine wave evaluated at ``i``.

    The wave is centred on 100 with an amplitude of 50; the result is drawn
    uniformly from ``centre - var`` to ``centre + var`` (both inclusive),
    with each bound clamped to be at least 1.
    """
    centre = int(np.cos(i / 10) * 50) + 100
    lower = max(centre - var, 1)
    upper = max(1, centre + var)
    return randint(lower, upper)

def get_mock_date(i):
    """Return, as a string, the date ``i`` days after 1 June 2005."""
    first_day = parse('Jun 1 2005').date()
    shifted = first_day + timedelta(days=i)
    return str(shifted)

# Some randomness: seven pairs of random integers in 1..40. get_mock_row
# uses the first value of pair j as an offset added to the row index and
# the second value as the `var` argument of get_mock_int.
r = [(randint(1, 40), randint(1, 40)) for _ in range(7)]

def get_mock_row(i):
    """Build one mock data row for row index ``i``.

    Seven integers are generated with ``get_mock_int``; value ``j`` uses
    ``r[j][0]`` as an offset added to ``i`` and ``r[j][1]`` as the variance.
    Two ``None`` placeholders are then added, so the returned list has nine
    entries with the second and the last one empty.
    """
    res = [get_mock_int(i + r[j][0], r[j][1]) for j in range(7)]
    # Leave the second and the ninth (last) column empty. The second
    # placeholder is appended explicitly rather than inserted at an index
    # past the end of the list.
    res.insert(1, None)
    res.append(None)
    return res
    

In [None]:
# Mock frame with as many rows as df and the same column names: one
# synthetic date (index) and one synthetic row of values per row of df.
# Uses the public pd.Timestamp rather than the class imported from the
# private pandas._libs module.
mock = pd.DataFrame(
    index=[pd.Timestamp(get_mock_date(i)) for i in range(len(df))],
    data=[get_mock_row(i) for i in range(len(df))],
    columns=df.columns,
)

Upload the data

In [None]:
# Pair the real frame with its mock as the "bikes" asset of the "bikes2"
# dataset, then upload the dataset through the data owner's client.
dataset = sy.Dataset(
    name="bikes2",
    asset_list=[
        sy.Asset(name="bikes", data=df, mock=mock, mock_is_real=False),
    ],
)
root_domain_client.upload_dataset(dataset)


## Create data scientist

In [None]:
# Register an account for the data scientist via the data owner's client.
user = root_domain_client.register(
    name="Jane Doe",
    email="jane@caltech.edu",
    password="abc123",
    institution="Caltech",
    website="https://www.caltech.edu/",
)

# TODO: give user data scientist role

# Get a client from the node, then log in with the new account.
guest_domain_client = node.client

guest_client = guest_domain_client.login(
    email="jane@caltech.edu",
    password="abc123",
)

# Data Scientist: create result pointer

## Download mock and submit project

### Get mock

In [None]:
# Take the first dataset listed by the guest client.
ds = guest_domain_client.datasets[0]

In [None]:
# Look up the asset named "bikes" in that dataset.
asset = ds.assets["bikes"]

In [None]:
# Display the asset in the notebook.
asset

In [None]:
# Work with the asset's mock data rather than the real data.
bikes_pointer = asset.mock

### Adding a 'weekday' column to our dataframe

First, we need to load up the data. We've done this before.

In [None]:
# Plot the 'Berri 1' column.
bikes_pointer['Berri 1'].plot()

Next up, we're just going to look at the Berri bike path. Berri is a street in Montreal, with a pretty important bike path. I use it mostly on my way to the library now, but I used to take it to work sometimes when I worked in Old Montreal.

So we're going to create a dataframe with just the Berri bike path in it.

In [None]:
# Select only the 'Berri 1' column (as a frame) and take a copy of it.
berri_bikes_pointer = bikes_pointer[['Berri 1']].copy()

In [None]:
# Show the first five rows.
berri_bikes_pointer[:5]

Next, we need to add a 'weekday' column. Firstly, we can get the weekday from the index. We haven't talked about indexes yet, but the index is what's on the left on the above dataframe, under 'Date'. It's basically all the days of the year.

In [None]:
# The index holds the dates.
berri_bikes_pointer.index

You can see that actually some of the days are missing -- only 310 days of the year are actually there. Who knows why.

Pandas has a bunch of really great time series functionality, so if we wanted to get the day of the month for each row, we could do it like this:

In [None]:
# Day of the month for each entry of the index.
berri_bikes_pointer.index.day

We actually want the weekday, though:

In [None]:
# Weekday number for each entry of the index (0 is Monday).
berri_bikes_pointer.index.weekday

These are the days of the week, where 0 is Monday. I found out that 0 was Monday by checking on a calendar.

Now that we know how to get the weekday, we can add it as a column in our dataframe like this:

In [None]:
# Display the frame before the 'weekday' column is added.
berri_bikes_pointer

In [None]:
# Store the weekday number of each date in a new 'weekday' column.
berri_bikes_pointer['weekday'] = berri_bikes_pointer.index.weekday

In [None]:
# Alternative spelling of the column assignment, kept for reference:
# berri_bikes_pointer.loc[:,'weekday'] = berri_bikes_pointer.index.weekday
# Show the first five rows, now including the 'weekday' column.
berri_bikes_pointer[:5]

### Adding up the cyclists by weekday

This turns out to be really easy!

Dataframes have a `.groupby()` method that is similar to SQL groupby, if you're familiar with that. I'm not going to explain more about it right now -- if you want to know more, the [documentation](https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html) is really good.

In this case, `berri_bikes_pointer.groupby('weekday').aggregate(sum)` means "Group the rows by weekday and then add up all the values with the same weekday".

In [None]:
# res = berri_bikes_pointer.groupby('weekday')

In [None]:
# Group the rows by weekday and add up the values within each group.
# NOTE(review): recent pandas releases emit a FutureWarning when the builtin
# `sum` is passed to aggregate and recommend the string "sum" instead —
# confirm it behaves the same through the syft pointer before switching.
weekday_counts_pointer = berri_bikes_pointer.groupby('weekday').aggregate(sum)
# Display the per-weekday totals.
weekday_counts_pointer

It's hard to remember what 0, 1, 2, 3, 4, 5, 6 mean, so we can fix it up and graph it:

In [None]:
# weekday_counts.index = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# weekday_counts

In [None]:
# Bar chart of the per-weekday totals.
weekday_counts_pointer.plot(kind='bar')

So it looks like Montrealers are commuter cyclists -- they bike much more during the week. Neat!

### Putting it together

Now we want to request the real result.

In [None]:
# Request the real result for this computation through the guest client.
weekday_counts_pointer.request(guest_client)

# Data owner: approve result

In [None]:
# Log in again with the data owner's credentials.
root_domain_client = node.login(email="info@openmined.org", password="changethis")

In [None]:
# Approve the first request in the request list.
root_domain_client.api.services.request[0].approve()

# Data scientist: fetch result

In [None]:
# Log in again as the data scientist.
guest_client = guest_domain_client.login(email="jane@caltech.edu", password="abc123")
# Fetch the result of the approved request through the guest client.
real_result = weekday_counts_pointer.get_from(guest_client)
# Display the fetched result.
real_result

In [None]:
# Bar chart of the fetched per-weekday totals.
real_result.plot(kind='bar')