# String Operations - Which month was the snowiest?

## Install

In [None]:
SYFT_VERSION = ">=0.8.1b0,<0.9"
package_string = f'"syft{SYFT_VERSION}"'
# %pip install {package_string} -f https://whls.blob.core.windows.net/unstable/index.html -q

In [None]:
import syft as sy
sy.requires(SYFT_VERSION)

In [None]:
node = sy.orchestra.launch(name="pandas-test-domain-6", port=9086, reset=True)

# Data owner: upload dataset

In [None]:
root_domain_client = node.login(email="info@openmined.org", password="changethis")

## Load data

This chapter works with all of Canada's weather data for 2012, which has already been downloaded and saved to a CSV.

That CSV was built by downloading the data one month at a time, and then combining all the months together.

Here's the temperature every hour for 2012!

In [None]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 3)
plt.rcParams['font.family'] = 'sans-serif'

In [None]:
from datetime import timedelta
import random
from dateutil.parser import parse
from pandas._libs.tslibs.timestamps import Timestamp
from syft.service.project.project import Project
from syft.util.util import autocache, PANDAS_DATA

In [None]:
weather_2012_final = pd.read_csv(autocache(f"{PANDAS_DATA}/weather_2012.csv"), index_col='Date/Time', parse_dates=True)
weather_2012_final['Temp (C)'].plot(figsize=(15, 6))

## Create mock data

Instead, we upload our dataset per month as a starting point

In [None]:
weather_types = set(weather_2012_final.Weather.to_list())

In [None]:
def get_mock_date(i):
    """Return the mock timestamp for row ``i`` as a string.

    The result is the date 'Jun 1 2010' (parsed with dateutil's ``parse``)
    advanced by ``i`` days and converted with ``str``.
    """
    start = parse('Jun 1 2010')
    offset = timedelta(days=i)
    return str(start + offset)

def get_mock_row(i):
    """Build one mock row from the generators in ``mock_functions``.

    Every zero-argument callable in the module-level ``mock_functions``
    mapping is called once, and its result is stored under the same key.
    The ``i`` argument is not used by the body.
    """
    return {column: make_value() for column, make_value in mock_functions.items()}
    

In [None]:
# random.choice needs a sequence, so turn the set of weather descriptions into
# a list once here instead of rebuilding it for every generated row.
weather_type_choices = list(weather_types)

# Column name -> zero-argument callable producing one random mock value.
mock_functions = {
    "Temp (C)": lambda: random.uniform(-10, 40),
    "Dew Point Temp (C)": lambda: random.uniform(-10, 10),
    "Rel Hum (%)": lambda: random.randint(50, 100),
    "Wind Spd (km/h)": lambda: random.randint(1, 30),
    'Visibility (km)': lambda: random.uniform(0.5, 40),
    'Stn Press (kPa)': lambda: random.uniform(50, 100),
    'Weather': lambda: random.choice(weather_type_choices),
}

In [None]:
# Mock frame with as many rows as the real data and the same column layout.
n_rows = len(weather_2012_final)
mock_index = [Timestamp(get_mock_date(i)) for i in range(n_rows)]
mock_rows = [get_mock_row(i) for i in range(n_rows)]
mock = pd.DataFrame(
    data=mock_rows,
    index=mock_index,
    columns=weather_2012_final.columns,
)
    


Upload the data

In [None]:
# One asset that pairs the real weather data with the randomly generated mock.
weather_asset = sy.Asset(
    name="weather",
    data=weather_2012_final,
    mock=mock,
    mock_is_real=False,
)
dataset = sy.Dataset(name="test", asset_list=[weather_asset])
root_domain_client.upload_dataset(dataset)


In [None]:
weather_2012_final.head()

## Create user account

In [None]:
user = root_domain_client.register(name="Jane Doe", email="jane@caltech.edu",
                            password="abc123", institution="Caltech", website="https://www.caltech.edu/")
# todo: give user data scientist role
guest_domain_client = node.client
guest_client = guest_domain_client.login(email="jane@caltech.edu", password="abc123")

# Data scientist: create syft function

In [None]:
import numpy as np
import pandas as pd

## Summary
This chapter works with all of Canada's weather data for 2012, which has already been downloaded and saved to a CSV.

That CSV was built by downloading the data one month at a time, and then combining all the months together.

## Get mocks

In [None]:
ds = guest_domain_client.datasets[0]

In [None]:
asset = ds.assets[0]

In [None]:
weather_2012 = asset.mock.syft_action_data

## String Operations

You'll see that the 'Weather' column has a text description of the weather that was going on each hour. We'll assume it's snowing if the text description contains "Snow".

pandas provides vectorized string functions, to make it easy to operate on columns containing text. There are some great examples in the documentation.

In [None]:
weather_description = weather_2012['Weather']
is_snowing = weather_description.str.contains('Snow')

This gives us a binary vector, which is a bit hard to look at, so we'll plot it.



In [None]:
# Not super useful
is_snowing[:5]

In [None]:
# More useful!
is_snowing=is_snowing.astype(float)
is_snowing.plot()

## Use resampling to find the snowiest month

If we wanted the median temperature each month, we could use the resample() method like this:

In [None]:
weather_2012

In [None]:
weather_2012['Temp (C)'].resample('M').apply(np.median).plot(kind='bar')

Unsurprisingly, July and August are the warmest.

So we can think of snowiness as being a bunch of 1s and 0s instead of Trues and Falses:

In [None]:
is_snowing.astype(float)[:10]

And then use `resample` to find the percentage of time it was snowing each month:

In [None]:
is_snowing.astype(float).resample('M').apply(np.mean)

In [None]:
is_snowing.astype(float).resample('M').apply(np.mean).plot(kind='bar')

So now we know! In 2012, December was the snowiest month. Also, this graph suggests something that I feel -- it starts snowing pretty abruptly in November, and then tapers off slowly and takes a long time to stop, with the last snow usually being in April or May.


## Plotting temperature and snowiness stats together

We can also combine these two statistics (temperature, and snowiness) into one dataframe and plot them together:



In [None]:
temperature = weather_2012['Temp (C)'].resample('M').apply(np.median)
is_snowing = weather_2012['Weather'].str.contains('Snow')
snowiness = is_snowing.astype(float).resample('M').apply(np.mean)

# Name the columns
temperature.name = "Temperature"
snowiness.name = "Snowiness"

We'll use `concat` again to combine the two statistics into a single dataframe.

In [None]:
stats = pd.concat([temperature, snowiness], axis=1)

In [None]:
stats.columns = ["temperature", "snowiness"]

## Putting it together

Now we want to request the full code execution.

Let's put all that together, to prove how easy it is. 6 lines of magical pandas!

If you want to play around, try changing `np.median` or `np.mean` to `max`, `sum`, or any other function you like.

In [None]:
@sy.syft_function(input_policy=sy.ExactMatch(df=ds.assets[0]),
                  output_policy=sy.SingleExecutionExactOutput())
def snow_and_temperature(df):
    """Compute snow and temperature statistics from a weather DataFrame.

    Args:
        df: DataFrame with a 'Weather' text column and a 'Temp (C)' column.
            Its index must support ``resample('M')``.

    Returns:
        A 4-tuple ``(is_snowing, median_temperature, snow_frequency, stats)``:
        - is_snowing: per-row float series, 1.0 where 'Weather' contains
          "Snow" and 0.0 otherwise.
        - median_temperature: median of 'Temp (C)' per 'M' resample bin.
        - snow_frequency: mean of ``is_snowing`` per 'M' resample bin.
        - stats: DataFrame combining the two resampled series under the
          columns "temperature" and "snowiness".
    """
    # Imports are kept inside the body, as in the original; presumably the
    # function source is executed remotely by syft -- confirm before hoisting.
    import pandas as pd
    import numpy as np

    is_snowing = df['Weather'].str.contains('Snow').astype(float)

    # Each statistic is computed once and reused for both the individual
    # return values and the combined frame.
    median_temperature = df['Temp (C)'].resample('M').apply(np.median)
    snow_frequency = is_snowing.resample('M').apply(np.mean)

    # The columns are renamed on the combined frame only, so the series
    # returned individually keep their original names.
    stats = pd.concat([median_temperature, snow_frequency], axis=1)
    stats.columns = ["temperature", "snowiness"]

    return is_snowing, median_temperature, snow_frequency, stats

Request code execution

In [None]:
req = guest_domain_client.code.request_code_execution(snow_and_temperature)

In [None]:
submitted_code = guest_domain_client.code[0]

In [None]:
assert guest_domain_client.code.get_all()

Create and submit project

In [None]:
# Project that will carry the code-execution request to the data owner.
# The name says chapter 6 to match this notebook ("pandas-test-domain-6");
# "Chapter 5" was a leftover from the previous tutorial.
project_create = sy.Project(
    name="Pandas Chapter 6",
    description="Hi, I would like to get some insights about snow and temperature for 2012",
    shareholders=[guest_domain_client],
    user_email_address="jane@caltech.edu",
    users=[guest_domain_client],
)
project_create

In [None]:
projects = project_create.start()
assert len(projects) == 1
project = projects[0]

In [None]:
project.add_request(request=req)

In [None]:
assert len(project.events) == 1

In [None]:
assert isinstance(project.events[0], sy.service.project.project.ProjectRequest)

# Data owner: execute syft_function

In [None]:
from syft import MessageStatus

In [None]:
domain_client = node.login(email="info@openmined.org", password="changethis")

# Get messages

In [None]:
messages = domain_client.api.services.messages.get_all_unread()

In [None]:
messages

In [None]:
project_message = [x for x in messages if issubclass(x.linked_obj.object_type, Project)][0]

In [None]:
request = project_message.link.events[0].request
func = request.changes[0].link
op = func.output_policy_type

In [None]:
func

In [None]:
get_col_user_function = func.unsafe_function

In [None]:
real_data = domain_client.datasets[0].assets[0].data

In [None]:
real_data

In [None]:
real_result = get_col_user_function(df=real_data)

In [None]:
result = request.accept_by_depositing_result(real_result)
result
assert isinstance(result, sy.SyftSuccess)

# Data scientist: fetch result

In [None]:
guest_client = node.login(email="jane@caltech.edu", password="abc123")

In [None]:
asset = guest_client.datasets[0].assets[0]

In [None]:
guest_client.code[0].status

In [None]:
result_ptr = guest_client.code.snow_and_temperature(df=asset)
real_result = result_ptr.get_from(guest_client)

In [None]:
is_snowing, median_temperature, snow_frequency, snow_and_temperature  = real_result[0], real_result[1], real_result[2], real_result[3]

In [None]:
is_snowing.plot()

In [None]:
median_temperature.plot(kind="bar")

In [None]:
snow_frequency.plot(kind="bar")

In [None]:
snow_and_temperature.plot(kind='bar', subplots=True, figsize=(15, 10))

In [None]:
node.land()