# Combining dataframes and scraping Canadian weather data

## Install

In [None]:
SYFT_VERSION = ">=0.8.2.b0,<0.9"
package_string = f'"syft{SYFT_VERSION}"'
# %pip install {package_string} -q

In [None]:
# syft absolute
import syft as sy

sy.requires(SYFT_VERSION)

In [None]:
server = sy.orchestra.launch(name="pandas-test-datasite-5", port=9085, reset=True)

# Data owner: upload dataset

In [None]:
root_datasite_client = server.login(email="info@openmined.org", password="changethis")

## Load data

By the end of this chapter, we're going to have downloaded all of Canada's weather data for 2012, and saved it to a CSV.

We'll do this by downloading it one month at a time, and then combining all the months together.

Here's the temperature every hour for 2012!

In [None]:
%matplotlib inline
# third party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

plt.style.use("ggplot")
plt.rcParams["figure.figsize"] = (15, 3)
plt.rcParams["font.family"] = "sans-serif"

In [None]:
# stdlib
from datetime import timedelta
import random

# third party
from dateutil.parser import parse
from pandas._libs.tslibs.timestamps import Timestamp

# syft absolute
from syft.service.project.project import Project
from syft.util.util import PANDAS_DATA
from syft.util.util import autocache

In [None]:
weather_2012_final = pd.read_csv(
    autocache(f"{PANDAS_DATA}/weather_2012.csv"), index_col="Date/Time",
)

In [None]:
assert len(weather_2012_final) == 8784

In [None]:
weather_2012_final["Temp (C)"].plot(figsize=(15, 6))

## Create mock data

Instead, we upload our dataset per month as a starting point

In [None]:
url_template = "http://climate.weather.gc.ca/climate_data/bulk_data_e.html?format=csv&stationID=5415&Year={year}&Month={month}&timeframe=1&submit"

In [None]:
weather_types = set(weather_2012_final.Weather.to_list())

In [None]:
def get_mock_date(i):
    """Return the timestamp string for mock row ``i``.

    Rows are spaced one hour apart starting at 2010-06-01 00:00:00, so the
    mock index has the same hourly cadence as the weather observations and
    hour-of-day analyses on the mock see all 24 hours.

    Args:
        i: Zero-based row offset, in hours, from the start timestamp.

    Returns:
        The timestamp formatted by ``str(datetime)``, e.g.
        ``"2010-06-01 05:00:00"``.
    """
    # stdlib
    from datetime import datetime

    # Build the fixed start date directly instead of re-parsing "Jun 1 2010"
    # on every call; the resulting datetime is the same.
    return str(datetime(2010, 6, 1) + timedelta(hours=i))


def get_mock_row(i):
    """Generate one mock observation.

    Every generator in the module-level ``mock_functions`` mapping is called
    once, in mapping order, and its value is stored under the same key.

    Args:
        i: Row position supplied by the caller; not used to generate values.

    Returns:
        dict mapping each ``mock_functions`` key to a freshly generated value.
    """
    return {column: make_value() for column, make_value in mock_functions.items()}

In [None]:
# Random value generators for the mock rows, keyed by column name.
# Keys must match the column names of the table being mocked exactly: the mock
# frame is built with an explicit column list, so a key that does not match
# any column leaves that column empty.
mock_functions = {
    "Temp (°C)": lambda: random.uniform(-10, 40),
    # Spelled with "°C" like "Temp (°C)" above, so both temperature columns
    # follow the same naming.
    "Dew Point Temp (°C)": lambda: random.uniform(-10, 10),
    "Rel Hum (%)": lambda: random.randint(50, 100),
    "Wind Spd (km/h)": lambda: random.randint(1, 30),
    "Visibility (km)": lambda: random.uniform(0.5, 40),
    "Stn Press (kPa)": lambda: random.uniform(50, 100),
    # weather_types is a set; random.choice needs a sequence.
    "Weather": lambda: random.choice(list(weather_types)),
}

In [None]:
# Build one asset per month of 2012, pairing the real table with a random mock.
assets = []
for month in range(1, 13):
    url = url_template.format(month=month, year=2012)
    # Real data: one CSV per month, indexed by its "Date/Time (LST)" column.
    weather = pd.read_csv(url, parse_dates=True, index_col="Date/Time (LST)")
    # Mock data: same number of rows and the same column list as the real
    # table, with generated timestamps as index and random values as rows.
    mock = pd.DataFrame(
        index=[Timestamp(get_mock_date(i)) for i in range(len(weather))],
        data=[get_mock_row(i) for i in range(len(weather))],
        columns=weather.columns,
    )

    assets.append(
        sy.Asset(name=f"weather{month}", data=weather, mock=mock, mock_is_real=False),
    )

Upload the data

In [None]:
dataset = sy.Dataset(name="test", asset_list=assets)
root_datasite_client.upload_dataset(dataset)

In [None]:
weather.head()

## Create user account

In [None]:
# Register an account for the data scientist using the root client.
user = root_datasite_client.register(
    name="Jane Doe",
    email="jane@caltech.edu",
    password="abc123",
    password_verify="abc123",
    institution="Caltech",
    website="https://www.caltech.edu/",
)

# TODO: give user data scientist role

# Client handle taken from the server without logging in.
guest_datasite_client = server.client

# Log in with the credentials registered above to get the data scientist's client.
guest_client = guest_datasite_client.login(email="jane@caltech.edu", password="abc123")

# Data scientist: create syft_function

## Summary
By the end of this chapter, we're going to have downloaded all of Canada's weather data for 2012, and saved it to a CSV.

We'll do this by downloading it one month at a time, and then combining all the months together.

## Get mocks

In [None]:
ds = guest_datasite_client.datasets[-1]

In [None]:
ds.assets

## Downloading one month of weather data

When playing with the cycling data, I wanted temperature and precipitation data to find out if people like biking when it's raining. So I went to the site for Canadian historical weather data, and figured out how to get it automatically.

Here we're going to get the data for March 2012, and clean it up

In [None]:
url_template = "weather{month}"

To get the data for March 2012, we need to format it with `month=3`, `year=2012`.


In [None]:
url = url_template.format(month=3, year=2012)

In [None]:
asset = ds.assets[url]
weather_mar2012 = asset.mock

This is super great! We can just use this mock directly, and just give it a URL as a filename. Awesome.
Here's the resulting dataframe.

In [None]:
weather_mar2012

In [None]:
weather_mar2012["Temp (°C)"].plot(figsize=(15, 5))

In [None]:
# weather_mar2012.columns = [
#     u'Year', u'Month', u'Day', u'Time', u'Data Quality', u'Temp (C)',
#     u'Temp Flag', u'Dew Point Temp (C)', u'Dew Point Temp Flag',
#     u'Rel Hum (%)', u'Rel Hum Flag', u'Wind Dir (10s deg)', u'Wind Dir Flag',
#     u'Wind Spd (km/h)', u'Wind Spd Flag', u'Visibility (km)', u'Visibility Flag',
#     u'Stn Press (kPa)', u'Stn Press Flag', u'Hmdx', u'Hmdx Flag', u'Wind Chill',
#     u'Wind Chill Flag', u'Weather']

In [None]:
# weather[u"Temp (C)"].plot(figsize=(15, 5))

You'll notice in the summary above that there are a few columns which are either entirely empty or only have a few values in them. Let's get rid of all of those with dropna.

The argument axis=1 to dropna means "drop columns, not rows", and how='any' means "drop the column if any value is null".

This is much better now -- we only have columns with real data.

In [None]:
weather_mar2012 = weather_mar2012.dropna(axis=1, how="any")
weather_mar2012[:5]

The Year/Month/Day/Time columns are redundant, though, and the Data Quality column doesn't look too useful. Let's get rid of those.

The axis=1 argument means "Drop columns", like before. The default for operations like dropna and drop is always to operate on rows.

In [None]:
# weather_mar2012 = weather_mar2012.drop(['Year', 'Month', 'Day', 'Time', 'Data Quality'], axis=1)
# weather_mar2012[:5]

Awesome! We now only have the relevant columns, and it's much more manageable.



## Plotting the temperature by hour of day

This one's just for fun -- we've already done this before, using groupby and aggregate! We will learn whether or not it gets colder at night. Well, obviously. But let's do it anyway.

In [None]:
# Median temperature for each hour of the day.
temperatures = weather_mar2012[["Temp (°C)"]].copy()
# Call head() so the first rows are printed rather than the bound method's repr.
print(temperatures.head())
temperatures.loc[:, "Hour"] = weather_mar2012.index.hour
# The "median" alias computes the same group-wise median as np.median, without
# the FutureWarning recent pandas versions emit when given the NumPy callable.
temperatures.groupby("Hour").aggregate("median").plot()

So it looks like the time with the highest median temperature is 2pm. Neat.

## Getting the whole year of data


Okay, so what if we want the data for the whole year? Ideally the API would just let us download that, but I couldn't figure out a way to do that.

First, let's put our work from above into a function that gets the weather for a given month.

I noticed that there's an irritating bug where when I ask for January, it gives me data for the previous year, so we'll fix that too. [no, really. You can check =)]

In [None]:
def download_weather_month(year=2012, month=1):
    """Return one month's mock weather table with incomplete columns removed.

    The asset name is produced by formatting the module-level ``url_template``
    with ``year`` and ``month``, and is looked up in ``ds.assets``.

    Args:
        year: Year passed to ``url_template.format``.
        month: Month number passed to ``url_template.format``.

    Returns:
        The asset's mock data after ``dropna(axis=1)``, i.e. without any
        column that contains a missing value.
    """
    asset_name = url_template.format(year=year, month=month)
    monthly_mock = ds.assets[asset_name].mock
    return monthly_mock.dropna(axis=1)

We can test that this function does the right thing:



In [None]:
download_weather_month(2012, 1)[:5]

Now we can get all the months at once. This will take a little while to run.



In [None]:
data_by_month = [download_weather_month(2012, i) for i in range(1, 13)]

Once we have this, it's easy to concatenate all the dataframes together into one big dataframe using pd.concat. And now we have the whole year's data!

In [None]:
data_by_month[0]

In [None]:
weather_2012 = pd.concat(data_by_month)

In [None]:
weather_2012

## Putting it together

Now we want to request the full code execution.

Let's put all that together, to prove how easy it is. 6 lines of magical pandas!

If you want to play around, try changing sum to max, numpy.median, or any other function you like.

In [None]:
@sy.syft_function(
    input_policy=sy.ExactMatch(
        month1df=ds.assets["weather1"], month2df=ds.assets["weather2"],
    ),
    output_policy=sy.SingleExecutionExactOutput(),
)
def get_2012_weather(month1df, month2df):
    """Combine two monthly weather tables into one.

    Columns containing any missing value are dropped from each table
    separately before the tables are concatenated row-wise.

    Args:
        month1df: Weather table for the first month.
        month2df: Weather table for the second month.

    Returns:
        A single table with the rows of ``month1df`` followed by the rows of
        ``month2df``.
    """
    # third party
    import pandas as pd

    cleaned = [frame.dropna(axis=1) for frame in (month1df, month2df)]
    return pd.concat(cleaned)

Create and submit project

In [None]:
new_project = sy.Project(
    name="Pandas Chapter 5",
    description="Hi, get all weather data for 2012",
    members=[guest_client],
)
new_project

In [None]:
project = new_project.send()
assert isinstance(project, sy.service.project.project.Project)
project

In [None]:
project.create_code_request(get_2012_weather, guest_client)

In [None]:
assert len(guest_client.code.get_all()) == 1

In [None]:
assert len(project.events) == 1

In [None]:
assert isinstance(project.events[0], sy.service.project.project.ProjectRequest)

# Data owner: approve request

In [None]:
# syft absolute

In [None]:
root_datasite_client = server.login(email="info@openmined.org", password="changethis")

## Get notifications

In [None]:
notifications = root_datasite_client.notifications.get_all_unread()

In [None]:
notifications

In [None]:
# Pick the first unread notification whose linked object type is a Project
# (indexing with [0] raises IndexError if no notification matches).
project_notification = [
    x for x in notifications if issubclass(x.linked_obj.object_type, Project)
][0]

In [None]:
request = project_notification.link.events[0].request
func = request.code

In [None]:
func

In [None]:
get_col_user_function = func.run

In [None]:
real_data1, real_data2 = (
    root_datasite_client.datasets[-1].assets["weather1"].data,
    root_datasite_client.datasets[-1].assets["weather2"].data,
)

In [None]:
real_data1

In [None]:
real_result = get_col_user_function(month1df=real_data1, month2df=real_data2)

In [None]:
# Concatenation must keep every row of both monthly inputs; assert it so a
# mismatch fails the run instead of only being displayed.
assert len(real_data1) + len(real_data2) == len(real_result)

In [None]:
result = request.approve()
assert isinstance(result, sy.SyftSuccess)

# Data scientist: compute result

In [None]:
guest_client = guest_datasite_client.login(email="jane@caltech.edu", password="abc123")

In [None]:
asset1, asset2 = (
    guest_client.datasets[0].assets["weather1"],
    guest_client.datasets[0].assets["weather2"],
)

In [None]:
guest_client.code[0].status

In [None]:
result_ptr = guest_client.code.get_2012_weather(month1df=asset1, month2df=asset2)
real_result = result_ptr.get()

In [None]:
weather_2012 = real_result

In [None]:
len(weather_2012)

It's slow and unnecessary to download the data every time, so let's save our dataframe for later use!

In [None]:
# stdlib
from pathlib import Path
import tempfile

# Save into the platform's temp directory rather than a hard-coded "/tmp",
# which does not exist on Windows.
weather_2012.to_csv(Path(tempfile.gettempdir()) / "weather_2012.csv")

And we're done!

In [None]:
server.land()