### User: Data Scientist

#### Goal:
- Select Italy and Canada trade datasets
- Perform a join between the two datasets on the `Commodity Code` column
- Perform analysis on the merged dataset

#### Summary:
- Select Italy and Canada trade datasets
- ETL the trade datasets
- Merge the two datasets on the `Commodity Code` column
- For each commodity calculate the export/import ratio
- Fetch all the commodities where the export/import ratio exceeds 10%

In [None]:
import syft as sy

# Select the United Nations network (index 0 of `sy.network`)
un_network = sy.network[0]

# Log in to the selected network through the `un_network` handle defined above.
# NOTE: these are demo credentials. Outside of a demo, read the email and
# password from the environment or an interactive prompt instead of
# hardcoding them in the notebook.
un_network_client = un_network.login(email="sheldon@caltech.edu", password="bazinga")

In [2]:
# List the datasets available on the network. The `Id` column of the listing
# is the key used below to look up an individual dataset.
un_network_client.datasets

Unnamed: 0,Name,Tags,Description,Dtype,Id,Domain,Shape
0,breast_cancer,"[mri, breast cancer, dicoms]",Labelled image dataset of patients suffering d...,ImageClassificationDataset,56lkw24,WHO,"((25000, 300, 300), (25000))"
1,canada_trade_data,"[canada, trade, un, commodities]",This dataset represents aggregated trade stati...,DataFrame,f3s9h1m,Canada,"(25000, 22)"
2,netherlands_trade_data,"[netherlands, trade, commodities, export]",This dataset represents aggregated trade stati...,DataFrame,2kf3o5d,Netherlands,"(35000, 22)"
3,italy_trade_data,"[italy, trade, un, commodities, export, import]",This dataset represents aggregated trade stati...,DataFrame,42wk65l,Italy,"(30000, 22)"
4,us_trade_data,"[us, trade, un, commodities]",This dataset represents aggregated trade stati...,DataFrame,86pfgh1,United States,"(40000, 22)"


In [None]:
# Pick the Canada and Italy trade datasets by the ids shown in the `Id`
# column of the dataset listing.
CANADA_TRADE_DATASET_ID = "f3s9h1m"
ITALY_TRADE_DATASET_ID = "42wk65l"

ca_trade_dataset_ptr = un_network_client.datasets[CANADA_TRADE_DATASET_ID]
it_trade_dataset_ptr = un_network_client.datasets[ITALY_TRADE_DATASET_ID]

In [None]:
# Keep only the columns needed for the analysis (each column listed once).
required_columns = [
    "Classification",
    "Commodity Code",
    "Commodity",
    "Trade Value (US$)",
    "Partner",
    "Trade Flow",
]

ca_dataset_ptr = ca_trade_dataset_ptr.select(columns=required_columns)
it_dataset_ptr = it_trade_dataset_ptr.select(columns=required_columns)

# In the Canada dataset keep only the rows where the `Partner` is `Italy`.
# The mask is built from `ca_dataset_ptr`, the dataset being filtered.
ca_filtered_dataset_ptr = ca_dataset_ptr.filter(
    ca_dataset_ptr["Partner"] == "Italy"
)

# Similarly, in the Italy dataset keep only the rows where the `Partner` is
# `Canada`. The mask must come from the Italy dataset itself, not from the
# Canada one.
it_filtered_dataset_ptr = it_dataset_ptr.filter(
    it_dataset_ptr["Partner"] == "Canada"
)

In [4]:
# Join the filtered Canada (left) and Italy (right) datasets with an inner
# join on `Commodity Code`. Columns present on both sides are distinguished
# by the `_ca` / `_it` suffixes.
# NOTE(review): `sympc` is not imported anywhere in the visible notebook, so
# this cell raises NameError on a fresh kernel — confirm the intended module
# and add the import to the first cell.
merged_dataset_ptr = sympc.merge(
    left=ca_filtered_dataset_ptr,
    right=it_filtered_dataset_ptr,
    on="Commodity Code",
    how="inner",
    suffixes=("_ca", "_it"),
)

# Show the column names, descriptions and privacy flags of the merged dataset.
merged_dataset_ptr.column_description

Unnamed: 0,Column,Description,Private
0,Classification_ca,Commodity Classification (HS= Harmonized System),True
1,Commodity Code,HS Commodity Code,True
2,Commodity_ca,Description,True
3,Trade Value_ca,in US dollars,True
4,Partner_ca,Description,False
5,Trade Flow_ca,Description,False
6,Classification_it,Commodity Classification (HS= Harmonized System),True
7,Commodity_it,Description,True
8,Trade Value_it,in US dollars,True
9,Partner_it,Description,True


In [None]:
# Split the merged dataset by the direction of trade as reported by Canada.
# The direction lives in the `Trade Flow_ca` column; `Partner_ca` only holds
# the partner country, so it cannot be used to tell imports from exports.
ca_imports_it_exports = merged_dataset_ptr.filter(
    merged_dataset_ptr["Trade Flow_ca"] == "Imports"
)
ca_export_it_imports = merged_dataset_ptr.filter(
    merged_dataset_ptr["Trade Flow_ca"] == "Exports"
)


# Select the commodities where the export/import ratio is greater than 10% (0.1).

# Canada imports / Italy exports: ratio = Italy's reported value / Canada's reported value
commodities1_with_error_gt_10 = ca_imports_it_exports.filter(
    (ca_imports_it_exports["Trade Value_it"] / ca_imports_it_exports["Trade Value_ca"])
    > 0.1
).select(columns=["Commodity Code"])

# Canada exports / Italy imports: ratio = Canada's reported value / Italy's reported value
commodities2_with_error_gt_10 = ca_export_it_imports.filter(
    (ca_export_it_imports["Trade Value_ca"] / ca_export_it_imports["Trade Value_it"])
    > 0.1
).select(columns=["Commodity Code"])

#### Awesome !!! We have successfully selected the commodity codes where the export/import ratio is greater than 10%.

#### Dummy data

In [5]:
import pandas as pd
from enum import Enum


## Dummy Data Store
# Column names of the dummy dataset listing, in display order.
_DATASET_FIELDS = ("Name", "Tags", "Description", "Dtype", "Id", "Domain", "Shape")

# One tuple per dataset; values line up with `_DATASET_FIELDS`.
_dataset_rows = [
    (
        "breast_cancer",
        ["mri", "breast cancer", "dicoms"],
        "Labelled image dataset of patients suffering different types of breast cancer",
        "ImageClassificationDataset",
        "56lkw24",
        "WHO",
        "((25000, 300, 300), (25000))",
    ),
    (
        "canada_trade_data",
        ["canada", "trade", "un", "commodities"],
        "This dataset represents aggregated trade statistics as reported by Canada about what it believes was imported/exported to/from its country in Feb 2021.",
        "DataFrame",
        "f3s9h1m",
        "Canada",
        "(25000, 22)",
    ),
    (
        "netherlands_trade_data",
        ["netherlands", "trade", "commodities", "export"],
        "This dataset represents aggregated trade statistics as reported by Netherlands about what it believes was imported/exported to/from its country in Feb 2021.",
        "DataFrame",
        "2kf3o5d",
        "Netherlands",
        "(35000, 22)",
    ),
    (
        "italy_trade_data",
        ["italy", "trade", "un", "commodities", "export", "import"],
        "This dataset represents aggregated trade statistics as reported by Italy about what it believes was imported/exported to/from its country in Feb 2021.",
        "DataFrame",
        "42wk65l",
        "Italy",
        "(30000, 22)",
    ),
    (
        "us_trade_data",
        ["us", "trade", "un", "commodities"],
        "This dataset represents aggregated trade statistics as reported by United States about what it believes was imported/exported to/from its country in Feb 2021.",
        "DataFrame",
        "86pfgh1",
        "United States",
        "(40000, 22)",
    ),
]

# Build the listing as a DataFrame with one row per dataset.
dataset_store = pd.DataFrame(
    [dict(zip(_DATASET_FIELDS, row)) for row in _dataset_rows]
)

class bcolors(Enum):
    """ANSI escape sequences for styling terminal output.

    Each attribute is an ``Enum`` member; the raw escape string is available
    through the member's ``.value``.
    """

    HEADER = "\033[95m"  # bright magenta
    OKBLUE = "\033[94m"  # bright blue
    OKCYAN = "\033[96m"  # bright cyan
    OKGREEN = "\033[92m"  # bright green
    WARNING = "\033[93m"  # bright yellow
    FAIL = "\033[91m"  # bright red
    ENDC = "\033[0m"  # reset all attributes
    BOLD = "\033[1m"  # bold text
    UNDERLINE = "\033[4m"  # underlined text

# Dummy schema of the merged dataset: one (column, description, private)
# tuple per column, in row order.
_schema_rows = [
    ("Classification_ca", "Commodity Classification (HS= Harmonized System)", True),
    ("Commodity Code", "HS Commodity Code", True),
    ("Commodity_ca", "Description", True),
    ("Trade Value_ca", "in US dollars", True),
    ("Partner_ca", "Description", False),
    ("Trade Flow_ca", "Description", False),
    ("Classification_it", "Commodity Classification (HS= Harmonized System)", True),
    ("Commodity_it", "Description", True),
    ("Trade Value_it", "in US dollars", True),
    ("Partner_it", "Description", True),
    ("Trade Flow_it", "Description", False),
]

# Pivot the rows into {field: {row_index: value}}.
d = {
    field: {row_idx: row[pos] for row_idx, row in enumerate(_schema_rows)}
    for pos, field in enumerate(("Column", "Description", "Private"))
}

merged_dataset_schema = pd.DataFrame.from_dict(d)