# Dataset: _reports_

In [None]:
import pandas as pd 
import requests, zipfile, io, json, re

import src.utils as ut

# Resolve the root directory of the application through the project helper
project_path = ut.project_path()

# Location of the metadata file from which the contentUrl of each
# distribution is read; it is handed to `ut.info_meta` for inspection
meta_filename = f'{project_path}/meta/mosquito_alert/reports.json'
ut.info_meta(meta_filename)

## 1. Distribution from Zenodo cloud

Make sure to use the URL of the most recent version of the dataset. The URL
given below is just an example and probably does not point to the most
recent dataset version. However, if the dataset link does not point to the
latest version, Zenodo issues a warning together with a link to the most
recent version.

In [None]:
# Read the metadata of the distribution at index 0 (no hasPart selection)
contentUrl, dataset_name, distr_name = ut.get_meta(
    meta_filename,
    idx_distribution=0,
    idx_hasPart=None,
)

# Create the destination folder for the downloaded data
path = f'{project_path}/data/{dataset_name}/{distr_name}'
ut.makedirs(path)

In [None]:
# Download the zip container and open it in memory
r = requests.get(contentUrl)
# Fail early on an HTTP error (4xx/5xx) instead of handing an error page to
# ZipFile, which would only surface as a confusing BadZipFile exception
r.raise_for_status()
z = zipfile.ZipFile(io.BytesIO(r.content))

We have the option to extract all the report files into a distribution folder.

In [None]:
# Unpack every member of the zip container into the distribution folder
z.extractall(path=path)

Alternatively, we can concatenate all reports into a single dataframe and
save it to a file.

In [None]:
# Merge all reports into a dataframe
# Only the archive members whose name contains 'all_reports' are loaded
reports = [s for s in z.namelist() if 'all_reports' in s]

df_reports = []
for name in reports:
    # The context manager closes each archive member once it has been parsed
    with z.open(name) as f:
        d = json.load(f)
    df_reports.append(pd.DataFrame.from_records(d, coerce_float=True))

# One dataframe holding the records of every report file
df = pd.concat(df_reports)
df.info()

Some report attributes are key-value, JSON-like data that need additional
tables to be fully understood. For example, for “tiger_responses”,
since multi-language translations are available, we use the language as the index.

In [None]:
# Locate the translation dictionary inside the zip container
reports_translation = [s for s in z.namelist() if 'translation_dict' in s]

# Read the raw bytes of the first matching member (closed after reading)
with z.open(reports_translation[0]) as f:
    raw = f.read()

try:
    d = json.loads(raw)
except ValueError:
    print("Warning: not a valid Json format. Try to get rid of trailing comma.")
    # Attempt the repair only when the first parse actually failed: drop a
    # trailing comma that sits between a closing quote and a closing brace
    try:
        repaired = re.sub(r"\"\s*,\s*\}", "\" }", raw.decode('utf-8'))
        d = json.loads(repaired)
    except ValueError:
        print("Json format is still not valid.")
        # Stop here: continuing would build the table from an undefined or
        # stale `d` left over from a previous cell
        raise

# Top-level keys of the dictionary become the index of the dataframe
df_reports_translation = pd.DataFrame.from_dict(d, orient='index')
df_reports_translation.info()

In [None]:
# Common path stem shared by all output files of this distribution
filename = f'{path}/all_reports'

# Save the reports as parquet: very low file size (requires pyArrow)
df.to_parquet(f'{filename}.parquet')
# Save the reports as CSV: about x10 the size when compared with the dataframe
df.to_csv(f'{filename}.csv')

# Save the reports translation as CSV
df_reports_translation.to_csv(f'{filename}_translation.csv')

## 2. Distribution from MosquitoAlert Github repository

In contrast with the Zenodo distribution, the dataset stored on the GitHub server is
always the latest available version, since it is loaded daily into Zenodo.

In [None]:
# Read the metadata of the distribution at index 1 (no hasPart selection)
contentUrl, dataset_name, distr_name = ut.get_meta(
    meta_filename,
    idx_distribution=1,
    idx_hasPart=None,
)

# Create the destination folder for the downloaded data
path = f'{project_path}/data/{dataset_name}/{distr_name}'
ut.makedirs(path)

In [None]:
# Request reports in json format and concatenate all of them into a dataframe
# Every URL except the last one is requested here
df_reports = []
for url in contentUrl[:-1]:
    r = requests.get(url)
    # Fail early on an HTTP error (4xx/5xx) rather than parsing an error page
    r.raise_for_status()
    d = r.json()
    df_reports.append(pd.DataFrame.from_records(d, coerce_float=True))

# One dataframe holding the records of every requested report
df = pd.concat(df_reports)
df.info()

In [None]:
# Request other support material of the reports and put them into dataframes
# Since multilanguage translations are available, we use the language as index

# The last URL of the distribution is the one requested for this material
url = contentUrl[-1]
r = requests.get(url)
# Fail early on an HTTP error (4xx/5xx) rather than parsing an error page
r.raise_for_status()

try:
    d = r.json()
except ValueError:
    print("Warning: not a valid Json format. Try to get rid of trailing comma.")
    # Attempt the repair only when the first parse actually failed: drop a
    # trailing comma that sits between a closing quote and a closing brace
    try:
        repaired = re.sub(r"\"\s*,\s*\}", "\" }", r.text)
        d = json.loads(repaired)
    except ValueError:
        print("Json format is still not valid.")
        # Stop here: continuing would build the table from an undefined or
        # stale `d` left over from a previous cell
        raise

# Top-level keys of the dictionary become the index of the dataframe
df_reports_translation = pd.DataFrame.from_dict(d, orient='index')
df_reports_translation.info()

In [None]:
# Common path stem shared by all output files of this distribution
filename = f'{path}/all_reports'

# Save the reports as parquet: very low file size (requires pyArrow)
df.to_parquet(f'{filename}.parquet')
# Save the reports as CSV: about x10 the size when compared with the dataframe
df.to_csv(f'{filename}.csv')

# Save the reports translation as CSV
df_reports_translation.to_csv(f'{filename}_translation.csv')