# Milestone 1
_Group 5_

_Authors: Vignesh, Dustin, Aidan, Javairia_

## Section 1: Download the Data

In [None]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd
from memory_profiler import memory_usage

article_id = 14096681  # this is the unique identifier of the article on figshare
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}
output_directory = "../data/"

# Ask the figshare API for the article metadata. Fail loudly on an HTTP error
# (and never hang forever) instead of hitting a confusing KeyError on
# data["files"] further down.
response = requests.get(url, headers=headers, timeout=60)
response.raise_for_status()
data = json.loads(response.text)  # this contains all the articles data, feel free to check it out
files = data["files"]             # this is just the data about the files, which is what we want

files_to_dl = ["data.zip"]  # feel free to add other files here

# Make sure every requested file is actually listed by the API; otherwise the
# loop below would silently skip it and the unzip step would fail later.
missing_files = set(files_to_dl) - {entry["name"] for entry in files}
if missing_files:
    raise FileNotFoundError(f"Files not listed for article {article_id}: {sorted(missing_files)}")

# Create the target directory once, before any download starts.
os.makedirs(output_directory, exist_ok=True)
for file in files:
    if file["name"] in files_to_dl:
        print(file['name'])
        urlretrieve(file["download_url"], os.path.join(output_directory, file["name"]))

# Unpack the downloaded archive into the data directory.
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory)

data.zip


## Section 2: Combining the Data with Dask 

In [None]:
import dask.dataframe as dd

In [None]:
# Select column names
use_cols = ['time', 'lat_min', 'lat_max', 'lon_min', 'lon_max', 'rain (mm/day)']

# Location of the combined output file
combined_file = "../data/combined_NSW.csv"

# Collect the input files. The output file also matches the "*NSW.csv" pattern,
# so it is excluded explicitly; otherwise re-running this cell would feed the
# previous combined output back into the new one.
all_files = sorted(
    csv_path
    for csv_path in glob.glob("../data/*NSW.csv")
    if os.path.basename(csv_path) != os.path.basename(combined_file)
)
if not all_files:
    raise FileNotFoundError("No input files matching ../data/*NSW.csv were found")

# Combine all files
ddf = dd.read_csv(all_files, assume_missing=True, usecols=use_cols, include_path_column=True)

# Create model column: the part of the file name before its first underscore
# (e.g. "ACCESS-CM2" for "ACCESS-CM2_daily_rainfall_NSW.csv"). Working from the
# file name rather than a fixed "/" position keeps this independent of how
# deep the data directory sits on the current machine.
ddf['model'] = ddf['path'].str.extract(r"(?:^|[/\\])([^/\\_]+)_[^/\\]*$", expand=False)

# Drop path column (drop returns a new frame, so the result must be kept)
ddf = ddf.drop(['path'], axis=1)

# Write combined data to single file
ddf.to_csv(combined_file, single_file=True)

## Section 3: Loading the combined CSV to memory

In [None]:
import numpy as np
import matplotlib.pyplot as plt

%load_ext rpy2.ipython
%load_ext memory_profiler

In [None]:
!pip install matplotlib

### Loading in Chunks

In [None]:
def get_counts(column, file = "../data/ACCESS-CM2_daily_rainfall_NSW.csv", chunksize = 10_000):
    """Count how often each value occurs in one column of a CSV, reading in chunks.

    The file is read `chunksize` rows at a time, so only one chunk is held in
    memory at once; the per-chunk value counts are summed into a running total.

    Parameters
    ----------
    column : str
        Name of the column whose values are counted.
    file : str
        Path of the CSV file to read.
    chunksize : int
        Number of rows read per chunk (defaults to 10,000).

    Returns
    -------
    pandas.Series
        Total count per distinct value, indexed by value.
    """
    counts = pd.Series(dtype=int)

    for chunk in pd.read_csv(file, chunksize=chunksize):
        # fill_value=0 lets values that are missing on either side of the
        # addition be treated as a count of zero instead of producing NaN.
        counts = counts.add(chunk[column].value_counts(), fill_value=0)

    return counts

In [None]:
# Benchmark the chunked reader on the 'lat_max' column. The -o flag makes each
# magic return its result object, which is stored for the comparison table.
chunk_mem = %memit -o get_counts('lat_max')
chunk_time = %timeit -o get_counts('lat_max')

### Dask

In [None]:
# Benchmark Dask: read the CSV and compute the value counts of 'lat_max'.
# The results are kept (-o) for the comparison table.
dask_mem = %memit -o dd.read_csv("../data/ACCESS-CM2_daily_rainfall_NSW.csv")['lat_max'].value_counts().compute()
dask_time = %timeit -o dd.read_csv("../data/ACCESS-CM2_daily_rainfall_NSW.csv")['lat_max'].value_counts().compute()

### Loading only columns of interest

In [None]:
# Benchmark pandas when only the 'lat_max' column is parsed (usecols).
# The results are kept (-o) for the comparison table.
col_subset_mem = %memit -o pd.read_csv("../data/ACCESS-CM2_daily_rainfall_NSW.csv", usecols=['lat_max'])['lat_max'].value_counts()
col_subset_time = %timeit -o pd.read_csv("../data/ACCESS-CM2_daily_rainfall_NSW.csv", usecols=['lat_max'])['lat_max'].value_counts()

### Loading with `low_memory=True`

In [None]:
# Benchmark the same single-column read with low_memory=True passed explicitly.
# NOTE(review): low_memory=True is the documented default of pd.read_csv, so
# this should behave like the column-subsetting run above; confirm that this
# is the comparison that was intended.
low_mem = %memit -o pd.read_csv("../data/ACCESS-CM2_daily_rainfall_NSW.csv", usecols=['lat_max'],low_memory=True)['lat_max'].value_counts()
low_time = %timeit -o pd.read_csv("../data/ACCESS-CM2_daily_rainfall_NSW.csv", usecols=['lat_max'],low_memory=True)['lat_max'].value_counts()

## Comparison

In [None]:
# Summary table with one row per loading method.
#
# Time: mean seconds per single execution. TimeitResult.average divides each
# run by the number of loops %timeit chose for it, so methods stay comparable
# even when %timeit picks a different loop count for each (all_runs holds the
# total time of every repeat, i.e. all loops together).
# Memory: first entry of the %memit result's mem_usage list, unchanged.
# NOTE(review): this looks like the peak process memory rather than the
# increment over the baseline; confirm that is the intended measure.
analysis = pd.DataFrame({
    "Method": ["Chunking", "Dask", "Subsetting Columns", "Low Memory"],
    "Time": [chunk_time.average, dask_time.average, col_subset_time.average, low_time.average],
    "Memory": [chunk_mem.mem_usage[0], dask_mem.mem_usage[0], col_subset_mem.mem_usage[0], low_mem.mem_usage[0]]
})

analysis

In [None]:
# Scatter plot of time against memory, one point per method.
fig, ax = plt.subplots(figsize=(8,6))

# Colours are driven by the categorical codes, which follow the sorted category
# order, not the row order of the table.
method_categories = pd.Categorical(analysis.Method)
sc = ax.scatter(analysis.Time, analysis.Memory, c = method_categories.codes, cmap='Dark2')

# legend_elements() yields one handle per distinct code in ascending order, so
# the labels must come from the categories (same order) and not from
# analysis.Method, otherwise methods end up attached to the wrong colour.
legend_handles, _ = sc.legend_elements()
ax.legend(legend_handles, method_categories.categories, title="Method")

ax.set_xlabel("Time (s)")
ax.set_ylabel("Memory (MiB)")
ax.set_title("Time vs. memory by loading method")
plt.show()

## Discussion

From the experiments, the slowest method was chunking. This makes sense, since the file has to be read in many successive iterations to get through the entire data set. However, the trade-off is memory: chunking used the least memory of all the methods tested.

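The fastest method was loading only the column of interest, which was even faster than using Dask. Passing `low_memory=True` did not change the memory usage by much; this is expected, because `low_memory=True` is already the default for `pd.read_csv`, so that run is effectively a repeat of the column-subsetting run. If we were to pick one of these methods, it would be the column sub-setting method.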