In [1]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd
from memory_profiler import memory_usage
import pyarrow.dataset as ds
import pyarrow as pa
import pyarrow.parquet as pq
import rpy2.rinterface
import rpy2_arrow.pyarrow_rarrow as pyra
import pyarrow.feather as feather
import matplotlib as plt

In [2]:
# Load the IPython extensions this notebook relies on: rpy2 provides the %%R
# cell magic and memory_profiler provides the %%memit cell magic.
%load_ext rpy2.ipython
%load_ext memory_profiler



3. Downloading the data
rubric={correctness:10}

Download the data from figshare to your local computer using the figshare API (you can use the requests library).
Extract the zip file programmatically as well, similar to how we did it in class.
You could download and unzip the data manually, but since we learned about APIs, we can do it reproducibly with the requests library, as we did in class.

There are 5 files in the figshare repo. The one we want is: data.zip

In [3]:
# Settings for talking to the figshare API and storing what we download
output_directory = "aus_rainfall/"  # local folder that receives the downloaded files
headers = {"Content-Type": "application/json"}
article_id = 14096681  # figshare's unique identifier for the article
url = f"https://api.figshare.com/v2/articles/{article_id}"

In [4]:
# Ask the figshare API for the article's metadata.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() reports an HTTP error here instead of letting it show up
# later as a confusing KeyError on "files".
response = requests.get(url, headers=headers, timeout=60)
response.raise_for_status()
data = response.json()  # the whole article record returned by the API
files = data["files"]   # one metadata dict per file (name, download_url, size, ...)
files

[{'is_link_only': False,
  'name': 'daily_rainfall_2014.png',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'id': 26579150,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'size': 58863},
 {'is_link_only': False,
  'name': 'environment.yml',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'id': 26579171,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'size': 192},
 {'is_link_only': False,
  'name': 'README.md',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'id': 26586554,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'size': 5422},
 {'is_link_only': False,
  'name': 'data.zip',
  'supplied_md5': 'b517383f76e77bd03755a63a8ff83ee9',
  'computed_md5': 'b517383f76e77bd03755a63a8ff83ee9',
  'id': 26766812,
  'download_url': 'https://

In [5]:
%%time
files_to_dl = ["data.zip"]  # feel free to add other files here
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])

Wall time: 45 s


In [6]:
%%time
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory)

Wall time: 37.2 s


In [8]:
# Combine the per-model CSVs into one DataFrame tagged with the model name, then
# save it so that later cells can read aus_rainfall/combined_data.csv.
# NOTE: `files` is rebound here to a list of CSV paths.
# NOTE: every file is held in memory at once while concatenating.
files = [
    path
    for path in sorted(glob.glob('aus_rainfall/*.csv'))  # sorted: deterministic row order
    # Skip the output of a previous run; otherwise re-running this cell would
    # fold combined_data.csv back into the combined data.
    if os.path.basename(path) != 'combined_data.csv'
]
if not files:
    raise FileNotFoundError("No CSV files found in aus_rainfall/ - run the download and extract cells first.")
combined_data = pd.concat(
    # The model name is taken as the part of the file name before "_daily"
    # (presumably files are named "<model>_daily_...csv" - confirm); underscores
    # inside a model name are preserved.
    pd.read_csv(path, index_col=0).assign(model=os.path.basename(path).split('_daily')[0])
    for path in files
)
# The first CSV column was used as the index, so it is written back out as the
# first column of the combined file.
combined_data.to_csv('aus_rainfall/combined_data.csv')

MemoryError: Unable to allocate 1.00 MiB for an array with shape (131072,) and data type int64

In [None]:
%%time
%%memit
# Baseline: read the whole combined CSV into memory in one go, then print the
# number of rows per model. %%time and %%memit report the time and memory cost.
combined_data = pd.read_csv("aus_rainfall/combined_data.csv")
print(combined_data["model"].value_counts())

In [None]:
%%time
%%memit
counts = pd.Series(dtype=int)
for chunk in pd.read_csv("aus_rainfall/combined_data.csv", chunksize=10_000_000):
    counts = counts.add(chunk["model"].value_counts(), fill_value=0)
print(counts)

The following cells were adapted from Yiki's notebook.

In [None]:
%%time
%%memit

# Read every column of the combined CSV into `df` and preview the first five rows.
df = pd.read_csv("aus_rainfall/combined_data.csv")
df.head(5)

In [None]:
%%time
%%memit

usecols =  ['time', 'rain (mm/day)', 'model']
#simple pandas - This is how we do normally ,which means we are loading the entire data to the memory
climate_data = pd.read_csv("aus_rainfall/combined_data.csv", usecols= usecols)
climate_data.head(5)

In [None]:
!pip install matplotlib 

In [None]:
%%time
%%memit
climate_data['time'] = pd.to_datetime(climate_data['time'])
climate_data[['time','rain (mm/day)']].plot.scatter(x = 'time', y = 'rain (mm/day)', alpha = .01)

Checking for seasonality in our data

In [None]:
%%time
%%memit
climate_data['time'] = pd.to_datetime(climate_data['time'])
climate_data['month'] = climate_data.time.dt.month
climate_data.groupby("month").mean('rainfall (mm/day)')[['rain (mm/day)']].plot()

In [None]:
%%R
# Attach arrow, tidyverse and lubridate in the R session
library(arrow)
library(tidyverse)
library(lubridate)

In [None]:
%%R -i combined_data
# Hexbin plot of daily rainfall against time, with a wall-clock timing.
# NOTE(review): geom_hex() needs a continuous x aesthetic, so this assumes
# `time` arrives from pandas as a date-time column - confirm.
start_time <- Sys.time()
fig <- ggplot(data = combined_data, aes(x = time, y = `rain (mm/day)`)) +
  geom_hex() +
  ggtitle('Most Days in Australia Have Very Little Rainfall')
# A ggplot object is only rendered when it is printed, so the clock is stopped
# after print(fig); stopping it earlier would time just building the plot object.
print(fig)
end_time <- Sys.time()
print(end_time - start_time)

In [None]:
%%R
# Bar chart of the mean daily rainfall per calendar month, with a wall-clock timing.
start_time <- Sys.time()
fig <- combined_data %>%
  mutate(month_of_year = month(time, label = TRUE)) %>%
  group_by(month_of_year) %>%
  summarise(monthly_rainfall = mean(`rain (mm/day)`)) %>%
  ggplot(aes(x = month_of_year, y = monthly_rainfall)) +
    geom_col() +
    ggtitle('On Average, Rainfall Seems to Be Lowest in Australia \n During the Austral Autumn and Winter')
# The figure is rendered by print(fig), so the clock is stopped afterwards to
# include rendering in the reported time.
print(fig)
end_time <- Sys.time()
print(end_time - start_time)

In [None]:
%%R
# Time how long it takes to compute and print summary() for the combined data
start_time <- Sys.time()
combined_data %>% summary() %>% print()
end_time <- Sys.time()
print(end_time - start_time)

In [None]:
%%R
# Line plot of the mean daily rainfall per year, one line per model, with a
# wall-clock timing that includes rendering.
start_time <- Sys.time()
fig <- combined_data %>%
  mutate(year = year(time)) %>%
  group_by(year, model) %>%
  # .groups = "drop" returns an ungrouped result and silences the regrouping
  # message that summarise() prints after a multi-key group_by().
  summarise(rainfall = mean(`rain (mm/day)`), .groups = "drop") %>%
  ggplot(aes(x = year, y = rainfall, color = model)) +
    geom_line() +
    ggtitle('Average Yearly Rainfall Differs Quite a Bit Between Models')
print(fig)
end_time <- Sys.time()
print(end_time - start_time)