In [1]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd

# Downloading the data

Using Python **requests** Library

We are using article id #14096681, which contains the data of **Daily rainfall over NSW, Australia.**

In [8]:
# Setup
article_id = 14096681  
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}
output_directory = "rainfall/"

Review the files within the article:

In [13]:
response = requests.request("GET", url, headers=headers)
data = json.loads(response.text)  # this contains all the articles data, feel free to check it out
files = data["files"]             # this is just the data about the files, which is what we want
files

[{'id': 26579150,
  'name': 'daily_rainfall_2014.png',
  'size': 58863,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e'},
 {'id': 26579171,
  'name': 'environment.yml',
  'size': 192,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34'},
 {'id': 26586554,
  'name': 'README.md',
  'size': 5422,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c'},
 {'id': 26766812,
  'name': 'data.zip',
  'size': 814041183,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26766812',
  'supplied_md5': 'b517383f76e77bd03755a63a8f

In [14]:
%%time

files_to_dl = ["data.zip"]  
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])

CPU times: user 3.69 s, sys: 3.43 s, total: 7.12 s
Wall time: 1min 10s


In [15]:
%%time

files_to_dl = ["data.zip"]  
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])

CPU times: user 3.73 s, sys: 3.49 s, total: 7.22 s
Wall time: 1min 47s


In [22]:
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory)

In [23]:
%ls -ltr rainfall/

total 12223424
-rw-r--r--   1 Rada  staff  814041183 28 Mar 14:47 data.zip
-rw-r--r--   1 Rada  staff   95376895 28 Mar 14:48 MPI-ESM-1-2-HAM_daily_rainfall_NSW.csv
-rw-r--r--   1 Rada  staff   94960113 28 Mar 14:48 AWI-ESM-1-1-LR_daily_rainfall_NSW.csv
-rw-r--r--   1 Rada  staff   82474546 28 Mar 14:48 NorESM2-LM_daily_rainfall_NSW.csv
-rw-r--r--   1 Rada  staff  127613760 28 Mar 14:48 ACCESS-CM2_daily_rainfall_NSW.csv
-rw-r--r--   1 Rada  staff  232118894 28 Mar 14:48 FGOALS-f3-L_daily_rainfall_NSW.csv
-rw-r--r--   1 Rada  staff  330360682 28 Mar 14:48 CMCC-CM2-HR4_daily_rainfall_NSW.csv
-rw-r--r--   1 Rada  staff  254009247 28 Mar 14:48 MRI-ESM2-0_daily_rainfall_NSW.csv
-rw-r--r--   1 Rada  staff  235661418 28 Mar 14:48 GFDL-CM4_daily_rainfall_NSW.csv
-rw-r--r--   1 Rada  staff  294260911 28 Mar 14:48 BCC-CSM2-MR_daily_rainfall_NSW.csv
-rw-r--r--   1 Rada  staff  295768615 28 Mar 14:48 EC-Earth3-Veg-LR_daily_rainfall_NSW.csv
-rw-r--r--   1 Rada  staff  328852379 28 Mar 14:48 CMCC-ES

# Combining data CSVs

- Combine data CSVs into a single CSV using pandas.

- When combining the CSV files, add an extra column called "model" that identifies the model. Tip 1: you can get this column populated from the file name, eg: for file name "SAM0-UNICON_daily_rainfall_NSW.csv", the model name is SAM0-UNICON Tip 2: Remember how we added year when we combined airline CSVs. Tip 3: You can use regex generator.

_Note: There is a file called observed_daily_rainfall_SYD.csv in the data folder that you downloaded. Make sure you exclude this file (programmatically or just take out that file from folder) before you combine CSVs. We will use this file in our next milestone._

- Compare run times on different machines within your team and summarize your observations.
Warning: Some of you might not be able to do it on your laptop. It's fine if you're unable to do it. Just make sure you discuss the reasons why you might not have been able to run this on your laptop.

Let's first view the data and the columns:

In [33]:
%%time

df_1 = pd.read_csv(output_directory+"/MPI-ESM-1-2-HAM_daily_rainfall_NSW.csv")
df_2 = pd.read_csv(output_directory+"/CMCC-CM2-SR5_daily_rainfall_NSW.csv")
df_3 = pd.read_csv(output_directory+"/SAM0-UNICON_daily_rainfall_NSW.csv")

CPU times: user 7.54 s, sys: 919 ms, total: 8.46 s
Wall time: 8.6 s


Even loading three of the individual files is taking a little time.

In [34]:
df_1.head(2)

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day)
0,1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.244226e-13
1,1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.217326e-13


In [35]:
df_2.head(2)

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day)
0,1889-01-01 12:00:00,-35.811518,-34.86911,140.625,141.875,0.000424
1,1889-01-02 12:00:00,-35.811518,-34.86911,140.625,141.875,0.006158


In [37]:
df_3.head(2)

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day)
0,1889-01-01 12:00:00,-35.811518,-34.86911,140.625,141.875,3.04565e-13
1,1889-01-02 12:00:00,-35.811518,-34.86911,140.625,141.875,0.0003572392


In [36]:
df_3.tail(2)

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day)
3541151,2014-12-30 12:00:00,-30.157068,-29.21466,153.125,154.375,8.541592
3541152,2014-12-31 12:00:00,-30.157068,-29.21466,153.125,154.375,68.117489


In [52]:
%%time

files = glob.glob('rainfall/*NSW.csv')
df = pd.concat((pd.read_csv(file, index_col=0)
                .assign(model=re.findall(r'/([^_]*)', file)[0])
                for file in files)
              )
df.to_csv("rainfall/combined_data.csv")

CPU times: user 7min 51s, sys: 23.2 s, total: 8min 14s
Wall time: 8min 26s


Wow, this felt like an eternity!

Let's take a look at the combined file, see if head and tail are as we expect them to be:

In [31]:
df.head()

Unnamed: 0_level_0,lat_min,lat_max,lon_min,lon_max,rain (mm/day),year
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.244226e-13,MPI-ESM-1-2-HAM_daily_rainfall_NSW
1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.217326e-13,MPI-ESM-1-2-HAM_daily_rainfall_NSW
1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.498125e-13,MPI-ESM-1-2-HAM_daily_rainfall_NSW
1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.251282e-13,MPI-ESM-1-2-HAM_daily_rainfall_NSW
1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.270161e-13,MPI-ESM-1-2-HAM_daily_rainfall_NSW


In [32]:
df.tail()

Unnamed: 0_level_0,lat_min,lat_max,lon_min,lon_max,rain (mm/day),year
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-12-27 12:00:00,-30.157068,-29.21466,153.125,154.375,6.689683,SAM0-UNICON_daily_rainfall_NSW
2014-12-28 12:00:00,-30.157068,-29.21466,153.125,154.375,7.862555,SAM0-UNICON_daily_rainfall_NSW
2014-12-29 12:00:00,-30.157068,-29.21466,153.125,154.375,10.005026,SAM0-UNICON_daily_rainfall_NSW
2014-12-30 12:00:00,-30.157068,-29.21466,153.125,154.375,8.541592,SAM0-UNICON_daily_rainfall_NSW
2014-12-31 12:00:00,-30.157068,-29.21466,153.125,154.375,68.117489,SAM0-UNICON_daily_rainfall_NSW


# Load the combined CSV to memory and perform a simple EDA

1. Investigate at least two of the following approaches to reduce memory usage while performing the EDA (e.g., value_counts).

- Changing dtype of your data
- Load just columns what we want
- Loading in chunks
- Dask

2. Compare run times on different machines within your team and summarize your observations.

### Changing dtypes of data:

- We will attempt to change time column from datetime to date
- We will attempt to read the numerical columns using float32 format

Memory comparison for format changes adapted from Lecture notes:

In [None]:
df.index = pd.to_datetime(df.index).dt.date

AttributeError: 'DatetimeIndex' object has no attribute 'dt'

In [40]:
print(f"Memory usage with float64: {df[['lat_min','lat_max', 'lon_min', 'lon_max', 'rain (mm/day)']].memory_usage().sum() / 1e6:.2f} MB")
print(f"Memory usage with float32: {df[['lat_min','lat_max', 'lon_min', 'lon_max', 'rain (mm/day)']].astype('float32', errors='ignore').memory_usage().sum() / 1e6:.2f} MB")

Memory usage with float64: 6001.33 MB
Memory usage with float32: 3500.78 MB


### Dask:

- We will attempt to read dataframe using dask

In [50]:
import dask.dataframe as dd
from memory_profiler import memory_usage
%load_ext memory_profiler


In [51]:
%%time
%%memit

dask_df = dd.read_csv("rainfall/combined_data.csv")
print(dask_df["model"].value_counts().compute())

KeyError: 'model'

### Loading in Chunks:

- We will attempt to read dataframe in chunks

In [None]:
%%time
counts = pd.Series(dtype=int)
for chunk in pd.read_csv("rainfall/combined_data.csv", chunksize=10_000_000):
    counts = counts.add(chunk[(chunk.model ==2004) & (chunk.ArrDelay >10)]["UniqueCarrier"].value_counts(), fill_value=0)
print(counts.astype(int))

### Selecting columns:

For this problem, it doesn't make sense to only select certain columns. Each column seems to be significant to the data lat_min, lat_max, lon_min, lon_max specify area, ranfall specified amount precipitation, and time the time of observation. It seems that regardless what we want to do with the data, all columns are significant. Once imported, we could maybe combine some, but can not drop them during importing.

# Perform a simple EDA in R