# DSCI: 525 Milestone 1 - Group 8

### Imports

In [1]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd
from memory_profiler import memory_usage

In [2]:
%load_ext rpy2.ipython
%load_ext memory_profiler

### Downloading the data

In [3]:
# Santiago

In [4]:
# Necessary metadata
article_id = 14096681  # this is the unique identifier of the article on figshare
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}
output_directory = "figshareairline/"

In [5]:
response = requests.request("GET", url, headers=headers)
data = json.loads(response.text)  # this contains all the articles data, feel free to check it out
files = data["files"]             # this is just the data about the files, which is what we want
files

[{'is_link_only': False,
  'name': 'daily_rainfall_2014.png',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'id': 26579150,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'size': 58863},
 {'is_link_only': False,
  'name': 'environment.yml',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'id': 26579171,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'size': 192},
 {'is_link_only': False,
  'name': 'README.md',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'id': 26586554,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'size': 5422},
 {'is_link_only': False,
  'name': 'data.zip',
  'supplied_md5': 'b517383f76e77bd03755a63a8ff83ee9',
  'computed_md5': 'b517383f76e77bd03755a63a8ff83ee9',
  'id': 26766812,
  'download_url': 'https://

### Unzipping Data

In [6]:
%%time
files_to_dl = ["data.zip"]  # feel free to add other files here
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])

CPU times: user 2.54 s, sys: 2.28 s, total: 4.82 s
Wall time: 1min 11s


In [7]:
%%time
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory)

CPU times: user 18.2 s, sys: 2.6 s, total: 20.8 s
Wall time: 21.7 s


### Combining data CSVs

In [8]:
# Santiago

In [9]:
df = pd.read_csv("./figshareairline/ACCESS-CM2_daily_rainfall_NSW.csv")
df

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day)
0,1889-01-01 12:00:00,-36.25,-35.00,140.625,142.50,3.293256e-13
1,1889-01-02 12:00:00,-36.25,-35.00,140.625,142.50,0.000000e+00
2,1889-01-03 12:00:00,-36.25,-35.00,140.625,142.50,0.000000e+00
3,1889-01-04 12:00:00,-36.25,-35.00,140.625,142.50,0.000000e+00
4,1889-01-05 12:00:00,-36.25,-35.00,140.625,142.50,1.047658e-02
...,...,...,...,...,...,...
1932835,2014-12-27 12:00:00,-30.00,-28.75,151.875,153.75,2.951144e-02
1932836,2014-12-28 12:00:00,-30.00,-28.75,151.875,153.75,2.257118e-01
1932837,2014-12-29 12:00:00,-30.00,-28.75,151.875,153.75,1.204670e-01
1932838,2014-12-30 12:00:00,-30.00,-28.75,151.875,153.75,2.632404e-02


In [10]:
%%time
%memit
# Shows time that regular python takes to merge file
# Join all data together
## here we are using a normal python way of merging the data 

files = glob.glob('figshareairline/*.csv')
df = pd.concat((pd.read_csv(file, index_col=0)
                .assign(model=re.findall(r'/([^_]*)', file)[0])
                for file in files)
              )
df.to_csv("figshareairline/combined_data.csv")

peak memory: 391.25 MiB, increment: 0.19 MiB
CPU times: user 10min 50s, sys: 46.9 s, total: 11min 36s
Wall time: 12min 1s


In [11]:
df_combined = pd.read_csv("./figshareairline/combined_data.csv")
df_combined

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
0,1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.244226e-13,MPI-ESM-1-2-HAM
1,1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.217326e-13,MPI-ESM-1-2-HAM
2,1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.498125e-13,MPI-ESM-1-2-HAM
3,1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.251282e-13,MPI-ESM-1-2-HAM
4,1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.270161e-13,MPI-ESM-1-2-HAM
...,...,...,...,...,...,...,...
126894313,2014-12-27 12:00:00,-30.157068,-29.214660,153.1250,154.3750,6.689683e+00,SAM0-UNICON
126894314,2014-12-28 12:00:00,-30.157068,-29.214660,153.1250,154.3750,7.862555e+00,SAM0-UNICON
126894315,2014-12-29 12:00:00,-30.157068,-29.214660,153.1250,154.3750,1.000503e+01,SAM0-UNICON
126894316,2014-12-30 12:00:00,-30.157068,-29.214660,153.1250,154.3750,8.541592e+00,SAM0-UNICON


In [12]:
df_combined["model"].unique()

array(['MPI-ESM-1-2-HAM', 'AWI-ESM-1-1-LR', 'NorESM2-LM', 'ACCESS-CM2',
       'FGOALS-f3-L', 'CMCC-CM2-HR4', 'MRI-ESM2-0', 'GFDL-CM4',
       'BCC-CSM2-MR', 'EC-Earth3-Veg-LR', 'CMCC-ESM2', 'NESM3',
       'MPI-ESM1-2-LR', 'ACCESS-ESM1-5', 'FGOALS-g3', 'INM-CM4-8',
       'MPI-ESM1-2-HR', 'TaiESM1', 'NorESM2-MM', 'CMCC-CM2-SR5',
       'combined', 'observed', 'KIOST-ESM', 'INM-CM5-0', 'MIROC6',
       'BCC-ESM1', 'GFDL-ESM4', 'CanESM5', 'SAM0-UNICON'], dtype=object)

In [13]:
%%sh
du -sh figshareairline/combined_data.csv

 11G	figshareairline/combined_data.csv


### Load the combined CSV to memory and perform a simple EDA

In [14]:
# Daniel and Rui

In [15]:
%%time
%%memit
#simple pandas - This is how we do normally ,which means we are loading the entire data to the memory
df = pd.read_csv("figshareairline/combined_data.csv")
print(df["model"].value_counts())

combined            64380455
MPI-ESM1-2-HR        5154240
NorESM2-MM           3541230
CMCC-CM2-HR4         3541230
CMCC-ESM2            3541230
TaiESM1              3541230
CMCC-CM2-SR5         3541230
SAM0-UNICON          3541153
FGOALS-f3-L          3219300
GFDL-CM4             3219300
GFDL-ESM4            3219300
EC-Earth3-Veg-LR     3037320
MRI-ESM2-0           3037320
BCC-CSM2-MR          3035340
MIROC6               2070900
ACCESS-CM2           1932840
ACCESS-ESM1-5        1610700
INM-CM4-8            1609650
INM-CM5-0            1609650
KIOST-ESM            1287720
FGOALS-g3            1287720
MPI-ESM1-2-LR         966420
MPI-ESM-1-2-HAM       966420
AWI-ESM-1-1-LR        966420
NESM3                 966420
NorESM2-LM            919800
BCC-ESM1              551880
CanESM5               551880
observed               46020
Name: model, dtype: int64
peak memory: 8913.19 MiB, increment: 4372.68 MiB
CPU times: user 2min 3s, sys: 49.4 s, total: 2min 53s
Wall time: 3min 24s


In [16]:
df.head()

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
0,1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.244226e-13,MPI-ESM-1-2-HAM
1,1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.217326e-13,MPI-ESM-1-2-HAM
2,1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.498125e-13,MPI-ESM-1-2-HAM
3,1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.251282e-13,MPI-ESM-1-2-HAM
4,1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.270161e-13,MPI-ESM-1-2-HAM


In [17]:
#checking datatypes for columns
df.dtypes

time              object
lat_min          float64
lat_max          float64
lon_min          float64
lon_max          float64
rain (mm/day)    float64
model             object
dtype: object

### Perform a simple EDA in R

In [6]:
# Rachel and Rui

In [18]:
# feather
import pyarrow.feather as feather
import pyarrow.dataset as ds

In [19]:
%%time
%%memit
dataset = ds.dataset("figshareairline/combined_data.csv", format="csv")
## this is of arrow table format
table = dataset.to_table()

peak memory: 5365.77 MiB, increment: 562.96 MiB
CPU times: user 40.6 s, sys: 36.1 s, total: 1min 16s
Wall time: 1min 9s


In [24]:
%%time
# writing in feather format
feather.write_feather(table, 'figshareairline/combined_data.feather')

CPU times: user 11.3 s, sys: 44.4 s, total: 55.7 s
Wall time: 40.2 s


In [25]:
%%time
%%R
library(arrow)
start_time <- Sys.time()
r_table <- arrow::read_feather("figshareairline/combined_data.feather")
print(class(r_table))
library(dplyr)
result <- r_table %>% count(model) # showing the different counts of the models 
end_time <- Sys.time()
print(result)
print(end_time - start_time)

[1] "tbl_df"     "tbl"        "data.frame"
[90m# A tibble: 29 x 2[39m
   model                 n
   [3m[90m<chr>[39m[23m             [3m[90m<int>[39m[23m
[90m 1[39m ACCESS-CM2      1[4m9[24m[4m3[24m[4m2[24m840
[90m 2[39m ACCESS-ESM1-5   1[4m6[24m[4m1[24m[4m0[24m700
[90m 3[39m AWI-ESM-1-1-LR   [4m9[24m[4m6[24m[4m6[24m420
[90m 4[39m BCC-CSM2-MR     3[4m0[24m[4m3[24m[4m5[24m340
[90m 5[39m BCC-ESM1         [4m5[24m[4m5[24m[4m1[24m880
[90m 6[39m CanESM5          [4m5[24m[4m5[24m[4m1[24m880
[90m 7[39m CMCC-CM2-HR4    3[4m5[24m[4m4[24m[4m1[24m230
[90m 8[39m CMCC-CM2-SR5    3[4m5[24m[4m4[24m[4m1[24m230
[90m 9[39m CMCC-ESM2       3[4m5[24m[4m4[24m[4m1[24m230
[90m10[39m combined       64[4m3[24m[4m8[24m[4m0[24m455
[90m# … with 19 more rows[39m
Time difference of 1.392174 mins
CPU times: user 23 s, sys: 1min 12s, total: 1min 35s
Wall time: 1min 24s


In [40]:
%%R
result <- r_table %>% count(time) # showing the different counts of the time
print(result)

[90m# A tibble: 92,040 x 2[39m
   time                    n
   [3m[90m<dttm>[39m[23m              [3m[90m<int>[39m[23m
[90m 1[39m 1888-12-31 [90m16:00:00[39m    58
[90m 2[39m 1889-01-01 [90m04:00:00[39m  [4m2[24m701
[90m 3[39m 1889-01-01 [90m16:00:00[39m    58
[90m 4[39m 1889-01-02 [90m04:00:00[39m  [4m2[24m701
[90m 5[39m 1889-01-02 [90m16:00:00[39m    58
[90m 6[39m 1889-01-03 [90m04:00:00[39m  [4m2[24m701
[90m 7[39m 1889-01-03 [90m16:00:00[39m    58
[90m 8[39m 1889-01-04 [90m04:00:00[39m  [4m2[24m701
[90m 9[39m 1889-01-04 [90m16:00:00[39m    58
[90m10[39m 1889-01-05 [90m04:00:00[39m  [4m2[24m701
[90m# … with 92,030 more rows[39m


In [None]:
# stuff below here is to show the mean and mode and median of the different numeric columns

In [41]:
%%R
library(tidyr)

In [42]:
%%R
r_table_d <- r_table %>% drop_na() # drop NA values

In [47]:
%%R
r_table_d <- r_table_d %>% rename(rain_mmperday = `rain (mm/day)`) # rename the column for rain

In [48]:
%%R
Columns <- c("lat_min", "lat_max", "lon_min", "lon_max", "rain (mm/perday)")
Mean <- c(mean(r_table_d$lat_min), mean(r_table_d$lat_max), mean(r_table_d$lon_min), mean(r_table_d$lon_max), mean(r_table_d$rain_mmperday))
Mode <- c(mode(r_table_d$lat_min), mode(r_table_d$lat_max), mode(r_table_d$lon_min), mode(r_table_d$lon_max), mode(r_table_d$rain_mmperday))
Median <- c(median(r_table_d$lat_min), median(r_table_d$lat_max), median(r_table_d$lon_min), median(r_table_d$lon_max), median(r_table_d$rain_mmperday))

result <- data.frame(Columns, Mean, Mode, Median)
print(result)

           Columns       Mean    Mode       Median
1          lat_min -33.113129 numeric -33.00000000
2          lat_max -31.919741 numeric -32.00000000
3          lon_min 146.907484 numeric 146.87500000
4          lon_max 148.297044 numeric 148.12500000
5 rain (mm/perday)   1.899024 numeric   0.06020046


In [None]:
# could format the table a bit nicer and potentially round values if we wanted to ^