# Step0 : Imports

In [12]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd

# Step3 : Download the data

### 3.1 Download the data from figshare

In [2]:
# Ask the figshare API for the article's metadata, which includes its file list
article_id = "14096681"
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}
output_directory = "figsharerainfall/"
# A timeout keeps the cell from hanging indefinitely on a stalled connection
response = requests.get(url, headers=headers, timeout=60)
# Fail here with a clear HTTP error rather than an opaque KeyError on "files" below
response.raise_for_status()
data = response.json()
# One metadata dict per file attached to the article; the download cell reads
# each entry's "name" and "download_url"
files = data["files"]

### 3.2 Extract the zip file

In [3]:
# Download the zip archive from figshare into the output directory

In [4]:
%%time
files_to_dl = ["data.zip"]
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])

CPU times: user 3.36 s, sys: 5.56 s, total: 8.92 s
Wall time: 2min 52s


In [5]:
# Extract data into output directory

In [6]:
%%time
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory)

CPU times: user 7.07 s, sys: 716 ms, total: 7.79 s
Wall time: 7.84 s


# Step4 : Combining data CSVs

In [7]:
%%time
import pandas as pd
excluded_files = ["figsharerainfall\\observed_daily_rainfall_SYD.csv"]
files = glob.glob('figsharerainfall/*.csv')
files = list(set(files) - set(excluded_files))
df = pd.concat((pd.read_csv(file, index_col=0)
                .assign(model=re.findall("/([^_]*)", file)[0])
                for file in files)
              )
df.to_csv("figsharerainfall/combined_data.csv")

CPU times: user 8min 58s, sys: 31.6 s, total: 9min 30s
Wall time: 9min 34s


| Team Member | Operating System | RAM | Processor | Is SSD | Time taken |
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|
| Stephen    |       MacOS          |  16GB   |    Apple M2 Air      |   Yes     |   6min 16s         |
| Nate    |          MacOS        |  16GB   |    Apple M1 Pro       |   Yes     |     3min 31s       |
| Natalie    |              MacOS    |8GB|      1.4 GHz Quad-Core Intel Core i5     |Yes|        7min 8s    |
| Nikita    |    Windows              |  16GB   |   12th Gen Intel(R) Core(TM) i7-1255U | Yes |   12min 49s         |

Summary :

- A faster processor, a solid-state drive (as opposed to a hard disk drive), and more memory (RAM) are all factors that can decrease the time taken to combine multiple CSV files into one file.

# Step5 : Perform Simple EDA

### 5.1 Investigate at least two approaches to reduce memory usage while performing the EDA (e.g., value_counts)

In [8]:
# Original combine data

In [9]:
%%time
df = pd.read_csv("figsharerainfall/combined_data.csv")
print(df["lat_min"].value_counts())
print(df["lat_max"].value_counts())
print(df["lon_min"].value_counts())
print(df["lon_max"].value_counts())

-32.041885    9105987
-32.984293    9105987
-34.869110    9105987
-30.157068    4553010
-31.099476    4553010
               ...   
-33.487232     551880
-33.490981     551880
-36.281964     551880
-30.696652     551880
-30.700015     551880
Name: lat_min, Length: 86, dtype: int64
-32.041885    9105987
-32.984293    9105987
-34.869110    9105987
-29.214660    6070680
-33.000000    4690980
               ...   
-27.909065     551880
-30.700015     551880
-30.696652     551880
-33.490981     551880
-33.487232     551880
Name: lat_max, Length: 89, dtype: int64
144.375000    7589139
151.875000    7589139
148.125000    7589139
140.625000    7174959
153.125000    5794719
               ...   
144.140625     690300
142.734375     690300
141.328125     690300
143.750000     551880
153.750000     414180
Name: lon_min, Length: 78, dtype: int64
148.125000    7589139
151.875000    7589139
144.375000    7589139
154.375000    5794719
145.625000    5794719
               ...   
151.171875     690300


In [10]:
# Approach1 - Select just columns we use

In [11]:
%%time
use_cols = ['lat_min','lat_max','lon_min','lon_max']
df = pd.read_csv("figsharerainfall/combined_data.csv",usecols=use_cols)
print(df["lat_min"].value_counts())
print(df["lat_max"].value_counts())
print(df["lon_min"].value_counts())
print(df["lon_max"].value_counts())

-32.041885    9105987
-32.984293    9105987
-34.869110    9105987
-30.157068    4553010
-31.099476    4553010
               ...   
-33.487232     551880
-33.490981     551880
-36.281964     551880
-30.696652     551880
-30.700015     551880
Name: lat_min, Length: 86, dtype: int64
-32.041885    9105987
-32.984293    9105987
-34.869110    9105987
-29.214660    6070680
-33.000000    4690980
               ...   
-27.909065     551880
-30.700015     551880
-30.696652     551880
-33.490981     551880
-33.487232     551880
Name: lat_max, Length: 89, dtype: int64
144.375000    7589139
151.875000    7589139
148.125000    7589139
140.625000    7174959
153.125000    5794719
               ...   
144.140625     690300
142.734375     690300
141.328125     690300
143.750000     551880
153.750000     414180
Name: lon_min, Length: 78, dtype: int64
148.125000    7589139
151.875000    7589139
144.375000    7589139
154.375000    5794719
145.625000    5794719
               ...   
151.171875     690300


In [12]:
# Approach2 - Load data in  chunks

In [13]:
%%time
counts_lat_min = pd.Series(dtype=int)
counts_lat_max = pd.Series(dtype=int)
counts_lon_min = pd.Series(dtype=int)
counts_lon_max = pd.Series(dtype=int)
for chunk in pd.read_csv("figsharerainfall/combined_data.csv", chunksize=10_000_000):
    counts_lat_min = counts_lat_min.add(chunk["lat_min"].value_counts(), fill_value=0)
    counts_lat_max = counts_lat_max.add(chunk["lat_max"].value_counts(), fill_value=0)
    counts_lon_min = counts_lon_min.add(chunk["lon_min"].value_counts(), fill_value=0)
    counts_lon_max = counts_lon_max.add(chunk["lon_max"].value_counts(), fill_value=0)
print(counts_lat_min.astype(int))
print(counts_lat_max.astype(int))
print(counts_lon_min.astype(int))
print(counts_lon_max.astype(int))

-36.467390    1932840
-36.455696     965790
-36.420966    1242540
-36.281964     551880
-36.277805     551880
               ...   
-30.157068    4553010
-30.000000    2898000
-30.000000    2345490
-29.921967    1932840
-29.900000    1379700
Length: 86, dtype: int64
-36.000000    1379700
-35.532329    1932840
-35.100000    1379700
-35.020151    1242540
-35.000000    3725820
               ...   
-28.421053     689850
-28.354430     965790
-28.125000     965790
-27.909065     551880
-27.906064     551880
Length: 89, dtype: int64
140.62500    7174959
141.00000    1931580
141.09375    1104480
141.18750    2484540
141.25000    1517670
              ...   
152.81250    1932840
153.00000    1931580
153.12500    5794719
153.28125    1104480
153.75000     414180
Length: 78, dtype: int64
141.25000     965790
141.87500    5794719
142.03125    1104480
142.27500     965790
142.31250    2484540
              ...   
154.21875    1104480
154.37500    5794719
154.68750    1932840
155.00000    1931580


In [14]:
# Approach3 - Change data type

In [15]:
# Approach 3: compare the footprint of the coordinate columns as stored in `df`
# against the same columns cast to float32
coords = df[['lat_min','lat_max','lon_min','lon_max']]
float64_mb = coords.memory_usage().sum() / 1e6
float32_mb = coords.astype('float32', errors='ignore').memory_usage().sum() / 1e6
print(f"Memory usage with float64: {float64_mb:.2f} MB")
print(f"Memory usage with float32: {float32_mb:.2f} MB")

Memory usage with float64: 6001.33 MB
Memory usage with float32: 3000.67 MB


### 5.2 Compare run times on different machines within your team and summarize your observations.

Original data :
| Team Member | Operating System | RAM | Processor | Is SSD | Time taken |
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|
| Stephen    |   MacOS          |  16GB   |    Apple M2 Air      |   Yes     |   1min 6s         |
| Nate    |          MacOS        |  16GB   |    Apple M1 Pro       |   Yes     |    39.3s        |
| Natalie    |    MacOS    |8GB|      1.4 GHz Quad-Core Intel Core i5     |Yes|           1m 47s |
| Nikita    |    Windows              | 16GB    |  12th Gen Intel(R) Core(TM) i7-1255U  |  Yes   |            |

Approach1 - Select just columns we use :
| Team Member | Operating System | RAM | Processor | Is SSD | Time taken |
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|
| Stephen    |     MacOS          |  16GB   |    Apple M2 Air      |   Yes     |   38.8s         |            
| Nate    |          MacOS        |  16GB   |    Apple M1 Pro       |   Yes     |     23.9s       |
| Natalie    |   MacOS    |8GB|      1.4 GHz Quad-Core Intel Core i5     |Yes|1m 6s|
| Nikita    |    Windows              | 16GB    |  12th Gen Intel(R) Core(TM) i7-1255U  |  Yes   |            |

Approach2 - Load data in chunks :
| Team Member | Operating System | RAM | Processor | Is SSD | Time taken |
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|
| Stephen    |     MacOS          |  16GB   |    Apple M2 Air      |   Yes     |   57.3s|          
| Nate    |          MacOS        |  16GB   |    Apple M1 Pro       |   Yes     |     32.1s       |
| Natalie    |    MacOS    |8GB|      1.4 GHz Quad-Core Intel Core i5     |Yes|1m 11s|
| Nikita    |    Windows              | 16GB    |  12th Gen Intel(R) Core(TM) i7-1255U  |  Yes   |            |

Summary :

- In terms of run time, the approach of selecting just the columns needed was the fastest, as opposed to loading the data in chunks
- With the size of our dataset, loading the data in chunks had a similar run time to simply loading the original data in. This is most likely due to the fact that the amount of data (i.e., the number of columns and rows) is the same in approach 2 but for approach 1, we are simply selecting the columns we need and thus less data is being loaded in.

# Step6 : Perform Simple EDA in R

In [2]:
# Clear every name from the Python namespace (without prompting) so the R
# section starts from a clean slate; imports must be repeated after this
%reset -f
# Load the rpy2 IPython extension, which provides the %%R cell magic used below
%load_ext rpy2.ipython

In [3]:
# Combined rainfall CSV written in Step 4; this is the file loaded via Arrow below
filepathcsv = "figsharerainfall/combined_data.csv"
# Parquet output paths; not referenced in the cells visible here
# NOTE(review): confirm these are still needed, otherwise they can be dropped
filepathparquet = "figsharerainfall/combined_data.parquet"
filepathparquetr = "figsharerainfall/combined_data_r.parquet"

In [4]:
#!pip install rpy2_arrow
import pyarrow.dataset as ds
import pyarrow as pa
import pandas as pd
import pyarrow 
from pyarrow import csv
import rpy2_arrow.pyarrow_rarrow as pyra

In [5]:
%%time
dataset = ds.dataset(filepathcsv, format="csv")
# Converting the `pyarrow dataset` to a `pyarrow table`
table = dataset.to_table()
# Converting a `pyarrow table` to a `rarrow table`
r_table = pyra.converter.py2rpy(table)

CPU times: user 23.2 s, sys: 4.16 s, total: 27.3 s
Wall time: 25.7 s


| Team Member | Operating System | RAM | Processor | Is SSD | Time taken |
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|
| Stephen    |  MacOS          |  16GB   |    Apple M2 Air      |   Yes     |   23.9s|              
| Nate    |          MacOS        |  16GB   |    Apple M1 Pro       |   Yes     |            |
| Natalie    |    MacOS    |8GB|      1.4 GHz Quad-Core Intel Core i5     |Yes|25.7s|
| Nikita    |    Windows              | 16GB    |  12th Gen Intel(R) Core(TM) i7-1255U  |  Yes   |            |

In [21]:
%%R -i r_table
# Attach dplyr in this cell: `%>%` and collect() come from it, and on a fresh
# kernel no earlier cell has loaded it yet
suppressMessages(library(dplyr))
# Preview the first rows; collect() pulls them into R as a tibble for printing
print(head(r_table) %>% collect())

# A tibble: 6 × 7
  time                lat_min lat_max lon_min lon_max `rain (mm/day)` model     
  <dttm>                <dbl>   <dbl>   <dbl>   <dbl>           <dbl> <chr>     
1 1889-01-01 04:00:00   -36.2     -35    141.    142.        3.29e-13 ACCESS-CM2
2 1889-01-02 04:00:00   -36.2     -35    141.    142.        0        ACCESS-CM2
3 1889-01-03 04:00:00   -36.2     -35    141.    142.        0        ACCESS-CM2
4 1889-01-04 04:00:00   -36.2     -35    141.    142.        0        ACCESS-CM2
5 1889-01-05 04:00:00   -36.2     -35    141.    142.        1.05e- 2 ACCESS-CM2
6 1889-01-06 04:00:00   -36.2     -35    141.    142.        3.29e- 2 ACCESS-CM2


In [14]:
%%time
%%R -i r_table

# Count the number of rows for each unique value in the lat_min column.
# Timing is handled by %%time, so no manual Sys.time() bookkeeping is needed.
suppressMessages(library(dplyr))
result <- r_table %>% count(lat_min)
# collect() pulls the result into R as a tibble so it can be printed
print(result %>% collect())

# A tibble: 85 × 2
   lat_min       n
     <dbl>   <int>
 1   -36.2  322140
 2   -35    782040
 3   -33.8  966210
 4   -32.5  322140
 5   -31.2  322140
 6   -30   1747830
 7   -35.9  459900
 8   -34.9  459900
 9   -33.9  459900
10   -32.9  459900
# ℹ 75 more rows
# ℹ Use `print(n = ...)` to see more rows
CPU times: user 2.73 s, sys: 2.47 s, total: 5.2 s
Wall time: 1.9 s


In [34]:
%%time
%%R -i r_table

# Count the number of rows for each unique value in the lat_max column.
# Timing is handled by %%time, so no manual Sys.time() bookkeeping is needed.
suppressMessages(library(dplyr))
result <- r_table %>% count(lat_max)
# collect() pulls the result into R as a tibble so it can be printed
print(result %>% collect())

# A tibble: 88 × 2
   lat_max       n
     <dbl>   <int>
 1   -35   1241940
 2   -33.8  966210
 3   -32.5  322140
 4   -31.2  322140
 5   -30   2529870
 6   -28.8  322140
 7   -35.1  459900
 8   -34.1  459900
 9   -33.1  459900
10   -32.1  459900
# ℹ 78 more rows
# ℹ Use `print(n = ...)` to see more rows
CPU times: user 2.06 s, sys: 1.95 s, total: 4.01 s
Wall time: 1.41 s


In [59]:
%%time
%%R -i r_table

# Count the number of rows for each unique value in the lon_max column.
# Timing is handled by %%time, so no manual Sys.time() bookkeeping is needed.
suppressMessages(library(dplyr))
result <- r_table %>% count(lon_max)
# collect() pulls the result into R as a tibble so it can be printed
print(result %>% collect())

# A tibble: 80 × 2
   lon_max       n
     <dbl>   <int>
 1    142. 1103940
 2    148. 2529713
 3    152. 2529713
 4    154. 1104030
 5    144. 2529713
 6    150  1242000
 7    146. 1425960
 8    142.  321930
 9    145.  321930
10    146.  321930
# ℹ 70 more rows
# ℹ Use `print(n = ...)` to see more rows
CPU times: user 5.34 s, sys: 2.37 s, total: 7.71 s
Wall time: 2.32 s


In [60]:
%%time
%%R -i r_table

# Count the number of rows for each unique value in the lon_min column.
# Timing is handled by %%time, so no manual Sys.time() bookkeeping is needed.
suppressMessages(library(dplyr))
result <- r_table %>% count(lon_min)
# collect() pulls the result into R as a tibble so it can be printed
print(result %>% collect())

# A tibble: 79 × 2
   lon_min       n
     <dbl>   <int>
 1    141. 2391653
 2    142.  920070
 3    144. 2529713
 4    146. 1104030
 5    148. 2529713
 6    152. 2529713
 7    150   920070
 8    143.  321930
 9    144.  321930
10    145.  321930
# ℹ 69 more rows
# ℹ Use `print(n = ...)` to see more rows
CPU times: user 5.58 s, sys: 975 ms, total: 6.56 s
Wall time: 1.2 s


In [22]:
%%R -i r_table
# Attach dplyr quietly, then show R's structural summary of the table:
# one row per column with its length, class and mode
suppressMessages(library(dplyr))
summary(r_table)

              Length    Class        Mode       
time          187541589 ChunkedArray environment
lat_min       187541589 ChunkedArray environment
lat_max       187541589 ChunkedArray environment
lon_min       187541589 ChunkedArray environment
lon_max       187541589 ChunkedArray environment
rain (mm/day) 187541589 ChunkedArray environment
model         187541589 ChunkedArray environment


We decided to use Arrow exchange to transfer the data frame from Python to R because it is fast and efficient at moving data between languages. It avoids copying the data from one buffer to another, so the transfer itself uses almost no CPU compared to Parquet or Pandas exchange. The data types present in our data frame are also fully supported by Arrow exchange. In addition, dplyr and many other R packages integrate well with Arrow, which makes it the best option for working in R.

Pandas exchange is not well suited to large data files like this one, since it requires loading the entire data set into memory at once, which may take a very long time, and there is no guarantee that it would be able to complete the transfer given its limitations. Parquet files are generally more complex and harder to work with due to their structure.

**Short EDA Summary:**
- The total number of rows (or the length of each column) in the data frame is: 187541589.
- ChunkedArray is the class of each column, which means that each column is a large array that is split into chunks for more efficient processing.
- 'environment' is the mode of each column, which means that each column is stored in memory as an R environment.
- The columns are: "time" (timestamp), "lat_min", "lat_max", "lon_min", "lon_max", "rain (mm/day)" (numeric), "model" (string).
- There are 85 unique values in the `lat_min` column (the `count(lat_min)` result above has 85 rows).
- There are 88 unique values in the `lat_max` column (the `count(lat_max)` result above has 88 rows).