# Step0 : Imports

In [1]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd

# Step3 : Download the data

### 3.1 Download the data from figshare

In [2]:
# Trigger request to download data
article_id = "14096681"
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}
output_directory = "figsharerainfall/"
response = requests.request("GET", url, headers=headers)
data = json.loads(response.text)  
files = data["files"]            

### 3.2 Extract the zip file

In [5]:
# Create zip file with downloaded data

In [4]:
%%time
files_to_dl = ["data.zip"]
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])

CPU times: user 3.58 s, sys: 11.5 s, total: 15.1 s
Wall time: 2min 10s


In [6]:
# Extract data into output directory

In [7]:
%%time
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory)

CPU times: user 7.65 s, sys: 1.23 s, total: 8.88 s
Wall time: 9.29 s


# Step4 : Combining data CSVs

In [8]:
%%time
import pandas as pd
excluded_files = ["figsharerainfall\\observed_daily_rainfall_SYD.csv"]
files = glob.glob('figsharerainfall/*.csv')
files = list(set(files) - set(excluded_files))
df = pd.concat((pd.read_csv(file, index_col=0)
                .assign(model=re.findall("/([^_]*)", file)[0])
                for file in files)
              )
df.to_csv("figsharerainfall/combined_data.csv")

CPU times: user 3min 17s, sys: 11.8 s, total: 3min 29s
Wall time: 3min 31s


| Team Member | Operating System | RAM | Processor | Is SSD | Time taken |
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|
| Stephen    |                  |     |           |        |            |
| Nate    |          MacOS        |  16GB   |    Apple M1 Pro       |   Yes     |     3min 31s       |
| Natalie    |                  |     |           |        |            |
| Nikita    |    Windows              |  16GB   |   12th Gen Intel(R) Core(TM) i7-1255U | Yes |   12min 49s         |

# Step5 : Perform Simple EDA

### 5.1 Investigate at least two approaches to reduce memory usage while performing the EDA (e.g., value_counts.

In [9]:
# Original combine data

In [18]:
%%time
df = pd.read_csv("figsharerainfall/combined_data.csv")
print(df["lat_min"].value_counts())
print(df["lat_max"].value_counts())
print(df["lon_min"].value_counts())
print(df["lon_max"].value_counts())

-34.869110    3035329
-32.984293    3035329
-32.041885    3035329
-35.811518    1517670
-33.926702    1517670
               ...   
-36.281964     183960
-33.490981     183960
-30.700015     183960
-30.696652     183960
-36.277805     183960
Name: lat_min, Length: 86, dtype: int64
-34.869110    3035329
-32.984293    3035329
-32.041885    3035329
-29.214660    2023560
-33.000000    1563660
               ...   
-27.909065     183960
-33.487232     183960
-30.696652     183960
-27.906064     183960
-30.700015     183960
Name: lat_max, Length: 89, dtype: int64
144.375000    2529713
151.875000    2529713
148.125000    2529713
140.625000    2391653
149.375000    1931573
               ...   
142.734375     230100
141.328125     230100
148.359375     230100
143.750000     183960
153.750000     138060
Name: lon_min, Length: 78, dtype: int64
144.375000    2529713
148.125000    2529713
151.875000    2529713
141.875000    1931573
150.625000    1931573
               ...   
146.953125     230100


In [19]:
# Approach1 - Select just columns we use

In [21]:
%%time
use_cols = ['lat_min','lat_max','lon_min','lon_max']
df = pd.read_csv("figsharerainfall/combined_data.csv",usecols=use_cols)
print(df["lat_min"].value_counts())
print(df["lat_max"].value_counts())
print(df["lon_min"].value_counts())
print(df["lon_max"].value_counts())

-34.869110    3035329
-32.984293    3035329
-32.041885    3035329
-35.811518    1517670
-33.926702    1517670
               ...   
-36.281964     183960
-33.490981     183960
-30.700015     183960
-30.696652     183960
-36.277805     183960
Name: lat_min, Length: 86, dtype: int64
-34.869110    3035329
-32.984293    3035329
-32.041885    3035329
-29.214660    2023560
-33.000000    1563660
               ...   
-27.909065     183960
-33.487232     183960
-30.696652     183960
-27.906064     183960
-30.700015     183960
Name: lat_max, Length: 89, dtype: int64
144.375000    2529713
151.875000    2529713
148.125000    2529713
140.625000    2391653
149.375000    1931573
               ...   
142.734375     230100
141.328125     230100
148.359375     230100
143.750000     183960
153.750000     138060
Name: lon_min, Length: 78, dtype: int64
144.375000    2529713
148.125000    2529713
151.875000    2529713
141.875000    1931573
150.625000    1931573
               ...   
146.953125     230100


In [22]:
# Approach2 - Change data type

In [23]:
print(f"Memory usage with float64: {df[['lat_min','lat_max','lon_min','lon_max']].memory_usage().sum() / 1e6:.2f} MB")
print(f"Memory usage with float32: {df[['lat_min','lat_max','lon_min','lon_max']].astype('float32', errors='ignore').memory_usage().sum() / 1e6:.2f} MB")

Memory usage with float64: 2000.44 MB
Memory usage with float32: 1000.22 MB


### 5.2 Compare run times on different machines within your team and summarize your observations.

Original data :
| Team Member | Operating System | RAM | Processor | Is SSD | Time taken |
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|
| Stephen    |                  |     |           |        |            |
| Nate    |          MacOS        |  16GB   |    Apple M1 Pro       |   Yes     |    39.3s        |
| Natalie    |                  |     |           |        |            |
| Nikita    |    Windows              | 16GB    |  12th Gen Intel(R) Core(TM) i7-1255U  |  Yes   |            |

Approach1 - Select just columns we use :
| Team Member | Operating System | RAM | Processor | Is SSD | Time taken |
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|
| Stephen    |                  |     |           |        |            |
| Nate    |          MacOS        |  16GB   |    Apple M1 Pro       |   Yes     |     23.9s       |
| Natalie    |                  |     |           |        |            |
| Nikita    |    Windows              | 16GB    |  12th Gen Intel(R) Core(TM) i7-1255U  |  Yes   |            |

Summary :

- xx
- xx
- xx

# Step6 : Perform Simple EDA in R

| Team Member | Operating System | RAM | Processor | Is SSD | Time taken |
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|
| Stephen    |                  |     |           |        |            |
| Nate    |          MacOS        |  16GB   |    Apple M1 Pro       |   Yes     |            |
| Natalie    |                  |     |           |        |            |
| Nikita    |    Windows              | 16GB    |  12th Gen Intel(R) Core(TM) i7-1255U  |  Yes   |            |