## RainFall Data Analysis

In [2]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd

### Download Partitioned Data

In [2]:
# Necessary metadata
article_id = 14096681  # this is the unique identifier of the article on figshare
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}
output_directory = "../data/rainfall/partitions/"


# Retrieve the article metadata
response = requests.request("GET", url, headers=headers)
data = json.loads(response.text)  # this contains all the articles data, feel free to check it out
files = data["files"]             # this is just the data about the files, which is what we want


# Zip to Folder Unzip
files_to_dl = ["data.zip"]  # feel free to add other files here
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])
        
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory)

### Combine Data

In [3]:
# Read the data
files_to_combine = glob.glob(output_directory + "*.csv")
files_to_combine.remove(output_directory + "observed_daily_rainfall_SYD.csv")
df = pd.concat(
    (pd.read_csv(file, index_col=0)
                .assign(model=re.findall(r'[^\/&\\]+(?=_daily_rainfall_NSW\.)', file)[0])
                for file in files_to_combine)
    )

# Save the combined data
data_path = "../data/rainfall/"
os.makedirs(data_path + "combined/", exist_ok=True)
df.to_csv(data_path + "combined/rainfall_data.csv")

### Loading Combined Data and Simple EDA

In [5]:
# Read the data
data = pd.read_csv("../data/rainfall/combined/rainfall_data.csv")

In [39]:
# Testing different loading methods for EDA timing
import sys
# size of 'data' in gigabytes
sys.getsizeof(data)/(1e9)

11.448609766

In [7]:
data.dtypes

time              object
lat_min          float64
lat_max          float64
lon_min          float64
lon_max          float64
rain (mm/day)    float64
model             object
dtype: object

#### No data manipulation

In [50]:
%%timeit -n 3 -r 2 # run statement 3 times for 2 repetitions
data.value_counts()

data.describe()

data.nunique()

3min ± 3.58 s per loop (mean ± std. dev. of 2 runs, 3 loops each)


#### Modifying Data Types

In [52]:
# Changing data types
## Changed 'time' from `object` to `datetime.datetime`
data['time'] = pd.to_datetime(data['time'])
data[['lat_min', 'lat_max', 'lon_min', 'lon_max', 'rain (mm/day)']] = data[['lat_min', 'lat_max', 'lon_min', 'lon_max', 'rain (mm/day)']].astype('float32')
sys.getsizeof(data)/(1e9) #size of modified data in GB

5.963029062

In [53]:
%%timeit -n 3 -r 2 # run statement 3 times for 2 repetitions
data.value_counts()

data.describe()

data.nunique()

2min 31s ± 315 ms per loop (mean ± std. dev. of 2 runs, 3 loops each)


#### Loading Select Columns

In [9]:
# CLEAR MODIFIED DATAFRAME AND RELOAD
del data

data = pd.read_csv("../data/rainfall/combined/rainfall_data.csv")

In [5]:
# Using only necessary columns (time, rainfall, model), location columns can be loaded in later if needed

data = data[['time', 'rain (mm/day)', 'model']]
sys.getsizeof(data)/(1e9) # Very interesting to see here, the location columns account for only ~10% of the total object size

9.44963879

In [56]:
%%timeit -n 3 -r 2 # run statement 3 times for 2 repetitions
data.value_counts()

data.describe()

data.nunique()

1min 53s ± 274 ms per loop (mean ± std. dev. of 2 runs, 3 loops each)


# Perform EDA on Python

In [38]:
# convert time to datetime year only
data['time'] = pd.to_datetime(data['time'])
data['year'] = data['time'].dt.year
# group data by year and model
data_grouped = data.groupby(['year', 'model']).mean()
data_grouped.reset_index(inplace=True)

model_list = ['ACCESS-CM2', 'ACCESS-ESM1-5'] # list of models to plot
data_filtered = data_grouped[data_grouped['model'].isin(model_list)]
data_filtered.head()


Unnamed: 0,year,model,lat_min,lat_max,lon_min,lon_max,rain (mm/day)
0,1889,ACCESS-CM2,-33.125,-31.875,146.25,148.125,1.398767
1,1889,ACCESS-ESM1-5,-33.125,-31.875,147.1875,149.0625,1.856297
27,1890,ACCESS-CM2,-33.125,-31.875,146.25,148.125,1.488712
28,1890,ACCESS-ESM1-5,-33.125,-31.875,147.1875,149.0625,1.667741
54,1891,ACCESS-CM2,-33.125,-31.875,146.25,148.125,2.198307


In [52]:
# plot the data and facet by model
import altair as alt
alt.data_transformers.disable_max_rows()
alt.Chart(data_filtered).mark_line().encode(
    x='year',
    y='rain (mm/day):Q',
    color='model:N'
).properties(
    width=400,
    height=200
).facet(
    row='model:N'
)


#### Conclusion and Discussion

| Team Member | Operating System | RAM | Processor | Is SSD | Time taken for Each Type of Memory Reduction |
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|
| Sam   |    macOS Catalina      |16GB | 2.8 GHz 4 Core i7   |  Yes   | 3mins, 2.5mins, 1.9mins     |
| Member 2    |                  |     |           |        |            |
| Member 3    |                  |     |           |        |            |
| Member 4    |                  |     |           |        |            |

I attempted to show some plots but unless I loaded only parts of the dataset, they would not execute. In terms of memory space, changing the data types was the best at reducing the size of the object. This is due to the changing of the numerical data from float64 to float32, which is effectively half the space. The trade-off for doing this is that now our data is less precise than before.