In [1]:
import pandas as pd
import os
import geopandas as gpd
import pyarrow as pa
from shapely import wkb

## Initial loading times

### pricePerProvince.csv

In [2]:
%%timeit -n 2 -r 3
price_df = pd.read_csv("../data/raw/pricePerProvince.csv", encoding='latin1').sort_values(by='price per ¢/kWh', ascending=False)

1.46 ms ± 608 µs per loop (mean ± std. dev. of 3 runs, 2 loops each)


In [3]:
price_df = pd.read_csv("../data/raw/pricePerProvince.csv", encoding='latin1').sort_values(by='price per ¢/kWh', ascending=False)

In [4]:
price_df.to_parquet('../data/processed/pricePerProvince.parquet')

In [5]:
%%timeit -n 2 -r 3
price_df = pd.read_parquet('../data/processed/pricePerProvince.parquet')

The slowest run took 4.47 times longer than the fastest. This could mean that an intermediate result is being cached.
4.96 ms ± 3.67 ms per loop (mean ± std. dev. of 3 runs, 2 loops each)


#### csv is better

### kWh_poly.json

In [6]:
%%timeit -n 2 -r 3
df1 = gpd.read_file('../data/processed/kWh_poly.json')
alt_data = df1.to_crs(epsg=4326)  
alt_data = alt_data.dropna(subset=['latitude', 'longitude'])

473 ms ± 11.1 ms per loop (mean ± std. dev. of 3 runs, 2 loops each)


In [7]:
df1 = gpd.read_file('../data/processed/kWh_poly.json')
alt_data = df1.to_crs(epsg=4326)  
alt_data = alt_data.dropna(subset=['latitude', 'longitude'])

In [8]:
alt_data.head(1)

Unnamed: 0,Province,Municipality,Month,South-facing with vertical (90 degrees) tilt,South-facing with latitude tilt,South-facing with tilt=latitude+15 degrees,South-facing with tilt=latitude-15 degrees,2-axis tracking,Horizontal (0 degree),address,latitude,longitude,polygons,geometry
0,Alberta,Acadia Valley,Annual,3.85,4.95,4.71,4.94,7.09,3.73,"Acadia Valley, Alberta",51.158676,-110.210332,"POLYGON ((-110.2141432 51.1587175, -110.211333...",POINT (-110.21033 51.15868)


In [9]:
alt_data.to_parquet('../data/processed/kWh_poly.parquet')

In [10]:
%%timeit -n 2 -r 3
alt1_data = pd.read_parquet('../data/processed/kWh_poly.parquet')
alt1_data['geometry'] = alt1_data['geometry'].apply(wkb.loads)
alt1_data = gpd.GeoDataFrame(alt1_data)

28.1 ms ± 3.42 ms per loop (mean ± std. dev. of 3 runs, 2 loops each)


In [11]:
alt1_data = pd.read_parquet('../data/processed/kWh_poly.parquet')
alt1_data['geometry'] = alt1_data['geometry'].apply(wkb.loads)
alt1_data = gpd.GeoDataFrame(alt1_data)
alt1_data.head(1)

Unnamed: 0,Province,Municipality,Month,South-facing with vertical (90 degrees) tilt,South-facing with latitude tilt,South-facing with tilt=latitude+15 degrees,South-facing with tilt=latitude-15 degrees,2-axis tracking,Horizontal (0 degree),address,latitude,longitude,polygons,geometry
0,Alberta,Acadia Valley,Annual,3.85,4.95,4.71,4.94,7.09,3.73,"Acadia Valley, Alberta",51.158676,-110.210332,"POLYGON ((-110.2141432 51.1587175, -110.211333...",POINT (-110.21033 51.15868)


#### parquet is better

### ne_50m_admin_1_states_provinces.shp

In [12]:
%%timeit -n 2 -r 3
gdf1 = gpd.read_file('../data/raw/ne_50m_admin_1_states_provinces/ne_50m_admin_1_states_provinces.shp')
gdf_ca = gdf1[gdf1['iso_a2'] == 'CA']
mapping = {
    'Yukon': 'Yukon Territory',
    'Québec': 'Quebec'
}
gdf_ca['name'] = gdf_ca['name'].replace(mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

939 ms ± 67.3 ms per loop (mean ± std. dev. of 3 runs, 2 loops each)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [13]:
gdf1 = gpd.read_file('../data/raw/ne_50m_admin_1_states_provinces/ne_50m_admin_1_states_provinces.shp')
gdf_ca = gdf1[gdf1['iso_a2'] == 'CA']
mapping = {
    'Yukon': 'Yukon Territory',
    'Québec': 'Quebec'
}
gdf_ca['name'] = gdf_ca['name'].replace(mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [14]:
gdf_ca.to_parquet('../data/processed/ne_50m_admin_1_states_provinces.parquet')

In [15]:
%%timeit -n 2 -r 3
gdf_ca = gpd.read_parquet('../data/processed/ne_50m_admin_1_states_provinces.parquet')

13.8 ms ± 1.34 ms per loop (mean ± std. dev. of 3 runs, 2 loops each)


#### parquet is better

### panels.csv

In [16]:
%%timeit -n 2 -r 3
panel_df = pd.read_csv('../data/raw/panels.csv')

1.2 ms ± 485 µs per loop (mean ± std. dev. of 3 runs, 2 loops each)


In [17]:
panel_df = pd.read_csv('../data/raw/panels.csv')

In [18]:
panel_df.to_parquet('../data/processed/panels.parquet')

In [19]:
%%timeit -n 2 -r 3
panel_df = pd.read_parquet('../data/processed/panels.parquet')

3.48 ms ± 1.69 ms per loop (mean ± std. dev. of 3 runs, 2 loops each)


#### csv is better