### Downloading the data

In [None]:
import os
import urllib.request

data_dir = "./data/weather/"
if not os.path.exists(data_dir):
    print("creating weather directory")
    os.system("mkdir -p ./data/weather")

# download weather observations
base_url = "ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/"
years = list(range(2018, 2020))
for year in years:
    fn = str(year) + ".csv.gz"
    if not os.path.isfile(data_dir + fn):
        print(f"Downloading {base_url+fn} to {data_dir+fn}")
        urllib.request.urlretrieve(base_url + fn, data_dir + fn)

# download weather station metadata
station_meta_url = (
    "https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt"
)
if not os.path.isfile(data_dir + "ghcnd-stations.txt"):
    print("Downloading station meta..")
    urllib.request.urlretrieve(
        station_meta_url, data_dir + "ghcnd-stations.txt"
    )

### Loading into cudf


In [None]:
import cudf
import cupy as cp
import pycuda.autoprimaryctx

column_names = [
    "station_id", "date",
    "type", "val",
    "m_flag", "q_flag",
    "s_flag", "obs_time"
]
usecols = column_names[0:4]
weather_df = cudf.read_csv('data/weather/2018.csv.gz', names=column_names, usecols=usecols)

In [None]:
weather_df.tail()

### Using a grid stride pattern to double all rainfall measurements

In [None]:
rainfall_df = weather_df[weather_df["type"] == "PRCP"]

In [None]:
rainfall_df.tail()

In [None]:
import pycuda.autoprimaryctx
from pycuda.compiler import SourceModule

mod = SourceModule("""
    __global__ void doublify(int64_t *a, int N)
    {
      int stride = blockDim.x * gridDim.x;
      for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride) {
        if (i < N) {
          a[i] *= 2;
        }
      }
    }
    """)
func = mod.get_function("doublify")

In [None]:
size = cp.int32(len(rainfall_df['val']))

func(rainfall_df['val'], size, block=(256,1,1), grid=(4096,))

In [None]:
rainfall_df.tail()

### Let's try a more complex operation, converting the measurements to inches

In [None]:
weather_df = cudf.read_csv('data/weather/2018.csv.gz', names=column_names, usecols=usecols)

# cast val to float
rainfall_df = weather_df[weather_df["type"] == "PRCP"].astype({'val': 'float64'})

In [None]:
rainfall_df.tail()

In [None]:
mod2 = SourceModule("""
    static constexpr float mm_to_inches_factor = 0.0393701;

    __global__ void mm_to_inches(double *a, int N)
    {
      int stride = blockDim.x * gridDim.x;
      for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride) {
        if (i < N) {
          a[i] = a[i] * mm_to_inches_factor * 0.1;
        }
      }
    }
    """)
func = mod2.get_function("mm_to_inches")

In [None]:
func(rainfall_df['val'], size, block=(256,1,1), grid=(4096,))

In [None]:
rainfall_df.tail()

### Integration with external libraries, generating a column of random values

In [None]:
weather_df = cudf.read_csv('data/weather/2018.csv.gz', names=column_names, usecols=usecols)

In [3]:
size = cp.int32(len(weather_df))

In [4]:
mod3 = SourceModule('''
#include <thrust/random.h>

extern "C" {

    __global__ void random_column(double* a, int N)
    {
      thrust::default_random_engine rng;
      
      int stride = blockDim.x * gridDim.x;
      for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride) {
        if (i < N) {
          rng.discard(i);
          thrust::uniform_real_distribution<float> rand01(0,1);
          double r = rand01(rng);
          a[i] = r;
        }
      }
    }
}

''', no_extern_c=True)

In [5]:
func = mod3.get_function("random_column")

In [7]:
weather_df['random_col'] = cp.zeros(len(weather_df))

In [8]:
func(weather_df['random_col'], size, block=(256,1,1), grid=(4096,))

In [10]:
weather_df

Unnamed: 0,station_id,date,type,val,random_col
0,AE000041196,20180101,TMAX,259,0.000022
1,AE000041196,20180101,TMIN,112,0.085032
2,AE000041196,20180101,TAVG,186,0.601353
3,AEM00041194,20180101,TMAX,250,0.891611
4,AEM00041194,20180101,PRCP,0,0.967956
...,...,...,...,...,...
35117206,WZ004455110,20181231,TAVG,244,0.895549
35117207,ZI000067775,20181231,TMAX,285,0.610949
35117208,ZI000067775,20181231,TMIN,166,0.600397
35117209,ZI000067775,20181231,PRCP,0,0.592128
