In [4]:
# Sanity check: display the SQLContext used by every later cell. It is not
# created in any visible cell -- presumably injected by the PySpark kernel
# startup; confirm before running on a fresh environment.
sqlContext

<pyspark.sql.context.SQLContext at 0x7fac2c447590>

In [None]:
# Load the source dataset from HDFS and expose it to Spark SQL under the
# table name "parquet" so later cells can query it with sqlContext.sql().
PARQUET_PATH = "hdfs://192.168.33.20/home/ubuntu/day_BCSD_historical_r1i1p1_ACCESS1-0_1997.parquet"
parquet = sqlContext.read.parquet(PARQUET_PATH)
parquet.registerTempTable("parquet")


In [None]:
# Preview the first rows of the raw dataset to check the schema and values.
parquet.show()

In [3]:
# Total number of rows in the raw dataset; kept as the baseline for the
# "percent excluded" calculation further down. count() runs a full Spark job.
parquet_count = parquet.count()
parquet_count

376609324

### Create Dataframe
Pull out 'month' and 'year' from the timestamp and make them available as columns. Also exclude any data points that do not have valid values for pr, tasmin, or tasmax (missing values are encoded as 1.0E20).

In [None]:
# Build the working DataFrame from the registered "parquet" table:
#   * derive `month` and `year` columns from the `time` column
#     (assumes `time` holds Unix epoch seconds, as from_unixtime expects --
#     TODO confirm; the result is rendered in the Spark session time zone),
#   * keep only rows whose pr, tasmin and tasmax are all below 1.0E20,
#     the value used to encode missing data.
sql = """
SELECT lat, lon, time, model, pr, tasmin, tasmax, MONTH(from_unixtime(time)) as month, YEAR(from_unixtime(time)) as year
FROM parquet
WHERE pr < 1.0E20 AND tasmin < 1.0E20 AND tasmax < 1.0E20
"""
df = sqlContext.sql(sql)

In [None]:
# Preview the filtered DataFrame, including the derived month/year columns.
df.show()

In [None]:
# Number of rows remaining after the missing-value filter; compared with
# parquet_count below and reused as the baseline for the grow-season filter.
df_count = df.count()
df_count

Percent of cells excluded due to missing values

In [None]:
# Share of the raw rows dropped by the missing-value filter, as a positive
# percentage (rows removed / rows originally present). The operands were
# previously reversed, which reported the exclusion as a negative number.
((parquet_count - df_count) / float(parquet_count)) * 100

Make sure we've actually removed missing values (encoded as 1.0E20)

In [None]:
# Show the maximum of each measurement column; every value should be well
# below the 1.0E20 missing-value marker if the filter above worked.
max_per_column = {"pr": "max", "tasmin": "max", "tasmax": "max"}
df.select("pr", "tasmin", "tasmax").agg(max_per_column).show()

## Windowing Code

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, FloatType

def _k_to_f(k):
    return ((k - 273.15) * 1.8) + 32.0

# Wrap the plain-Python converter as a Spark UDF returning FloatType so it can
# be applied to DataFrame columns.
k_to_f = udf(_k_to_f, FloatType())

In [None]:
# Smoke-test the UDF: preview tasmin converted to Fahrenheit. The result is
# only displayed; `df` itself is not modified.
df.withColumn('f_tasmin', k_to_f(df.tasmin)).show()

In [None]:
import os
# Debugging aid: dump the kernel's environment variables, one "NAME: value"
# per line. WARNING: the environment may contain credentials or tokens --
# clear this cell's output before sharing or committing the notebook.
# The call is parenthesised so it is valid on both Python 2 and Python 3.
print('\n'.join(k + ": " + v for k, v in os.environ.items()))

### Winkler scale code

The growing season is assumed to be April 1st through October 31st in the Northern Hemisphere, and October 1st through April 30th in the Southern Hemisphere.

In [None]:
# Keep only the rows that fall inside the hemisphere-specific growing season.
#   Northern Hemisphere (lat >= 0): April through October.
#   Southern Hemisphere (lat <  0): October through April. This range wraps
#   around the year end, so the month test must be an OR; the previous
#   `month <= 4 AND month >= 10` could never be true and silently dropped
#   every Southern Hemisphere row.
in_northern_season = (df.lat >= 0.0) & (df.month >= 4) & (df.month <= 10)
in_southern_season = (df.lat < 0.0) & ((df.month >= 10) | (df.month <= 4))
grow_season = df.select("*").where(in_northern_season | in_southern_season)

In [None]:
# Number of rows that fall inside the growing season; compared with df_count
# in the next cell.
gs_count = grow_season.count()
gs_count

Percent decrease of data due to filtering on grow season

In [None]:
# Percentage of rows removed by the growing-season filter, reported as a
# positive decrease (rows removed / rows before filtering). The operands were
# previously reversed, which produced a negative "decrease".
((df_count - gs_count) / float(df_count)) * 100

In [None]:
# Mark the growing-season subset for caching, since several later cells
# derive from it and would otherwise recompute the filter each time.
grow_season.persist()

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, FloatType

def _k_to_f(k):
    return ((k - 273.15) * 1.8) + 32.0

def _avg_tmp_f(_min, _max):
    """Return the mean of two Kelvin temperatures, expressed in Fahrenheit.

    Both inputs are converted with _k_to_f and then averaged.
    """
    low_f = _k_to_f(_min)
    high_f = _k_to_f(_max)
    return (low_f + high_f) / 2.0

# Spark UDF wrapper around _avg_tmp_f; takes two columns and yields a
# FloatType column.
avg_tmp_f = udf(_avg_tmp_f, FloatType())

In [None]:
# Add `f_tasavg`: the mean of tasmin and tasmax, converted to Fahrenheit.
# The input columns are taken from `grow_season` itself (the frame being
# extended) rather than from its ancestor `df`, so the expression does not
# depend on Spark resolving references across DataFrames.
grow_season = grow_season.withColumn(
    'f_tasavg', avg_tmp_f(grow_season.tasmin, grow_season.tasmax))
grow_season.show()

In [None]:
# Count the rows whose average temperature exceeds 50 F, i.e. the rows that
# will contribute a non-negative excess to the degree-day total below.
grow_season.where(grow_season.f_tasavg > 50.0).count()

In [None]:
# https://en.wikipedia.org/wiki/Winkler_scale
def _winkler_scale(d):
    if d <= 2500:
        return 1
    elif d >= 2501 and d <= 3000:
        return 2
    elif d >= 3001 and d <= 3500:
        return 3
    elif d >= 3501 and d <= 4000:
        return 4
    elif d > 4000:
        return 5

def _degree_days(temp):
    dd = int(temp - 50.0)
    return 0 if dd <= 0 else dd

# Spark UDF wrappers for the two helpers above; both yield IntegerType columns.
degree_days = udf(_degree_days, IntegerType())
winkler_scale = udf(_winkler_scale, IntegerType())

### Calculate the degree days
Group by year, latitude, and longitude, and sum the calculated degree days.

In [None]:
# Compute per-row degree days, then total them for each (year, lat, lon) cell.
# The grouping keys are given by name so they resolve against the frame being
# grouped rather than through its ancestor DataFrame `df`.
# agg() names its output "sum(degree_days)"; rename it back to "degree_days".
dd = grow_season.withColumn("degree_days", degree_days(grow_season.f_tasavg))\
        .groupBy("year", "lat", "lon").agg({"degree_days": "sum"})\
        .withColumnRenamed("sum(degree_days)", "degree_days")

Exclude locations with fewer than 1 degree day

In [None]:
# Minimum accumulated degree days a location needs to stay in the result set.
DEGREE_DAY_THRESHOLD = 1
meets_threshold = dd.degree_days >= DEGREE_DAY_THRESHOLD
dd = dd.where(meets_threshold)

In [None]:
# Preview the per-location degree-day totals that passed the threshold.
dd.show()

In [None]:
# Number of (year, lat, lon) groups remaining after the threshold filter.
dd.count()

In [None]:
# Classify each location's degree-day total with the Winkler UDF, then collect
# the result to the driver as a pandas DataFrame (the full result must fit in
# driver memory).
dd_with_winkler = dd.withColumn("winkler", winkler_scale(dd.degree_days))
pdd = dd_with_winkler.toPandas()

In [None]:
# Row count of the collected pandas frame.
len(pdd)

In [None]:
# Display the collected frame (pandas truncates long frames when rendering).
pdd

In [None]:
# Number of locations that fall into each Winkler region.
pdd.groupby("winkler").size()

In [None]:
# Write the classified locations to CSV on the driver's local filesystem.
# NOTE(review): the file name says IPSL-CM5A-LR, but the parquet loaded at the
# top of this notebook is the ACCESS1-0 model -- confirm the intended name.
winkler_csv_path = "/home/ubuntu/winkler_scale_IPSL-CM5A-LR_1997.csv"
pdd.to_csv(winkler_csv_path)