In [56]:
# Load pyspark
import findspark

findspark.init()

from pyspark import SparkContext

from pyspark.sql import SparkSession

from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import RandomForestRegressor

from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, StandardScaler

from pyspark.ml.evaluation import RegressionEvaluator

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from pyspark.ml import Pipeline

In [57]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_absolute_error

In [58]:
spark = SparkSession.builder.appName("dengue").getOrCreate()

In [59]:
path_to_data = "data/"

df_features = spark.read.csv(path_to_data + "dengue_features_train.csv",
                             header=True)
df_labels = spark.read.csv(path_to_data + "dengue_labels_train.csv",
                           header=True)

#df_features['month'] = df_features['week_start_date'][5:7]
df_features = df_features.drop('precipitation_amt_mm','week_start_date')
df_train = df_features.join(df_labels, ['city', 'year', 'weekofyear'])

In [60]:
train, test = df_train.randomSplit([0.75, 0.25], seed=18)

In [61]:
for col_name in train.columns:
    if col_name not in ['city']:
        train = train.withColumn(col_name, train[col_name].cast('float'))
        test = test.withColumn(col_name, test[col_name].cast('float'))
        
train = train.dropna()
test = test.dropna()
train.cache()

DataFrame[city: string, year: float, weekofyear: float, ndvi_ne: float, ndvi_nw: float, ndvi_se: float, ndvi_sw: float, reanalysis_air_temp_k: float, reanalysis_avg_temp_k: float, reanalysis_dew_point_temp_k: float, reanalysis_max_air_temp_k: float, reanalysis_min_air_temp_k: float, reanalysis_precip_amt_kg_per_m2: float, reanalysis_relative_humidity_percent: float, reanalysis_sat_precip_amt_mm: float, reanalysis_specific_humidity_g_per_kg: float, reanalysis_tdtr_k: float, station_avg_temp_c: float, station_diur_temp_rng_c: float, station_max_temp_c: float, station_min_temp_c: float, station_precip_mm: float, total_cases: float]

In [62]:
indexer = StringIndexer(inputCol='city', outputCol='city_')

encoder = OneHotEncoder(inputCol='city_', outputCol='cityVect')

vectorAssembler = VectorAssembler(
    inputCols=['year', 'weekofyear',
               'ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw',
               'reanalysis_air_temp_k','reanalysis_avg_temp_k',
               'reanalysis_dew_point_temp_k', 'reanalysis_max_air_temp_k',
               'reanalysis_min_air_temp_k', 'reanalysis_precip_amt_kg_per_m2',
               'reanalysis_relative_humidity_percent', 'reanalysis_sat_precip_amt_mm',
               'reanalysis_specific_humidity_g_per_kg', 'reanalysis_tdtr_k',
               'station_avg_temp_c','station_diur_temp_rng_c',
               'station_max_temp_c', 'station_min_temp_c', 
               'station_precip_mm', 'cityVect'], 
    outputCol = 'features')

scaler = StandardScaler(inputCol='features',
                        outputCol="scaled_features",
                        withStd=True, withMean=True)

lr = LinearRegression(featuresCol='scaled_features',
                      labelCol='total_cases',
                      predictionCol='lr_prediction')

rf = RandomForestRegressor(featuresCol='scaled_features',
                           labelCol='total_cases',
                           predictionCol='rf_prediction')

In [63]:
pipeline_rf = Pipeline(stages=[indexer, encoder, vectorAssembler, scaler, rf])
#, encoder, vectorAssembler, scaler, rf

In [64]:
model_rf = pipeline_rf.fit(train)

In [65]:
test_prediction_rf = model_rf.transform(train)
test_prediction_rf

DataFrame[city: string, year: float, weekofyear: float, ndvi_ne: float, ndvi_nw: float, ndvi_se: float, ndvi_sw: float, reanalysis_air_temp_k: float, reanalysis_avg_temp_k: float, reanalysis_dew_point_temp_k: float, reanalysis_max_air_temp_k: float, reanalysis_min_air_temp_k: float, reanalysis_precip_amt_kg_per_m2: float, reanalysis_relative_humidity_percent: float, reanalysis_sat_precip_amt_mm: float, reanalysis_specific_humidity_g_per_kg: float, reanalysis_tdtr_k: float, station_avg_temp_c: float, station_diur_temp_rng_c: float, station_max_temp_c: float, station_min_temp_c: float, station_precip_mm: float, total_cases: float, city_: double, cityVect: vector, features: vector, scaled_features: vector, rf_prediction: double]

In [66]:
test_prediction_rf.select(['total_cases','rf_prediction']).show(150)

+-----------+------------------+
|total_cases|     rf_prediction|
+-----------+------------------+
|        0.0|3.1044375993455513|
|        0.0|2.9516836227431766|
|        0.0|3.7846634696385935|
|        0.0| 4.319174939916822|
|        0.0| 2.787560426294786|
|        0.0|2.8018461405805004|
|        0.0|   2.8143461405805|
|        0.0| 3.479984095757146|
|        1.0|  8.66600893229928|
|        0.0| 3.919722156840662|
|        0.0|  3.16790705508097|
|        0.0| 3.676733454183151|
|        1.0|3.5661736289888437|
|        0.0|  4.14977872921669|
|        0.0|17.024818315492617|
|        0.0| 5.869892057777035|
|        0.0| 4.851276095997932|
|        1.0|3.8232108465123007|
|        0.0|5.5307212407826345|
|        0.0| 8.215457284878864|
|        0.0| 5.057845622127251|
|        0.0| 5.462043608735363|
|        0.0|4.7337200155717385|
|        0.0| 5.403816496930424|
|        0.0| 4.042854274164816|
|        0.0| 6.725884849787967|
|        1.0|3.7087479441731377|
|        1

In [85]:
# TO DO: evaluate first result before grid (to make sure grid improves)
evaluator_rf.evaluate(test_prediction_rf)

17.28101139887985

In [68]:
paramGrid = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [2, 5, 10, 15, 20]) \
    .addGrid(rf.numTrees, [10, 20, 30, 40, 50]) \
    .build()

In [69]:
evaluator_rf = RegressionEvaluator(labelCol='total_cases',
                                   predictionCol='rf_prediction',
                                   metricName="rmse")


crossval_rf = CrossValidator(estimator=pipeline_rf,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator_rf,
                          numFolds=2,
                          seed=18)

In [70]:
cvModel_rf = crossval_rf.fit(train)

In [71]:
p_rf = cvModel_rf.transform(train)

In [73]:
evaluator_rf.evaluate(p_rf)

9.3388499046989

In [74]:
cvModel_rf.bestModel.stages[-1].getNumTrees

50

In [75]:
##### df_train = indexer.fit(df_train).transform(df_train)

#df_train = encoder.transform(df_train)

In [76]:
pipeline_lr = Pipeline(stages=[indexer, encoder, vectorAssembler, scaler, lr])
model_lr = pipeline_lr.fit(train)


In [77]:
test_lr = model_lr.transform(test)

In [78]:
test_lr.select(['total_cases','lr_prediction']).show(150)

+-----------+--------------------+
|total_cases|       lr_prediction|
+-----------+--------------------+
|        0.0|   4.715201711042997|
|        0.0|  2.1373536371109587|
|        0.0|    9.95483978673221|
|        1.0|   3.650865708948402|
|        0.0|  23.627925466242644|
|        0.0|  21.480306326530663|
|        0.0|   16.53828011990681|
|        0.0|   12.38187311892863|
|        1.0|  11.490407609839519|
|        0.0|  3.4094519350615293|
|        0.0|   7.975528301465218|
|        1.0| -1.3155479906377145|
|        0.0|  2.6297046977631204|
|        0.0|  -4.599885533238595|
|        0.0|   6.018443110365089|
|        0.0|  18.503442976228932|
|        0.0|    27.0973727552473|
|        0.0|  18.867395074926073|
|        1.0|  15.256442151623148|
|        1.0|   34.23933935571746|
|        0.0|-0.41639893876753575|
|        4.0|  10.779540770098343|
|        5.0|  13.444790495402346|
|        8.0|  16.285656272279883|
|       10.0|  10.299582618867873|
|        6.0|  1.664

In [84]:
evaluator_lr.evaluate(test_lr)

24.31189206295404

In [79]:
evaluator_lr = RegressionEvaluator(labelCol='total_cases',
                                   predictionCol='lr_prediction',
                                   metricName="rmse")


crossval_lr = CrossValidator(estimator=pipeline_lr,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator_lr,
                          numFolds=2,
                          seed=18)

In [80]:
cvModel_lr = crossval_lr.fit(train)

In [81]:
p_lr = cvModel_lr.transform(train)

In [83]:
evaluator_lr.evaluate(p_lr)

27.62073809400571