In [1]:
# Load pyspark
import findspark

findspark.init()

from pyspark import SparkContext

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import RandomForestRegressor

from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, StandardScaler

from pyspark.ml.evaluation import RegressionEvaluator

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from pyspark.ml import Pipeline

In [2]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_absolute_error

In [3]:
spark = SparkSession.builder.appName("dengue").getOrCreate()

### Chargement des données

In [34]:
path_to_data = "data/"

df_features = spark.read.csv(path_to_data + "dengue_features_train.csv",
                             header=True)
df_labels = spark.read.csv(path_to_data + "dengue_labels_train.csv",
                           header=True)

In [35]:
df_features.printSchema()

root
 |-- city: string (nullable = true)
 |-- year: string (nullable = true)
 |-- weekofyear: string (nullable = true)
 |-- week_start_date: string (nullable = true)
 |-- ndvi_ne: string (nullable = true)
 |-- ndvi_nw: string (nullable = true)
 |-- ndvi_se: string (nullable = true)
 |-- ndvi_sw: string (nullable = true)
 |-- precipitation_amt_mm: string (nullable = true)
 |-- reanalysis_air_temp_k: string (nullable = true)
 |-- reanalysis_avg_temp_k: string (nullable = true)
 |-- reanalysis_dew_point_temp_k: string (nullable = true)
 |-- reanalysis_max_air_temp_k: string (nullable = true)
 |-- reanalysis_min_air_temp_k: string (nullable = true)
 |-- reanalysis_precip_amt_kg_per_m2: string (nullable = true)
 |-- reanalysis_relative_humidity_percent: string (nullable = true)
 |-- reanalysis_sat_precip_amt_mm: string (nullable = true)
 |-- reanalysis_specific_humidity_g_per_kg: string (nullable = true)
 |-- reanalysis_tdtr_k: string (nullable = true)
 |-- station_avg_temp_c: string (nulla

In [36]:
df_labels.printSchema()

root
 |-- city: string (nullable = true)
 |-- year: string (nullable = true)
 |-- weekofyear: string (nullable = true)
 |-- total_cases: string (nullable = true)



### Jointure

Nous joignons les deux DataFrames

In [37]:
#join
df_train = df_features.join(df_labels, ['city', 'year', 'weekofyear'])

print("df_train = ({}, {})".format(df_train.count(), len(df_train.columns)))

df_train = (1456, 25)


### Nettoyage

In [38]:
# The 2 columns 'precipitation_amt_mm' and 'reanalysis_sat_precip_amt_mm' are the same
# we drop 'precipitation_amt_mm'
df_train = df_train.drop('precipitation_amt_mm')

# recast 'week_start_date' as a date. Nice to have for plotting or time series analysis
df_train = df_train.withColumn('week_start_date', F.to_date('week_start_date', 'yyyy-MM-dd'))

# recast 'year' and 'weekofyear' to integer
df_train = df_train \
    .withColumn('year', df_train['year'].cast('int')) \
    .withColumn('weekofyear', df_train['weekofyear'].cast('int'))

# cast column to float
for col_name in df_train.columns[4:]:
    df_train = df_train.withColumn(col_name, df_train[col_name].cast('float'))

In [39]:
print("df_train = ({}, {})".format(df_train.count(), len(df_train.columns)))

# identify null value
for col_name in df_train.columns:
    print("{} => {}".format(col_name,
                            df_train.filter(F.isnull(df_train[col_name])).count()))
    
df_train = df_train.dropna()

print("df_train = ({}, {})".format(df_train.count(), len(df_train.columns)))

df_train = (1456, 24)
city => 0
year => 0
weekofyear => 0
week_start_date => 0
ndvi_ne => 194
ndvi_nw => 52
ndvi_se => 22
ndvi_sw => 22
reanalysis_air_temp_k => 10
reanalysis_avg_temp_k => 10
reanalysis_dew_point_temp_k => 10
reanalysis_max_air_temp_k => 10
reanalysis_min_air_temp_k => 10
reanalysis_precip_amt_kg_per_m2 => 10
reanalysis_relative_humidity_percent => 10
reanalysis_sat_precip_amt_mm => 13
reanalysis_specific_humidity_g_per_kg => 10
reanalysis_tdtr_k => 10
station_avg_temp_c => 43
station_diur_temp_rng_c => 43
station_max_temp_c => 20
station_min_temp_c => 14
station_precip_mm => 22
total_cases => 0
df_train = (1199, 24)


In [40]:
df_train.printSchema()

root
 |-- city: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- weekofyear: integer (nullable = true)
 |-- week_start_date: date (nullable = true)
 |-- ndvi_ne: float (nullable = true)
 |-- ndvi_nw: float (nullable = true)
 |-- ndvi_se: float (nullable = true)
 |-- ndvi_sw: float (nullable = true)
 |-- reanalysis_air_temp_k: float (nullable = true)
 |-- reanalysis_avg_temp_k: float (nullable = true)
 |-- reanalysis_dew_point_temp_k: float (nullable = true)
 |-- reanalysis_max_air_temp_k: float (nullable = true)
 |-- reanalysis_min_air_temp_k: float (nullable = true)
 |-- reanalysis_precip_amt_kg_per_m2: float (nullable = true)
 |-- reanalysis_relative_humidity_percent: float (nullable = true)
 |-- reanalysis_sat_precip_amt_mm: float (nullable = true)
 |-- reanalysis_specific_humidity_g_per_kg: float (nullable = true)
 |-- reanalysis_tdtr_k: float (nullable = true)
 |-- station_avg_temp_c: float (nullable = true)
 |-- station_diur_temp_rng_c: float (nullable = true)
 

### Construction des jeux d'entrainement et de test

In [41]:
train, test = df_train.randomSplit([0.75, 0.25], seed=18)

train.cache()

DataFrame[city: string, year: int, weekofyear: int, week_start_date: date, ndvi_ne: float, ndvi_nw: float, ndvi_se: float, ndvi_sw: float, reanalysis_air_temp_k: float, reanalysis_avg_temp_k: float, reanalysis_dew_point_temp_k: float, reanalysis_max_air_temp_k: float, reanalysis_min_air_temp_k: float, reanalysis_precip_amt_kg_per_m2: float, reanalysis_relative_humidity_percent: float, reanalysis_sat_precip_amt_mm: float, reanalysis_specific_humidity_g_per_kg: float, reanalysis_tdtr_k: float, station_avg_temp_c: float, station_diur_temp_rng_c: float, station_max_temp_c: float, station_min_temp_c: float, station_precip_mm: float, total_cases: float]

### Construction du *pipeline*

In [42]:
indexer = StringIndexer(inputCol='city', outputCol='city_')

encoder = OneHotEncoder(inputCol='city_', outputCol='cityVect')

vectorAssembler = VectorAssembler(
    inputCols=['year', 'weekofyear',
               'ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw',
               'reanalysis_air_temp_k','reanalysis_avg_temp_k',
               'reanalysis_dew_point_temp_k', 'reanalysis_max_air_temp_k',
               'reanalysis_min_air_temp_k', 'reanalysis_precip_amt_kg_per_m2',
               'reanalysis_relative_humidity_percent', 'reanalysis_sat_precip_amt_mm',
               'reanalysis_specific_humidity_g_per_kg', 'reanalysis_tdtr_k',
               'station_avg_temp_c','station_diur_temp_rng_c',
               'station_max_temp_c', 'station_min_temp_c', 
               'station_precip_mm', 'cityVect'], 
    outputCol = 'features')

scaler = StandardScaler(inputCol='features',
                        outputCol="scaled_features",
                        withStd=True, withMean=True)

lr = LinearRegression(featuresCol='scaled_features',
                      labelCol='total_cases',
                      predictionCol='lr_prediction')

rf = RandomForestRegressor(featuresCol='scaled_features',
                           labelCol='total_cases',
                           predictionCol='rf_prediction')

#### 1. Random Forest

In [43]:
pipeline_rf = Pipeline(stages=[indexer, encoder, vectorAssembler, scaler, rf])

evaluator_rf = RegressionEvaluator(labelCol='total_cases',
                                   predictionCol='rf_prediction',
                                   metricName="rmse")

In [44]:
# build model
model_rf = pipeline_rf.fit(train)

In [45]:
test_prediction_rf = model_rf.transform(train)
test_prediction_rf

DataFrame[city: string, year: int, weekofyear: int, week_start_date: date, ndvi_ne: float, ndvi_nw: float, ndvi_se: float, ndvi_sw: float, reanalysis_air_temp_k: float, reanalysis_avg_temp_k: float, reanalysis_dew_point_temp_k: float, reanalysis_max_air_temp_k: float, reanalysis_min_air_temp_k: float, reanalysis_precip_amt_kg_per_m2: float, reanalysis_relative_humidity_percent: float, reanalysis_sat_precip_amt_mm: float, reanalysis_specific_humidity_g_per_kg: float, reanalysis_tdtr_k: float, station_avg_temp_c: float, station_diur_temp_rng_c: float, station_max_temp_c: float, station_min_temp_c: float, station_precip_mm: float, total_cases: float, city_: double, cityVect: vector, features: vector, scaled_features: vector, rf_prediction: double]

In [46]:
test_prediction_rf.select(['total_cases','rf_prediction']).show(150)

+-----------+------------------+
|total_cases|     rf_prediction|
+-----------+------------------+
|        0.0| 5.264551587796776|
|        0.0|3.2913081284475445|
|        0.0|3.7729546854714835|
|        0.0| 3.466239752379168|
|        0.0| 3.731886380629278|
|        0.0|3.1488017099623065|
|        0.0| 3.022430995356298|
|        0.0| 4.617970782803445|
|        1.0| 5.631407409098301|
|        0.0| 2.966744293694364|
|        0.0|3.9618876511058074|
|        0.0| 3.982900812294008|
|        1.0| 4.635700908420374|
|        0.0| 4.820125742112186|
|        0.0|12.204522620720086|
|        0.0| 7.054958310132896|
|        0.0| 7.506966359711872|
|        1.0| 4.543887285311992|
|        0.0| 8.626135385485544|
|        0.0| 11.38622611196796|
|        0.0| 5.240076254588742|
|        0.0| 7.417985050468241|
|        0.0|5.8027156500977135|
|        0.0| 4.965229669132326|
|        0.0| 4.965229669132326|
|        0.0| 7.294869323416513|
|        0.0| 4.339889852493575|
|        0

In [47]:
rmse = evaluator_rf.evaluate(test_prediction_rf)

print("Random Forest::Before grid search - RMSE = {:.3f}".format(rmse))

Random Forest::Before grid search - RMSE = 17.774


#### Cross-Validation

In [48]:
# Parameter grid for random forest
paramGrid_rf = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [2, 5, 10, 15, 20]) \
    .addGrid(rf.numTrees, [10, 20, 30, 40, 50]) \
    .build()

crossval_rf = CrossValidator(estimator=pipeline_rf,
                             estimatorParamMaps=paramGrid_rf,
                             evaluator=evaluator_rf,
                             numFolds=2,
                             seed=18)

In [49]:
cvModel_rf = crossval_rf.fit(train)

In [50]:
p_rf = cvModel_rf.transform(train)

In [51]:
rmse = evaluator_rf.evaluate(p_rf)

print("Random Forest::After grid search - RMSE = {:.3f}".format(rmse))

Random Forest::After grid search - RMSE = 9.001


In [52]:
cvModel_rf.bestModel.stages[-1].getNumTrees

40

#### 2. Linear regression

In [53]:
pipeline_lr = Pipeline(stages=[indexer, encoder, vectorAssembler, scaler, lr])

evaluator_lr = RegressionEvaluator(labelCol='total_cases',
                                   predictionCol='lr_prediction',
                                   metricName="rmse")

In [54]:
model_lr = pipeline_lr.fit(train)

In [55]:
test_lr = model_lr.transform(test)

In [56]:
test_lr.select(['total_cases','lr_prediction']).show(10)

+-----------+-------------------+
|total_cases|      lr_prediction|
+-----------+-------------------+
|        0.0|  5.309805224413651|
|        0.0| 2.1296964967743612|
|        0.0|   9.46266650831906|
|        1.0| 3.9790872846221212|
|        0.0|  23.61227055126312|
|        0.0| 23.992805141601696|
|        0.0|-0.8077976913743257|
|        0.0| -2.812901654584966|
|        0.0|  7.502128494561031|
|        0.0|  6.336783396109944|
+-----------+-------------------+
only showing top 10 rows



In [57]:
rmse = evaluator_lr.evaluate(test_lr)

print("Linear regression::Before grid search - RMSE = {:.3f}".format(rmse))

Linear regression::Before grid search - RMSE = 24.514


#### Cross-Validation

In [58]:
# Parameter grid for linear regression (LR)
paramGrid_lr = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .addGrid(lr.fitIntercept, [False, True]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

crossval_lr = CrossValidator(estimator=pipeline_lr,
                             estimatorParamMaps=paramGrid_lr,
                             evaluator=evaluator_lr,
                             numFolds=2,
                             seed=18)

In [59]:
cvModel_lr = crossval_lr.fit(train)

In [60]:
p_lr = cvModel_lr.transform(train)

In [61]:
rmse = evaluator_lr.evaluate(p_lr)

print("Linear regression::After grid search - RMSE = {:.3f}".format(rmse))

Linear regression::After grid search - RMSE = 27.719
