

https://www.drivendata.org/competitions/44/dengai-predicting-disease-spread/

https://www.cdc.gov/dengue/

### Goal

Predict the number of dengue cases each week (in each location) based on environmental variables describing changes in temperature, precipitation, vegetation, and more.

In [1]:
import findspark
findspark.init()

import pandas as pd
import numpy as np

from sklearn.metrics import mean_absolute_error

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
spark = SparkSession.builder.appName("dengue").getOrCreate()

### Load the data

In [3]:
path_to_data = "data/"


df_features = spark.read.csv(path_to_data + "dengue_features_train.csv", header=True)
#df_features['month'] = df_features['week_start_date'][5:7]

df_labels = spark.read.csv(path_to_data + "dengue_labels_train.csv", header=True)

In [4]:
df_features = df_features.drop('precipitation_amt_mm', 'week_start_date')

In [5]:
df_train = df_features.join(df_labels, ['city', 'year', 'weekofyear'])

In [6]:
for col_name in df_train.columns:
    if col_name not in ['city']:
        df_train = df_train.withColumn(col_name, df_train[col_name].cast('float'))

df_train = df_train.dropna()

In [7]:
#print("size of the data: {}".format(df_train.shape()))

df_train.show()

+----+------+----------+---------+---------+---------+---------+---------------------+---------------------+---------------------------+-------------------------+-------------------------+-------------------------------+------------------------------------+----------------------------+-------------------------------------+-----------------+------------------+-----------------------+------------------+------------------+-----------------+-----------+
|city|  year|weekofyear|  ndvi_ne|  ndvi_nw|  ndvi_se|  ndvi_sw|reanalysis_air_temp_k|reanalysis_avg_temp_k|reanalysis_dew_point_temp_k|reanalysis_max_air_temp_k|reanalysis_min_air_temp_k|reanalysis_precip_amt_kg_per_m2|reanalysis_relative_humidity_percent|reanalysis_sat_precip_amt_mm|reanalysis_specific_humidity_g_per_kg|reanalysis_tdtr_k|station_avg_temp_c|station_diur_temp_rng_c|station_max_temp_c|station_min_temp_c|station_precip_mm|total_cases|
+----+------+----------+---------+---------+---------+---------+---------------------+------

### Prepare data

In [8]:
indexer = StringIndexer(inputCol='city', outputCol='city_')
df_train = indexer.fit(df_train).transform(df_train)

encoder = OneHotEncoder(inputCol='city_', outputCol='cityVect')
df_train = encoder.transform(df_train)

In [9]:
lr_features = ['year', 'weekofyear',
               'ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw',
               'reanalysis_air_temp_k','reanalysis_avg_temp_k',
               'reanalysis_dew_point_temp_k', 'reanalysis_max_air_temp_k',
               'reanalysis_min_air_temp_k', 'reanalysis_precip_amt_kg_per_m2',
               'reanalysis_relative_humidity_percent', 'reanalysis_sat_precip_amt_mm', 
               'reanalysis_specific_humidity_g_per_kg', 'reanalysis_tdtr_k',
               'station_avg_temp_c','station_diur_temp_rng_c',
               'station_max_temp_c', 'station_min_temp_c',
               'station_precip_mm', 'cityVect']

In [10]:
vectorAssembler = VectorAssembler(inputCols=lr_features, outputCol = 'features')

In [11]:
df_train_vectorised = vectorAssembler.transform(df_train)
df_train_vectorised.select('features').show(10)

+--------------------+
|            features|
+--------------------+
|[1990.0,18.0,0.12...|
|[1990.0,19.0,0.16...|
|[1990.0,20.0,0.03...|
|[1990.0,21.0,0.12...|
|[1990.0,22.0,0.19...|
|[1990.0,24.0,0.11...|
|[1990.0,25.0,0.07...|
|[1990.0,26.0,0.10...|
|[1990.0,28.0,0.19...|
|[1990.0,29.0,0.29...|
+--------------------+
only showing top 10 rows



In [12]:
scaler = StandardScaler(inputCol='features', outputCol="scaled_features",
                        withStd=True, withMean=True)

scaler_model = scaler.fit(df_train_vectorised)

df_train = scaler_model.transform(df_train_vectorised)

In [16]:
train, test = df_train.randomSplit([0.8, 0.2], seed=42)

In [17]:
lr = LinearRegression(featuresCol='scaled_features',
                      labelCol='total_cases')

In [18]:
model_lr = lr.fit(train)

In [20]:
pred_lr = model_lr.transform(test)

In [22]:
pred_lr.select(['total_cases','prediction']).show(150)

+-----------+--------------------+
|total_cases|          prediction|
+-----------+--------------------+
|        0.0|   6.030431526858896|
|        0.0|   13.61870596353615|
|        0.0|   0.686327739799399|
|        0.0|   9.005785253557189|
|        1.0|  1.8895961110443125|
|        0.0|   9.224592653304002|
|        0.0| -1.7899907901514638|
|        0.0|   5.356461604192749|
|        0.0|  -5.319194196427752|
|        0.0|   4.354922355980015|
|        0.0|   6.400502280258307|
|       16.0|  1.9361692674875002|
|       10.0|  16.046465116206647|
|       10.0|     17.718635388772|
|        4.0|   5.615116723911779|
|        5.0| -2.2837708111909585|
|        0.0|  25.433327360697934|
|        1.0|   6.572743171989659|
|        1.0|  0.5675129046281917|
|        1.0|   6.989582962190848|
|        0.0| -2.9075975638428737|
|        2.0|   7.241111595167787|
|        5.0|   24.86825683072162|
|        8.0|  13.240566683100967|
|        3.0|  -5.928593578295011|
|        3.0|  2.736

In [21]:
rmse = model_lr.summary.rootMeanSquaredError
r2 = model_lr.summary.r2

print("rmse = {:.3f} / r2 = {:.3f}".format(rmse, r2))

rmse = 26.461 / r2 = 0.234


In [23]:
evaluator = RegressionEvaluator(labelCol='total_cases',
                                predictionCol='prediction',
                                metricName='mae')

mae_lr = evaluator.evaluate(pred_lr)

print("mae = {:.3f}".format(mae_lr))

mae = 16.286


### Random forest

In [24]:
rf = RandomForestRegressor(featuresCol='scaled_features', labelCol='total_cases')

In [25]:
model_rf = rf.fit(train)

In [26]:
pred_rf = model_rf.transform(test)

In [28]:
mae_rf = evaluator.evaluate(pred_rf)

print("mae = {:.3f}".format(mae_rf))

mae = 13.349
