

https://www.drivendata.org/competitions/44/dengai-predicting-disease-spread/

https://www.cdc.gov/dengue/

In [1]:
import findspark
findspark.init()

import pandas as pd
import numpy as np

from sklearn.metrics import mean_absolute_error

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
spark = SparkSession.builder.appName("dengue").getOrCreate()

### Load the data

In [3]:
path_to_data = "data/"


df_features = spark.read.csv(path_to_data + "dengue_features_train.csv", header=True)
#df_features['month'] = df_features['week_start_date'][5:7]

df_labels = spark.read.csv(path_to_data + "dengue_labels_train.csv", header=True)

In [4]:
df_features = df_features.drop('precipitation_amt_mm', 'week_start_date')

In [5]:
df_train = df_features.join(df_labels, ['city', 'year', 'weekofyear'])

In [6]:
for col_name in df_train.columns:
    if col_name not in ['city']:
        df_train = df_train.withColumn(col_name, df_train[col_name].cast('float'))

df_train = df_train.dropna()

In [7]:
#print("size of the data: {}".format(df_train.shape()))

df_train.show()

+----+------+----------+---------+---------+---------+---------+---------------------+---------------------+---------------------------+-------------------------+-------------------------+-------------------------------+------------------------------------+----------------------------+-------------------------------------+-----------------+------------------+-----------------------+------------------+------------------+-----------------+-----------+
|city|  year|weekofyear|  ndvi_ne|  ndvi_nw|  ndvi_se|  ndvi_sw|reanalysis_air_temp_k|reanalysis_avg_temp_k|reanalysis_dew_point_temp_k|reanalysis_max_air_temp_k|reanalysis_min_air_temp_k|reanalysis_precip_amt_kg_per_m2|reanalysis_relative_humidity_percent|reanalysis_sat_precip_amt_mm|reanalysis_specific_humidity_g_per_kg|reanalysis_tdtr_k|station_avg_temp_c|station_diur_temp_rng_c|station_max_temp_c|station_min_temp_c|station_precip_mm|total_cases|
+----+------+----------+---------+---------+---------+---------+---------------------+------

### Prepare data

In [8]:
indexer = StringIndexer(inputCol='city', outputCol='city_')
df_train = indexer.fit(df_train).transform(df_train)

encoder = OneHotEncoder(inputCol='city_', outputCol='cityVect')
df_train = encoder.transform(df_train)

In [9]:
lr_features = ['year', 'weekofyear',
               'ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw',
               'reanalysis_air_temp_k','reanalysis_avg_temp_k',
               'reanalysis_dew_point_temp_k', 'reanalysis_max_air_temp_k',
               'reanalysis_min_air_temp_k', 'reanalysis_precip_amt_kg_per_m2',
               'reanalysis_relative_humidity_percent', 'reanalysis_sat_precip_amt_mm', 
               'reanalysis_specific_humidity_g_per_kg', 'reanalysis_tdtr_k',
               'station_avg_temp_c','station_diur_temp_rng_c',
               'station_max_temp_c', 'station_min_temp_c',
               'station_precip_mm', 'cityVect']

In [10]:
vectorAssembler = VectorAssembler(inputCols=lr_features, outputCol = 'features')

In [11]:
df_train_vectorised = vectorAssembler.transform(df_train)
df_train_vectorised.select('features').show(10)

+--------------------+
|            features|
+--------------------+
|[1990.0,18.0,0.12...|
|[1990.0,19.0,0.16...|
|[1990.0,20.0,0.03...|
|[1990.0,21.0,0.12...|
|[1990.0,22.0,0.19...|
|[1990.0,24.0,0.11...|
|[1990.0,25.0,0.07...|
|[1990.0,26.0,0.10...|
|[1990.0,28.0,0.19...|
|[1990.0,29.0,0.29...|
+--------------------+
only showing top 10 rows



In [12]:
scaler = StandardScaler(inputCol='features', outputCol="scaled_features", withStd=True, withMean=True)
scaler_model = scaler.fit(df_train_vectorised)
df_train = scaler_model.transform(df_train_vectorised)

In [13]:
splits = df_train.randomSplit([0.75, 0.25])
train_df = splits[0]
test_df = splits[1]

In [14]:
lr = LinearRegression(featuresCol='scaled_features', labelCol='total_cases')

In [15]:
model_lr = lr.fit(train_df)

In [16]:
test_df = model_lr.transform(test_df)

In [17]:
test_df.select(['total_cases','prediction']).show(150)

+-----------+-------------------+
|total_cases|         prediction|
+-----------+-------------------+
|        0.0| 13.392213062635516|
|        0.0| 1.1559400817170307|
|        0.0|  1.939261090943095|
|        0.0| 1.7071342809228902|
|        0.0|   4.01719153155026|
|        1.0| 13.670067242363636|
|        0.0|  5.635427612681337|
|        1.0| 3.6606379234588786|
|        0.0|  19.22148668128794|
|        0.0| 20.897958686321637|
|        0.0|     2.937558174126|
|        0.0| 0.8856696657624497|
|        0.0|  7.983434209606287|
|        0.0| 10.691407598940733|
|        0.0|  4.321649511973622|
|        0.0| 0.2883016401596983|
|        0.0| -4.347725047282044|
|        0.0|  4.085940816247838|
|        0.0|  6.287071604941065|
|        0.0| 23.642298261411142|
|        4.0| 33.764857514679704|
|        1.0|   32.0463842535656|
|        4.0|  8.789853081798682|
|       23.0|  10.53843972558398|
|        7.0|  8.094874173132037|
|        4.0|  4.265535768521698|
|        6.0| 

In [21]:
rmse = model_lr.summary.rootMeanSquaredError
r2 = model_lr.summary.r2

print("rmse = {:.3f} / r2 = {:.3f}".format(rmse, r2))

rmse = 26.461 / r2 = 0.234


In [22]:
evaluator = RegressionEvaluator(labelCol='total_cases', predictionCol='prediction',
                               metricName='mae')

In [23]:
evaluator.evaluate(test_df)

16.007555559001958

In [24]:
rf = RandomForestRegressor(featuresCol='scaled_features', labelCol='total_cases')

In [25]:
model_rf = rf.fit(train_df)

In [26]:
test_df_rf = model_rf.transform(test_df)

IllegalArgumentException: 'requirement failed: Column prediction already exists.'

In [None]:
evaluator.evaluate(test_df_rf)