In [115]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator

In [93]:
# Create the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("dengue")
    .getOrCreate()
)

In [94]:
# Load the training features. No schema inference is requested, so every
# column arrives as a string; numeric casting happens in a later cell.
# A month feature derived from week_start_date was sketched here but never
# enabled, so that column is simply dropped along with precipitation_amt_mm.
# NOTE(review): the reason for dropping precipitation_amt_mm is not recorded
# in this notebook -- confirm it is intentional.
df_features = (
    spark.read
    .csv("dengue_features_train.csv", header=True)
    .drop('precipitation_amt_mm', 'week_start_date')
)

In [95]:
# Load the training labels; as with the features, columns are read as strings.
df_labels = spark.read.csv(path="dengue_labels_train.csv", header=True)


In [96]:
# Attach the labels to the features. Rows are matched on the shared
# (city, year, weekofyear) key using the default inner join.
df_train = df_features.join(
    df_labels,
    on=['city', 'year', 'weekofyear'],
    how='inner',
)

In [99]:

# Cast every column except the categorical 'city' to float in a single
# projection (column order and names are kept), then discard any row that
# still contains a null.
df_train = df_train.select(
    *[
        df_train[name] if name == 'city' else df_train[name].cast('float').alias(name)
        for name in df_train.columns
    ]
).dropna()

In [100]:
# Map the 'city' strings to numeric category indices in a new 'city_' column.
indexer = StringIndexer(inputCol='city', outputCol='city_')
df_train = indexer.fit(df_train).transform(df_train)

# One-hot encode the city index into the 'cityVect' vector column.
# In Spark 3.x OneHotEncoder is an estimator that has to be fitted before it
# can transform, whereas in Spark 2.x it is a plain transformer without fit();
# pick whichever applies so the cell runs on both.
encoder = OneHotEncoder(inputCol='city_', outputCol='cityVect')
city_encoder = encoder.fit(df_train) if hasattr(encoder, 'fit') else encoder
df_train = city_encoder.transform(df_train)

In [101]:
# Columns that are assembled into the model's feature vector, in this order.
lr_features = [
    # calendar columns
    'year',
    'weekofyear',
    # ndvi_* columns
    'ndvi_ne',
    'ndvi_nw',
    'ndvi_se',
    'ndvi_sw',
    # reanalysis_* columns
    'reanalysis_air_temp_k',
    'reanalysis_avg_temp_k',
    'reanalysis_dew_point_temp_k',
    'reanalysis_max_air_temp_k',
    'reanalysis_min_air_temp_k',
    'reanalysis_precip_amt_kg_per_m2',
    'reanalysis_relative_humidity_percent',
    'reanalysis_sat_precip_amt_mm',
    'reanalysis_specific_humidity_g_per_kg',
    'reanalysis_tdtr_k',
    # station_* columns
    'station_avg_temp_c',
    'station_diur_temp_rng_c',
    'station_max_temp_c',
    'station_min_temp_c',
    'station_precip_mm',
    # one-hot encoded city vector
    'cityVect',
]

In [102]:
# Assembler that packs the lr_features columns into a single 'features' vector.
vectorAssembler = VectorAssembler(
    inputCols=lr_features,
    outputCol='features',
)

In [103]:
# Add the assembled 'features' vector column and preview the first ten vectors.
df_train_vectorised = vectorAssembler.transform(df_train)
df_train_vectorised.select(['features']).show(n=10)

+--------------------+
|            features|
+--------------------+
|[1990.0,18.0,0.12...|
|[1990.0,19.0,0.16...|
|[1990.0,20.0,0.03...|
|[1990.0,21.0,0.12...|
|[1990.0,22.0,0.19...|
|[1990.0,24.0,0.11...|
|[1990.0,25.0,0.07...|
|[1990.0,26.0,0.10...|
|[1990.0,28.0,0.19...|
|[1990.0,29.0,0.29...|
+--------------------+
only showing top 10 rows



In [104]:
# Standardise the assembled features (centre on the mean, scale to unit
# standard deviation) into a new 'scaled_features' column.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# made further down, so test rows contribute to the scaling statistics.
scaler = StandardScaler(
    inputCol='features',
    outputCol="scaled_features",
    withMean=True,
    withStd=True,
)
scaler_model = scaler.fit(df_train_vectorised)
df_train = scaler_model.transform(df_train_vectorised)

In [119]:
# Hold out roughly a quarter of the rows for evaluation.
# The seed is fixed so that re-running this cell (or the whole notebook)
# reproduces the same partition; without it every run draws a new split, and
# a model fitted before a re-run would be scored on rows it was trained on.
splits = df_train.randomSplit([0.75, 0.25], seed=42)
train_df, test_df = splits

In [107]:
# Linear regression that predicts 'total_cases' from the standardised features.
lr = LinearRegression(
    labelCol='total_cases',
    featuresCol='scaled_features',
)

In [108]:
# Fit the linear regression on the training partition.
model_lr = lr.fit(train_df)

In [109]:
# Score the held-out rows with the linear model. Any 'prediction' column left
# over from an earlier run of this cell is dropped first (a no-op when absent),
# because transform() refuses to add an output column that already exists;
# this keeps the cell safe to re-run.
test_df = model_lr.transform(test_df.drop('prediction'))

In [110]:
# Print actual vs. predicted case counts for up to 150 held-out rows.
test_df.select('total_cases', 'prediction').show(n=150)

+-----------+--------------------+
|total_cases|          prediction|
+-----------+--------------------+
|        0.0|   6.573863360729053|
|        0.0|   4.493926022235431|
|        0.0|   4.776195158121112|
|        1.0|  3.0849789830742758|
|        1.0|   5.937561869777575|
|        0.0|   9.885911717792286|
|        0.0|  25.509083241742033|
|        0.0|  20.817448281709503|
|        0.0|   5.842732919124536|
|        0.0|  3.1478864231767005|
|        0.0| -1.2409707570119544|
|        0.0| -0.8129613452059523|
|        0.0| 0.09587239138740955|
|        0.0|  5.9069486280872106|
|        0.0|  17.980440420836352|
|        0.0|   21.97157315586058|
|        0.0|   8.439627466836544|
|        1.0|   9.231121518592335|
|        0.0|  -4.758254880110517|
|        0.0|   4.885094912018268|
|        0.0|  3.5166695427551744|
|        0.0|   4.296730498068296|
|        0.0|  14.294515592965304|
|        1.0|   27.81042384624338|
|        1.0|  18.160321916563746|
|        4.0|   42.5

In [54]:
# Root-mean-squared error from the fitted model's training summary, i.e.
# measured on the data passed to fit(), not on the held-out test rows.
model_lr.summary.rootMeanSquaredError

41.9691519980088

In [114]:
# R-squared from the fitted model's training summary, i.e. measured on the
# data passed to fit(), not on the held-out test rows.
model_lr.summary.r2


0.24919849722816678

In [112]:
# Evaluator that reports the mean absolute error between the 'total_cases'
# label and the 'prediction' column of a scored DataFrame.
evaluator = RegressionEvaluator(
    metricName='mae',
    labelCol='total_cases',
    predictionCol='prediction',
)

In [113]:
# Mean absolute error of the predictions stored in test_df.
evaluator.evaluate(test_df)

15.078247695333266

In [116]:
# Random forest regressor that predicts 'total_cases' from the same
# standardised feature vector as the linear model.
# The forest is built with random sampling, so the seed is pinned to make the
# fitted model (and the metric reported below) reproducible across runs.
rf = RandomForestRegressor(
    featuresCol='scaled_features',
    labelCol='total_cases',
    seed=42,
)

In [117]:
# Fit the random forest on the training partition.
model_rf = rf.fit(train_df)

In [120]:
# Score the held-out rows with the random forest. When the notebook is run top
# to bottom, test_df already carries the linear model's 'prediction' column,
# and transform() raises if its output column exists; drop it first (a no-op
# when the column is absent) so the forest can write its own predictions.
test_df_rf = model_rf.transform(test_df.drop('prediction'))

In [121]:
# Mean absolute error of the random forest's predictions stored in test_df_rf.
evaluator.evaluate(test_df_rf)

9.646417440133542