In [1]:
# Load pyspark
import findspark

findspark.init()

from pyspark import SparkContext

from pyspark.sql import SparkSession

from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import RandomForestRegressor

from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, StandardScaler

from pyspark.ml.evaluation import RegressionEvaluator

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from pyspark.ml import Pipeline

In [2]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_absolute_error

In [3]:
spark = SparkSession.builder.appName("dengue").getOrCreate()

In [4]:
path_to_data = "data/"

df_features = spark.read.csv(path_to_data + "dengue_features_train.csv",
                             header=True)
df_labels = spark.read.csv(path_to_data + "dengue_labels_train.csv",
                           header=True)

#df_features['month'] = df_features['week_start_date'][5:7]
df_features = df_features.drop('precipitation_amt_mm','week_start_date')
df_train = df_features.join(df_labels, ['city', 'year', 'weekofyear'])

In [5]:
train, test = df_train.randomSplit([0.75, 0.25], seed=18)

In [6]:
for col_name in train.columns:
    if col_name not in ['city']:
        train = train.withColumn(col_name, train[col_name].cast('float'))
        test = test.withColumn(col_name, test[col_name].cast('float'))
        
train = train.dropna()
test = test.dropna()
train.cache()

DataFrame[city: string, year: float, weekofyear: float, ndvi_ne: float, ndvi_nw: float, ndvi_se: float, ndvi_sw: float, reanalysis_air_temp_k: float, reanalysis_avg_temp_k: float, reanalysis_dew_point_temp_k: float, reanalysis_max_air_temp_k: float, reanalysis_min_air_temp_k: float, reanalysis_precip_amt_kg_per_m2: float, reanalysis_relative_humidity_percent: float, reanalysis_sat_precip_amt_mm: float, reanalysis_specific_humidity_g_per_kg: float, reanalysis_tdtr_k: float, station_avg_temp_c: float, station_diur_temp_rng_c: float, station_max_temp_c: float, station_min_temp_c: float, station_precip_mm: float, total_cases: float]

In [7]:
indexer = StringIndexer(inputCol='city', outputCol='city_')

encoder = OneHotEncoder(inputCol='city_', outputCol='cityVect')

vectorAssembler = VectorAssembler(
    inputCols=['year', 'weekofyear',
               'ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw',
               'reanalysis_air_temp_k','reanalysis_avg_temp_k',
               'reanalysis_dew_point_temp_k', 'reanalysis_max_air_temp_k',
               'reanalysis_min_air_temp_k', 'reanalysis_precip_amt_kg_per_m2',
               'reanalysis_relative_humidity_percent', 'reanalysis_sat_precip_amt_mm',
               'reanalysis_specific_humidity_g_per_kg', 'reanalysis_tdtr_k',
               'station_avg_temp_c','station_diur_temp_rng_c',
               'station_max_temp_c', 'station_min_temp_c', 
               'station_precip_mm', 'cityVect'], 
    outputCol = 'features')

scaler = StandardScaler(inputCol='features',
                        outputCol="scaled_features",
                        withStd=True, withMean=True)

lr = LinearRegression(featuresCol='scaled_features',
                      labelCol='total_cases',
                      predictionCol='lr_prediction')

rf = RandomForestRegressor(featuresCol='scaled_features',
                           labelCol='total_cases',
                           predictionCol='rf_prediction')

In [8]:
pipeline = Pipeline(stages=[indexer, encoder, vectorAssembler, scaler, rf])
#, encoder, vectorAssembler, scaler, rf

In [9]:
model_pipeline = pipeline.fit(train)

In [10]:
test_prediction = model_pipeline.transform(train)
test_prediction

DataFrame[city: string, year: float, weekofyear: float, ndvi_ne: float, ndvi_nw: float, ndvi_se: float, ndvi_sw: float, reanalysis_air_temp_k: float, reanalysis_avg_temp_k: float, reanalysis_dew_point_temp_k: float, reanalysis_max_air_temp_k: float, reanalysis_min_air_temp_k: float, reanalysis_precip_amt_kg_per_m2: float, reanalysis_relative_humidity_percent: float, reanalysis_sat_precip_amt_mm: float, reanalysis_specific_humidity_g_per_kg: float, reanalysis_tdtr_k: float, station_avg_temp_c: float, station_diur_temp_rng_c: float, station_max_temp_c: float, station_min_temp_c: float, station_precip_mm: float, total_cases: float, city_: double, cityVect: vector, features: vector, scaled_features: vector, rf_prediction: double]

In [11]:
paramGrid = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [2, 5, 10, 15, 20]) \
    .addGrid(rf.numTrees, [10, 20, 30, 40, 50]) \
    .build()

In [14]:
rf_evaluator = RegressionEvaluator(labelCol='total_cases',
                                   predictionCol='prediction',
                                   metricName="rmse")


crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=rf_evaluator,
                          numFolds=2,
                          seed=18)

In [15]:
cvModel = crossval.fit(train)

IllegalArgumentException: 'Field "prediction" does not exist.\nAvailable fields: city, year, weekofyear, ndvi_ne, ndvi_nw, ndvi_se, ndvi_sw, reanalysis_air_temp_k, reanalysis_avg_temp_k, reanalysis_dew_point_temp_k, reanalysis_max_air_temp_k, reanalysis_min_air_temp_k, reanalysis_precip_amt_kg_per_m2, reanalysis_relative_humidity_percent, reanalysis_sat_precip_amt_mm, reanalysis_specific_humidity_g_per_kg, reanalysis_tdtr_k, station_avg_temp_c, station_diur_temp_rng_c, station_max_temp_c, station_min_temp_c, station_precip_mm, total_cases, CrossValidator_40a7b7b35a13a77cd933_rand, city_, cityVect, features, scaled_features, rf_prediction'

In [None]:
p = cvModel.transform(train)

In [None]:
rf_evaluator.evaluate(p)

In [None]:
cvModel.bestModel.stages[-1].getNumTrees

In [None]:
##### df_train = indexer.fit(df_train).transform(df_train)

#df_train = encoder.transform(df_train)

In [None]:
model_lr = lr.fit(train_df)

In [None]:
test_df = model_lr.transform(test_df)

In [None]:
test_df.select(['total_cases','lr_prediction']).show(150)

In [None]:
model_lr.summary.rootMeanSquaredError

In [None]:
model_lr.summary.r2

In [None]:
lr_evaluator.evaluate(test_df)

In [None]:
model_rf = rf.fit(train_df)

In [None]:
test_df_rf = model_rf.transform(test_df)

In [None]:
model_pipeline = pipeline.fit(train_df)