In [1]:
from pyspark.sql import SQLContext
from pyspark.sql import DataFrameNaFunctions
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import Binarizer
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer

In [5]:
# Wrap the already-running SparkContext (`sc`) in a SQLContext so we can
# work with DataFrames.
sqlContext = SQLContext(sc)

# Read the daily weather CSV through the spark-csv data source: the first
# row is used as the header and column types are inferred from the data.
df = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferSchema='true')
    .load('file:///home/cloudera/Downloads/big-data-4/daily_weather.csv')
)

# Display the column names that were read.
df.columns

['number',
 'air_pressure_9am',
 'air_temp_9am',
 'avg_wind_direction_9am',
 'avg_wind_speed_9am',
 'max_wind_direction_9am',
 'max_wind_speed_9am',
 'rain_accumulation_9am',
 'rain_duration_9am',
 'relative_humidity_9am',
 'relative_humidity_3pm']

In [6]:
# Input columns (all 9am measurements) that will be fed to the classifier.
featureColumns = [
    'air_pressure_9am',
    'air_temp_9am',
    'avg_wind_direction_9am',
    'avg_wind_speed_9am',
    'max_wind_direction_9am',
    'max_wind_speed_9am',
    'rain_accumulation_9am',
    'rain_duration_9am',
]

# Drop the 'number' column (it is not one of the features), then discard
# every row that still contains a missing value.
df = df.drop('number').na.drop()

# Show (row count, column count) of the cleaned DataFrame.
(df.count(), len(df.columns))

(1064, 10)

In [8]:
# Create a categorical 'label' column that says whether the 3pm humidity is
# "not low": 0.0 when relative_humidity_3pm is below 25%, 1.0 otherwise.
#
# Binarizer emits 1.0 only for values strictly greater than its threshold,
# so the threshold is placed just under 25 (24.99999) to make a reading of
# exactly 25 fall on the 1.0 side.
binarizer = Binarizer(
    threshold=24.99999,
    inputCol='relative_humidity_3pm',
    outputCol='label',
)
binarized_df = binarizer.transform(df)

# Spot-check the first rows: raw humidity next to the derived label.
binarized_df.select('relative_humidity_3pm', 'label').show(n=10)

+---------------------+-----+
|relative_humidity_3pm|label|
+---------------------+-----+
|   36.160000000000494|  1.0|
|     19.4265967985621|  0.0|
|   14.460000000000045|  0.0|
|   12.742547353761848|  0.0|
|    76.74000000000046|  1.0|
|   33.930000000000256|  1.0|
|   21.385656725200974|  0.0|
|    74.92000000000041|  1.0|
|   24.030000000000427|  0.0|
|     68.0500000000012|  1.0|
+---------------------+-----+
only showing top 10 rows



In [13]:
# Combine the individual feature columns into a single vector column named
# 'features', which is the column the classifier is configured to read.
assembler = VectorAssembler(
    inputCols=featureColumns,
    outputCol='features',
)
assembled = assembler.transform(binarized_df)

# Peek at the first assembled feature vectors.
assembled.select('features').show(n=10)

+--------------------+
|            features|
+--------------------+
|[918.060000000008...|
|[917.347688117709...|
|[923.040000000008...|
|[920.502751175919...|
|[921.160000000003...|
|[915.300000000006...|
|[915.598867513280...|
|[918.070000000002...|
|[920.080000000006...|
|[915.010000000011...|
+--------------------+
only showing top 10 rows



In [14]:
# Split the assembled rows roughly 80% / 20% into training and test sets.
# The fixed seed keeps the split repeatable between runs.
trainingData, testData = assembled.randomSplit([0.8, 0.2], seed=13234)

# Show how many rows landed in each split.
(trainingData.count(), testData.count())

(854, 210)

In [16]:
# Configure the decision tree classifier:
#   labelCol            - the column we are trying to predict
#   featuresCol         - the aggregated feature-vector column
#   maxDepth            - stopping criterion: maximum depth of the tree
#   minInstancesPerNode - stopping criterion: minimum number of samples
#                         in a node
#   impurity            - impurity measure used to split nodes
dt = DecisionTreeClassifier(
    labelCol='label',
    featuresCol='features',
    maxDepth=5,
    minInstancesPerNode=20,
    impurity='gini',
)

In [17]:
# Train the decision tree by running it as the only stage of a Pipeline;
# fitting the pipeline on the training split yields the trained model.
pipeline = Pipeline(stages=[dt])
model = pipeline.fit(trainingData)

In [18]:
# Apply the trained model to the held-out test split.
predictions = model.transform(testData)

# Compare predicted and actual labels for the first few test rows.
predictions.select('prediction', 'label').show(n=10)

+----------+-----+
|prediction|label|
+----------+-----+
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       0.0|  0.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
+----------+-----+
only showing top 10 rows



In [19]:
# Finally, save the predictions as CSV, keeping only the prediction and
# label columns.
#
# mode='overwrite' replaces output left behind by an earlier run. With the
# default save mode the write raises an error when the target path already
# exists, so this cell would fail the second time the notebook is run.
predictions.select('prediction', 'label').write.save(
    path="file:///home/cloudera/Downloads/big-data-4/predictions.csv",
    format="com.databricks.spark.csv",
    mode='overwrite',
    header='true',
)