In [1]:
!pip install pyspark



In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Build (or reuse) a SparkSession whose master is 'local[*]', then keep a
# handle on its SparkContext for the cells below that need `sc`.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)
sc = spark.sparkContext

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive so later cells can read files stored there.
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Change the working directory to the course folder on the mounted Drive,
# then list its contents.
%cd /content/drive/My Drive/BigDataColab25FallShalRitvikSinha
!ls

/content/drive/My Drive/BigDataColab25FallShalRitvikSinha
 10.Spark-ClassificationShalRitvikSinha.ipynb
 11_2.Spark-Classifier-evaluationShalRitvikSinha.ipynb
'6. MR-WordCountReducer_Shal_Ritvik_Sinha.ipynb'
 7.Spark-WordCountShal_Ritvik_Sinha.ipynb
 8.Spark-SQLShalRitvikSinha.ipynb
 8.Spark-StreamingShalRitvikSinha.ipynb
 9.Spark-Handling-missing-values.ShalRitvikSinha.ipynb
 ad-clicks.csv.gz
 Alice.txt
 BigDataShalRitvikSinhaTest1.ipynb
 BigDataShalRitvikSinhaTest2.ipynb
 buy-clicks.csv.gz
 Cheshire
 daily_weather.csv
 game-clicks.csv.gz
 hadoop-3.3.6
 hadoop-3.3.6.tar.gz
 hadoop-3.3.6.tar.gz.1
 join1_FileA.txt
 join1_FileB.txt
 join1_mapper.py
 join1_reducer.py
 join2_genchanA.txt
 join2_genchanB.txt
 join2_genchanc.txt
 join2_gennumA.txt
 join2_gennumB.txt
 join2_gennumC.txt
 join2_mapper.py
 join2_reducer.py
 make_join2data.py
 MT
 MT1_B
 MT2
 MT3
 MT_Q1_make_data.py
 MT_Q2_make_data.py
 MT_Q3_make_data.py
 out0
 out_4Data_Join
 out_4WC
 outDataJoin2
 out_Quiz3
 out_wordmedian
 pr

In [5]:
from pyspark.sql import SQLContext
from pyspark.sql import DataFrameNaFunctions
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import Binarizer
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer

In [6]:
# Load the daily weather CSV into a DataFrame. The first row is treated as the
# header and column types are inferred from the data.
#
# The read goes through Spark's built-in CSV reader on the existing session
# instead of the legacy 'com.databricks.spark.csv' source name (the external
# spark-csv package from the Spark 1.x era), and the options are real booleans
# rather than the strings 'true'.
sqlContext = SQLContext(sc)  # retained only so any cell that still references `sqlContext` keeps working
df = spark.read.csv(
    '/content/drive/My Drive/BigDataColab25FallShalRitvikSinha/daily_weather.csv',
    header=True,
    inferSchema=True,
)

# Display the column names as the cell output.
df.columns



['number',
 'air_pressure_9am',
 'air_temp_9am',
 'avg_wind_direction_9am',
 'avg_wind_speed_9am',
 'max_wind_direction_9am',
 'max_wind_speed_9am',
 'rain_accumulation_9am',
 'rain_duration_9am',
 'relative_humidity_9am',
 'relative_humidity_3pm']

Spark: What values are in the number column?

In [7]:
# Print every column's name, inferred type and nullability.
df.printSchema()

root
 |-- number: integer (nullable = true)
 |-- air_pressure_9am: double (nullable = true)
 |-- air_temp_9am: double (nullable = true)
 |-- avg_wind_direction_9am: double (nullable = true)
 |-- avg_wind_speed_9am: double (nullable = true)
 |-- max_wind_direction_9am: double (nullable = true)
 |-- max_wind_speed_9am: double (nullable = true)
 |-- rain_accumulation_9am: double (nullable = true)
 |-- rain_duration_9am: double (nullable = true)
 |-- relative_humidity_9am: double (nullable = true)
 |-- relative_humidity_3pm: double (nullable = true)



In [8]:
# Show the leading rows of the "number" column (show() prints 20 rows by default).
df.select("number").show()

+------+
|number|
+------+
|     0|
|     1|
|     2|
|     3|
|     4|
|     5|
|     6|
|     7|
|     8|
|     9|
|    10|
|    11|
|    12|
|    13|
|    14|
|    15|
|    16|
|    17|
|    18|
|    19|
+------+
only showing top 20 rows



In [9]:
# Summary statistics (count, mean, stddev, min, max) for the "number" column.
df.describe("number").show()

+-------+------------------+
|summary|            number|
+-------+------------------+
|  count|              1095|
|   mean|             547.0|
| stddev|316.24357700987383|
|    min|                 0|
|    max|              1094|
+-------+------------------+



Spark: With the original dataset split into 80% for training and 20% for testing,
how many of the first 20 samples from the test set were correctly classified?

In [11]:
from pyspark.sql import SQLContext
from pyspark.sql import DataFrameNaFunctions
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import Binarizer
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer

In [12]:
# Columns fed to the classifier as input features: the 9am pressure, temperature,
# wind and rain readings. 'number' and the two relative-humidity columns are
# not part of this list.
featureColumns = [
    'air_pressure_9am',
    'air_temp_9am',
    'avg_wind_direction_9am',
    'avg_wind_speed_9am',
    'max_wind_direction_9am',
    'max_wind_speed_9am',
    'rain_accumulation_9am',
    'rain_duration_9am',
]

In [13]:
# Discard every row that has a missing value in any column.
# From here on `df` refers to the cleaned data, not the raw file contents.
df = df.dropna()

In [14]:
# Derive the binary target column "label" from the 3pm relative humidity.
# Binarizer emits 1.0 for values strictly greater than the threshold and 0.0 otherwise.
binarizer = Binarizer(
    inputCol="relative_humidity_3pm",
    outputCol="label",
    threshold=24.99999,
)
binarizedDF = binarizer.transform(df)

In [15]:
# Pack the selected feature columns into a single vector column named "features".
assembler = VectorAssembler(
    outputCol="features",
    inputCols=featureColumns,
)
assembled = assembler.transform(binarizedDF)

In [16]:
# 80% / 20% train/test split of the assembled data, with a fixed seed.
trainingData, testData = assembled.randomSplit(weights=[0.8, 0.2], seed=13234)


In [17]:
# Decision-tree classifier configuration: Gini impurity, depth capped at 5,
# and at least 20 training instances required in each node.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
    impurity="gini",
    maxDepth=5,
    minInstancesPerNode=20,
)

In [18]:
from pyspark.ml import Pipeline
# Wrap the decision tree in a single-stage pipeline and fit it on the training split.
pipeline = Pipeline(stages=[dt])
model = pipeline.fit(trainingData)

In [19]:
# Apply the fitted pipeline to the held-out test split.
predictions =model.transform(testData)

In [21]:
# Show predicted vs. actual label side by side for 20 test rows.
predictions.select("prediction", "label").show(20)

+----------+-----+
|prediction|label|
+----------+-----+
|       1.0|  1.0|
|       0.0|  1.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  0.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  0.0|
|       0.0|  0.0|
+----------+-----+
only showing top 20 rows



In [22]:
# Take 20 (prediction, label) rows and count how many of them agree.
# NOTE(review): limit(20) is applied without an explicit ordering — confirm these
# are the same rows that show(20) displayed.
first_20 = predictions.select("prediction", "label").limit(20)
correct_count = first_20.where(first_20["prediction"] == first_20["label"]).count()
print(correct_count)


17


If we split the data using 70% for training data and 30% for test data, how
many samples would the training set have (using seed 13234)?

In [23]:
# 70% / 30% split with the same seed, used only to count the training rows.
# `df` at this point is the frame after rows with missing values were dropped.
#
# The result is bound to its own names instead of `trainingData` / `testData`.
# Those names hold the assembled 80/20 split that `model` and `predictions` were
# built from; overwriting them with frames that have no "label" / "features"
# columns would break any re-run of the fitting and prediction cells.
trainingData70, testData30 = df.randomSplit([0.7, 0.3], seed=13234)
training_count = trainingData70.count()


In [24]:
# Number of rows in the 70% training split.
print(training_count)

731
