# Chapter 6
## Putting structure on your Big Data with SparkSQL

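Taking away labels means that we are in unsupervised learning territory. Spark has great support for clustering and dimensionality reduction algorithms. In this chapter, however, we take a different angle: we put structure on the raw data by attaching a schema to it, so that it can be queried with SparkSQL.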

# Manipulating DataFrames with SparkSQL schemas

In [1]:
# Read the gzip-compressed KDD Cup file as an RDD of text lines.
# NOTE(review): `sc` is not defined anywhere in this notebook — presumably the
# SparkContext injected by the PySpark kernel/shell; confirm before Run All.
kdd_path = "./kddcup.data.gz"
raw_data = sc.textFile(kdd_path)

In [9]:
from pyspark.sql import Row, SQLContext

# Split every comma-separated input line into a list of string fields.
# Note: `csv` here is an RDD of field lists, not the stdlib `csv` module.
csv = raw_data.map(lambda line: line.split(","))

# Entry point for DataFrame creation and SQL queries, bound to the existing
# SparkContext `sc`.
sql_context = SQLContext(sc)

In [10]:
def _to_row(fields):
    """Build a Row from one split record.

    Uses only the first three fields: field 0 is parsed as an integer
    `duration`, fields 1 and 2 are kept as the strings `protocol` and
    `service`.
    """
    return Row(duration=int(fields[0]), protocol=fields[1], service=fields[2])


# One Row per input record, carrying just the three columns queried below.
rows = csv.map(_to_row)

In [11]:
# Turn the RDD of Row objects into a DataFrame; the column names come from the
# Row field names (duration, protocol, service).
df = sql_context.createDataFrame(rows)
# Expose the DataFrame to SQL queries under the table name "rdd".
# NOTE(review): registerTempTable is deprecated in newer Spark releases in
# favour of createOrReplaceTempView — confirm the target Spark version.
df.registerTempTable("rdd")

In [14]:
# Plain-SQL query against the "rdd" temp table: durations of TCP records whose
# duration exceeds 2000. `.show()` prints the first 20 matching rows.
duration_query = """SELECT duration FROM rdd WHERE protocol = 'tcp' AND duration > 2000"""
sql_context.sql(duration_query).show()

+--------+
|duration|
+--------+
|   12454|
|   10774|
|   13368|
|   10350|
|   10409|
|   14918|
|   10039|
|   15127|
|   25602|
|   13120|
|    2399|
|    6155|
|   11155|
|   12169|
|   15239|
|   10901|
|   15182|
|    9494|
|    7895|
|   11084|
+--------+
only showing top 20 rows



# Using the Spark DSL to build queries for structured data operations

In [15]:
# DataFrame-DSL equivalent of the SQL query: durations of TCP records whose
# duration exceeds 2000.
# Filter first, then project: `protocol` is not part of the selected output, so
# filtering on it after select("duration") would depend on Spark re-resolving a
# column that the projection has already dropped.
(
    df.filter((df.protocol == "tcp") & (df.duration > 2000))
    .select("duration")
    .show()
)

+--------+
|duration|
+--------+
|   12454|
|   10774|
|   13368|
|   10350|
|   10409|
|   14918|
|   10039|
|   15127|
|   25602|
|   13120|
|    2399|
|    6155|
|   11155|
|   12169|
|   15239|
|   10901|
|   15182|
|    9494|
|    7895|
|   11084|
+--------+
only showing top 20 rows

