The first step in this tutorial, is to download _bank.csv_. This file is already uploaded into "https://s3.amazonaws.com/apache-zeppelin/tutorial/bank/bank.csv" and you can download it

In [6]:
bankDataset = sc.textFile("file:///home/cloudera/Downloads/bank.csv")

In [7]:
bankDataset.take(10)

[u'"age";"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"',
 u'30;"unemployed";"married";"primary";"no";1787;"no";"no";"cellular";19;"oct";79;1;-1;0;"unknown";"no"',
 u'33;"services";"married";"secondary";"no";4789;"yes";"yes";"cellular";11;"may";220;1;339;4;"failure";"no"',
 u'35;"management";"single";"tertiary";"no";1350;"yes";"no";"cellular";16;"apr";185;1;330;1;"failure";"no"',
 u'30;"management";"married";"tertiary";"no";1476;"yes";"yes";"unknown";3;"jun";199;4;-1;0;"unknown";"no"',
 u'59;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";226;1;-1;0;"unknown";"no"',
 u'35;"management";"single";"tertiary";"no";747;"no";"no";"cellular";23;"feb";141;2;176;3;"failure";"no"',
 u'36;"self-employed";"married";"tertiary";"no";307;"yes";"no";"cellular";14;"may";341;1;330;2;"other";"no"',
 u'39;"technician";"married";"secondary";"no";147;"yes";"no";"cellular";6;"may";151;2;-1

Data Cleaning Using Map and Filter
In the following cells, you create a DataFrame programmatically with three steps.

+ Create an RDD of tuples from the original bankDataset 
+ Create the schema represented by a StructType matching the structure of tuples in the RDD created in the step 1. 
+ Apply the schema to the RDD via createDataFrame method provided by SQLContext. 

In [8]:
#bankRDD = bankDataset.map(lambda s: s.split(";")).filter(lambda s: s[0] != "\"age\"").map(lambda s:(int(s[0]), str(s[1]).replace("\"", ""), str(s[2]).replace("\"", ""), str(s[3]).replace("\"", ""), int(s[5]) ))
bankRDD = bankDataset.map(lambda s: s.split(";"))

In [12]:
bankRDD.take(1)

[[u'"age"',
  u'"job"',
  u'"marital"',
  u'"education"',
  u'"default"',
  u'"balance"',
  u'"housing"',
  u'"loan"',
  u'"contact"',
  u'"day"',
  u'"month"',
  u'"duration"',
  u'"campaign"',
  u'"pdays"',
  u'"previous"',
  u'"poutcome"',
  u'"y"']]

In [13]:
bankRDD  = bankRDD .filter(lambda s: s[0] != "\"age\"")

In [14]:
bankRDD.take(1)

[[u'30',
  u'"unemployed"',
  u'"married"',
  u'"primary"',
  u'"no"',
  u'1787',
  u'"no"',
  u'"no"',
  u'"cellular"',
  u'19',
  u'"oct"',
  u'79',
  u'1',
  u'-1',
  u'0',
  u'"unknown"',
  u'"no"']]

In [15]:
bankRDD = bankRDD.map(lambda s:(int(s[0]), str(s[1]).replace("\"", ""), str(s[2]).replace("\"", ""), str(s[3]).replace("\"", ""), int(s[5]) ))

In [17]:
bankRDD.take(2)

[(30, 'unemployed', 'married', 'primary', 1787),
 (33, 'services', 'married', 'secondary', 4789)]

Dataframe Schema
+ We created an RDD of touples from the original file, in the previous step; for example, a touple was exposed as (30, 'unemployed', 'married', 'primary', 1787).
+ Now, you create a schema by a StructType matching the structure of tuples in the bankRDD. StructType is the data type representing rows. 
+ A StructType object comprises a list of StructFields. 
+ Then, apply the schema to the RDD via createDataFrame method provided by SQLContext.

In [19]:
from pyspark.sql.types import *

In [20]:
bankSchema = StructType([StructField("age", IntegerType(), False),StructField("job", StringType(), False),StructField("marital", StringType(), False),StructField("education", StringType(), False),StructField("balance", IntegerType(), False)])

In [22]:
# Spark SQL
bankdf = sqlContext.createDataFrame(bankRDD,bankSchema)


In [23]:
bankdf.show(5)

+---+-----------+-------+---------+-------+
|age|        job|marital|education|balance|
+---+-----------+-------+---------+-------+
| 30| unemployed|married|  primary|   1787|
| 33|   services|married|secondary|   4789|
| 35| management| single| tertiary|   1350|
| 30| management|married| tertiary|   1476|
| 59|blue-collar|married|secondary|      0|
+---+-----------+-------+---------+-------+
only showing top 5 rows



Register DataFrame as a Temporary table
+ Running following code, registers a temporary table, bank. Registering a DataFrame as a table allows you to run SQL queries over its data.

# Spark SQL Operations

In [25]:
bankdf.registerTempTable("bank")

In [26]:
bank_5 = sqlContext.sql("select * from bank LIMIT 5")

In [27]:
bank_5.printSchema()

root
 |-- age: integer (nullable = false)
 |-- job: string (nullable = false)
 |-- marital: string (nullable = false)
 |-- education: string (nullable = false)
 |-- balance: integer (nullable = false)



In [29]:
bank_3 = sqlContext.sql("select age, count(1) value "
"from bank " 
"where age < 30 " 
"group by age " 
"order by age")

In [30]:
bank_3.show()

+---+-----+
|age|value|
+---+-----+
| 19|    4|
| 20|    3|
| 21|    7|
| 22|    9|
| 23|   20|
| 24|   24|
| 25|   44|
| 26|   77|
| 27|   94|
| 28|  103|
| 29|   97|
+---+-----+



In [31]:
bank_4 = sqlContext.sql("select age, education, sum(balance) bal from bank "
"group by age, education")

In [32]:
bank_4.show()

+---+---------+------+
|age|education|   bal|
+---+---------+------+
| 55|  primary| 40416|
| 28|  primary|  1961|
| 58|secondary| 45864|
| 31|secondary|104499|
| 73|  unknown|   519|
| 52| tertiary| 18538|
| 46|  unknown|  9570|
| 25| tertiary| 27302|
| 19|  unknown|  1169|
| 66|  primary|   953|
| 39|  primary| 25197|
| 69|secondary|   745|
| 42|secondary| 83363|
| 57|  unknown|  6726|
| 63| tertiary|   133|
| 36| tertiary|130534|
| 77|  primary|  1360|
| 50|  primary| 42974|
| 80|secondary| 17070|
| 53|secondary| 62148|
+---+---------+------+
only showing top 20 rows



# DataFrame Operations

In [38]:
from pyspark.sql.functions import sum

In [40]:
bankdf_1 = bankdf.groupBy("age","education").agg(sum("balance").alias('balance'))

In [41]:
bankdf_1.show()

+---+---------+-------+
|age|education|balance|
+---+---------+-------+
| 55|  primary|  40416|
| 28|  primary|   1961|
| 58|secondary|  45864|
| 31|secondary| 104499|
| 73|  unknown|    519|
| 52| tertiary|  18538|
| 46|  unknown|   9570|
| 25| tertiary|  27302|
| 19|  unknown|   1169|
| 66|  primary|    953|
| 39|  primary|  25197|
| 69|secondary|    745|
| 42|secondary|  83363|
| 57|  unknown|   6726|
| 63| tertiary|    133|
| 36| tertiary| 130534|
| 77|  primary|   1360|
| 50|  primary|  42974|
| 80|secondary|  17070|
| 53|secondary|  62148|
+---+---------+-------+
only showing top 20 rows



In [42]:
from pyspark.sql.functions import col

In [47]:
#bankdf_2 = bankdf.filter(bankdf['age'] < 30).groupBy("age").count().sort(col("age").desc())

bankdf_2 = bankdf.filter(bankdf['age'] < 30).groupBy("age").count().sort(col("age"))

In [48]:
bankdf_2.show()

+---+-----+
|age|count|
+---+-----+
| 19|    4|
| 20|    3|
| 21|    7|
| 22|    9|
| 23|   20|
| 24|   24|
| 25|   44|
| 26|   77|
| 27|   94|
| 28|  103|
| 29|   97|
+---+-----+



In [49]:
bankdf.show(2)

+---+----------+-------+---------+-------+
|age|       job|marital|education|balance|
+---+----------+-------+---------+-------+
| 30|unemployed|married|  primary|   1787|
| 33|  services|married|secondary|   4789|
+---+----------+-------+---------+-------+
only showing top 2 rows



In [50]:
bankdf_3 = bankdf.withColumnRenamed("marital", "marital_status")\
                 .withColumnRenamed("balance","Account_balance")

In [51]:
bankdf_3.show(3)

+---+----------+--------------+---------+---------------+
|age|       job|marital_status|education|Account_balance|
+---+----------+--------------+---------+---------------+
| 30|unemployed|       married|  primary|           1787|
| 33|  services|       married|secondary|           4789|
| 35|management|        single| tertiary|           1350|
+---+----------+--------------+---------+---------------+
only showing top 3 rows



In [52]:
from pyspark.sql.functions import concat_ws

In [58]:
bankdf_4 = bankdf.select(concat_ws('_', bankdf['age'], bankdf['job']).alias('age_job'),'marital','balance')

In [59]:
bankdf_4.show(3)

+-------------+-------+-------+
|      age_job|marital|balance|
+-------------+-------+-------+
|30_unemployed|married|   1787|
|  33_services|married|   4789|
|35_management| single|   1350|
+-------------+-------+-------+
only showing top 3 rows



In [60]:
from pyspark.sql.functions import countDistinct

In [61]:
bankdf_5 = bankdf.agg(countDistinct(bankdf.job, bankdf.marital).alias('c'))

In [62]:
bankdf_5.show()

+---+
|  c|
+---+
| 35|
+---+



In [63]:
bankdf_6 = bankdf.agg(countDistinct(bankdf.marital).alias('c'))
bankdf_6.show()

+---+
|  c|
+---+
|  3|
+---+



In [64]:
from pyspark.sql.functions import datediff
df = sqlContext.createDataFrame([('2015-04-08','2015-05-10')], ['d1', 'd2'])
df.select(datediff(df.d2, df.d1).alias('diff')).collect()

[Row(diff=32)]

In [66]:
from pyspark.sql.functions import format_number
df = sqlContext.createDataFrame([(5.12349,)], ['a'])
df2 = df.select(format_number(df['a'], 4).alias('v'))
df2.show()

+------+
|     v|
+------+
|5.1235|
+------+



In [72]:
from pyspark.sql.functions import date_format
from pyspark.sql.functions import lit
import random

In [81]:
from pyspark.sql.functions import udf

In [85]:

def date_rand():
    date_YYYY = str(random.randint(1950, 2000))
    date_MM   = str(random.randint(1, 12))
    date_DD   = str(random.randint(1, 28))
    date_sep  = '-'
    return  date_YYYY + date_MM + date_DD 

udfdate_rand = udf(date_rand, StringType())

In [92]:
bankdf.show(3)

+---+----------+-------+---------+-------+
|age|       job|marital|education|balance|
+---+----------+-------+---------+-------+
| 30|unemployed|married|  primary|   1787|
| 33|  services|married|secondary|   4789|
| 35|management| single| tertiary|   1350|
+---+----------+-------+---------+-------+
only showing top 3 rows



In [89]:
bankdf_w_date = bankdf.withColumn("Date", udfdate_rand())

In [90]:
bankdf_w_date.show()

Py4JJavaError: An error occurred while calling o732.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 41.0 failed 1 times, most recent failure: Lost task 0.0 in stage 41.0 (TID 1629, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 139, in load_stream
    yield self._read_with_length(stream)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 164, in _read_with_length
    return self.loads(obj)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 422, in loads
    return pickle.loads(obj)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/types.py", line 1159, in <lambda>
    return lambda *a: dataType.fromInternal(a)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/types.py", line 568, in fromInternal
    return _create_row(self.names, values)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/types.py", line 1163, in _create_row
    row = Row(*values)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/types.py", line 1210, in __new__
    raise ValueError("No args or kwargs")
ValueError: (ValueError('No args or kwargs',), <function <lambda> at 0x7f87329c09b0>, ())

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
	at org.apache.spark.sql.execution.BatchPythonEvaluation$$anonfun$doExecute$1.apply(python.scala:405)
	at org.apache.spark.sql.execution.BatchPythonEvaluation$$anonfun$doExecute$1.apply(python.scala:370)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$20.apply(RDD.scala:710)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$20.apply(RDD.scala:710)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
	at org.apache.spark.scheduler.Task.run(Task.scala:89)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
	at scala.Option.foreach(Option.scala:236)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1843)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1856)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1869)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:212)
	at org.apache.spark.sql.execution.Limit.executeCollect(basicOperators.scala:165)
	at org.apache.spark.sql.execution.SparkPlan.executeCollectPublic(SparkPlan.scala:174)
	at org.apache.spark.sql.DataFrame$$anonfun$org$apache$spark$sql$DataFrame$$execute$1$1.apply(DataFrame.scala:1499)
	at org.apache.spark.sql.DataFrame$$anonfun$org$apache$spark$sql$DataFrame$$execute$1$1.apply(DataFrame.scala:1499)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:53)
	at org.apache.spark.sql.DataFrame.withNewExecutionId(DataFrame.scala:2086)
	at org.apache.spark.sql.DataFrame.org$apache$spark$sql$DataFrame$$execute$1(DataFrame.scala:1498)
	at org.apache.spark.sql.DataFrame.org$apache$spark$sql$DataFrame$$collect(DataFrame.scala:1505)
	at org.apache.spark.sql.DataFrame$$anonfun$head$1.apply(DataFrame.scala:1375)
	at org.apache.spark.sql.DataFrame$$anonfun$head$1.apply(DataFrame.scala:1374)
	at org.apache.spark.sql.DataFrame.withCallback(DataFrame.scala:2099)
	at org.apache.spark.sql.DataFrame.head(DataFrame.scala:1374)
	at org.apache.spark.sql.DataFrame.take(DataFrame.scala:1456)
	at org.apache.spark.sql.DataFrame.showString(DataFrame.scala:170)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:606)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
	at py4j.Gateway.invoke(Gateway.java:259)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:209)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 139, in load_stream
    yield self._read_with_length(stream)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 164, in _read_with_length
    return self.loads(obj)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 422, in loads
    return pickle.loads(obj)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/types.py", line 1159, in <lambda>
    return lambda *a: dataType.fromInternal(a)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/types.py", line 568, in fromInternal
    return _create_row(self.names, values)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/types.py", line 1163, in _create_row
    row = Row(*values)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/types.py", line 1210, in __new__
    raise ValueError("No args or kwargs")
ValueError: (ValueError('No args or kwargs',), <function <lambda> at 0x7f87329c09b0>, ())

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
	at org.apache.spark.sql.execution.BatchPythonEvaluation$$anonfun$doExecute$1.apply(python.scala:405)
	at org.apache.spark.sql.execution.BatchPythonEvaluation$$anonfun$doExecute$1.apply(python.scala:370)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$20.apply(RDD.scala:710)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$20.apply(RDD.scala:710)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
	at org.apache.spark.scheduler.Task.run(Task.scala:89)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
	... 1 more
