In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import numpy as np
import pandas as pd

In [2]:
conf = SparkConf().setAppName('spark_pd').setMaster('local')
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

In [4]:
sc

In [3]:
df = spark.read.json('/usr/local/spark/examples/src/main/resources/people.json')

In [4]:
df

DataFrame[age: bigint, name: string]

In [5]:
df.show(3)

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [6]:
# spark, df are from the previous example
# Print the schema in a tree format
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [8]:
df.select(df['name'], df['age'] + 1).show()

+-------+---------+
|   name|(age + 1)|
+-------+---------+
|Michael|     null|
|   Andy|       31|
| Justin|       20|
+-------+---------+



In [9]:
df.filter(df['age'] > 21).show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



In [10]:
df.groupBy('age').count().show()

+----+-----+
| age|count|
+----+-----+
|  19|    1|
|null|    1|
|  30|    1|
+----+-----+



Temporary views in Spark SQL are session-scoped and will disappear if the session that creates it terminates.

In [11]:
df.createOrReplaceTempView("people")
sqlDF = spark.sql("select * from people")
sqlDF.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



Global temporary view is tied to a system preserved database global_temp, and we must use the qualified name to refer it, e.g. SELECT * FROM global_temp.view1

In [12]:
# Register the DataFrame as a global temporary view
df.createGlobalTempView("people")

In [13]:
spark.sql("select * from global_temp.people").show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [14]:
# Global temporary view is cross-session
spark.newSession().sql("SELECT * FROM global_temp.people").show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



Spark SQL支持两种将现有RDD转换为数据集的方法。 第一种方法使用反射来推断包含特定对象类型的RDD的架构。 这种基于反射的方法可以使代码更简洁，并且当您在编写Spark应用程序时已经了解架构时，可以很好地工作。

创建数据集的第二种方法是通过编程界面，该界面允许您构造模式，然后将其应用于现有的RDD。 尽管此方法较为冗长，但可以在运行时才知道列及其类型的情况下构造数据集。

## Inferring the Schema Using Reflection

Spark SQL can convert an RDD of Row objects to a DataFrame, inferring the datatypes

Spark SQL可以将Row对象的RDD转换为DataFrame，从而推断数据类型

In [7]:
from pyspark.sql import Row

In [8]:
lines = sc.textFile("/usr/local/spark/examples/src/main/resources/people.txt")
lines

/usr/local/spark/examples/src/main/resources/people.txt MapPartitionsRDD[9] at textFile at NativeMethodAccessorImpl.java:0

In [9]:
parts = lines.map(lambda l: l.split(','))
parts.take(1)

[['Michael', ' 29']]

In [10]:
# 将键/值对列表作为kwargs传递给Row类来构造行
people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))
people.take(1)

[Row(age=29, name='Michael')]

In [11]:
schemaPeople = spark.createDataFrame(people)
schemaPeople.createOrReplaceTempView("people")

In [12]:
# SQL can be run over DataFrames that have been registered as a table.
teenagers = spark.sql("select name from people where age between 13 and 19")
teenagers.show()

+------+
|  name|
+------+
|Justin|
+------+



In [20]:
teenagers.rdd

MapPartitionsRDD[47] at javaToPython at NativeMethodAccessorImpl.java:0

In [13]:
# The results of SQL queries are Dataframe objects.
# rdd returns the content as an :class:`pyspark.RDD` of :class:`Row`.
teen_names = teenagers.rdd.map(lambda p: "Name: " + p.name).collect()
for name in teen_names:
    print(name)

Name: Justin


## Programmatically Specifying the Schema

1. 从原始RDD创建元组或列表的RDD；
2. 在第1步中创建的RDD中，创建一个由StructType表示的模式来匹配元组或列表的结构。
3. 通过SparkSession提供的createDataFrame方法将架构应用于RDD。

NullType, StringType, BinaryType, BooleanType, DateType, TimestampType, DecimalType, DoubleType
FloatType, ByteType, IntegerType 32, LongType 64, ShortType 16, ArrayType, MapType

In [14]:
from pyspark.sql.types import *

# sc = spark.sparkContext
lines = sc.textFile("/usr/local/spark/examples/src/main/resources/people.txt")
parts = lines.map(lambda l: l.split(','))
#  Each line is converted to a tuple.
people = parts.map(lambda p: (p[0], int(p[1].strip())))


In [31]:
# The schema is encoded in a string.
schemaString = "name age"

In [15]:
# 字段名 字段类型 nullable
fields = [StructField('name', StringType(), True), StructField('age', ShortType(), True)]
schema = StructType(fields)

In [16]:
# Apply the schema to the RDD.
shemaPeople = spark.createDataFrame(people, schema)

In [17]:
shemaPeople.createOrReplaceTempView("people1")

In [18]:
shemaPeople.show()

+-------+---+
|   name|age|
+-------+---+
|Michael| 29|
|   Andy| 30|
| Justin| 19|
+-------+---+



In [19]:
spark.sql("select name from people1").show()

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+



# Generic Load/Save Functions

In [38]:
df = spark.read.load("/usr/local/spark/examples/src/main/resources/users.parquet")
df.select("name", "favorite_color").show()

+------+--------------+
|  name|favorite_color|
+------+--------------+
|Alyssa|          null|
|   Ben|           red|
+------+--------------+



In [39]:
df.select("name", "favorite_color").write.save("data/namesAndFavColors.parquet")

In [43]:
# 指定读取和保存 文件的格式
df = spark.read.load("/usr/local/spark/examples/src/main/resources/people.json", format="json")
df.select("name", "age").write.save("data/nameAndAges.parquet", format='parquet')

In [45]:
# 读取csv
df = spark.read.load("/usr/local/spark/examples/src/main/resources/people.csv", 
                    format="csv", sep=";", inferSchema=True, header=True)
df.show()

+-----+---+---------+
| name|age|      job|
+-----+---+---------+
|Jorge| 30|Developer|
|  Bob| 32|Developer|
+-----+---+---------+



In [46]:
# Run SQL on files directly
df = spark.sql("select * from parquet.`/usr/local/spark/examples/src/main/resources/users.parquet`")
df.show()

+------+--------------+----------------+
|  name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa|          null|  [3, 9, 15, 20]|
|   Ben|           red|              []|
+------+--------------+----------------+



### Bucketing, Sorting and Partitioning

In [20]:

# Bucketing and sorting are applicable only to persistent tables
df = spark.read.load("/usr/local/spark/examples/src/main/resources/people.json", format="json")
df.write.bucketBy(42, "name").sortBy('age').saveAsTable("people_bucketed")

AnalysisException: "Can not create the managed table('`people_bucketed`'). The associated location('file:/mnt/data1/workspace/data_analysis_mining/spark/tutorial/spark-warehouse/people_bucketed') already exists.;"

- saveAsTable(name, format=None, mode=None, partitionBy=None, **options)

    Saves the content of the DataFrame as the specified table.

- save(path=None, format=None, mode=None, partitionBy=None, **options)

    Saves the contents of the DataFrame to a data source.

In [21]:
spark.sql("select * from people_bucketed").show()

AnalysisException: 'Table or view not found: people_bucketed; line 1 pos 14'

In [22]:
#  partitioning can be used with both save and saveAsTable when using the Dataset APIs
df = spark.sql("select * from parquet.`/usr/local/spark/examples/src/main/resources/users.parquet`")
df.write.partitionBy("favorite_color").format("parquet").save("/data/namesPartByColor.parquet")

Py4JJavaError: An error occurred while calling o215.save.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:198)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:159)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:104)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:102)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:122)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:81)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:677)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:677)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:677)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:286)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:272)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:230)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 11.0 failed 1 times, most recent failure: Lost task 0.0 in stage 11.0 (TID 11, localhost, executor driver): org.apache.spark.SparkException: Task failed while writing rows.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:257)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:170)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:169)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.IOException: Mkdirs failed to create file:/data/namesPartByColor.parquet/_temporary/0/_temporary/attempt_20200727194403_0011_m_000000_11/favorite_color=__HIVE_DEFAULT_PARTITION__ (exists=false, cwd=file:/mnt/data1/workspace/data_analysis_mining/spark/tutorial)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:455)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:440)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:911)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:892)
	at org.apache.parquet.hadoop.util.HadoopOutputFile.create(HadoopOutputFile.java:74)
	at org.apache.parquet.hadoop.ParquetFileWriter.<init>(ParquetFileWriter.java:248)
	at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:390)
	at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:349)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.<init>(ParquetOutputWriter.scala:37)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anon$1.newInstance(ParquetFileFormat.scala:151)
	at org.apache.spark.sql.execution.datasources.DynamicPartitionDataWriter.newOutputWriter(FileFormatDataWriter.scala:236)
	at org.apache.spark.sql.execution.datasources.DynamicPartitionDataWriter.write(FileFormatDataWriter.scala:260)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:245)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:242)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1394)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:248)
	... 10 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:167)
	... 32 more
Caused by: org.apache.spark.SparkException: Task failed while writing rows.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:257)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:170)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:169)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: java.io.IOException: Mkdirs failed to create file:/data/namesPartByColor.parquet/_temporary/0/_temporary/attempt_20200727194403_0011_m_000000_11/favorite_color=__HIVE_DEFAULT_PARTITION__ (exists=false, cwd=file:/mnt/data1/workspace/data_analysis_mining/spark/tutorial)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:455)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:440)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:911)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:892)
	at org.apache.parquet.hadoop.util.HadoopOutputFile.create(HadoopOutputFile.java:74)
	at org.apache.parquet.hadoop.ParquetFileWriter.<init>(ParquetFileWriter.java:248)
	at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:390)
	at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:349)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.<init>(ParquetOutputWriter.scala:37)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anon$1.newInstance(ParquetFileFormat.scala:151)
	at org.apache.spark.sql.execution.datasources.DynamicPartitionDataWriter.newOutputWriter(FileFormatDataWriter.scala:236)
	at org.apache.spark.sql.execution.datasources.DynamicPartitionDataWriter.write(FileFormatDataWriter.scala:260)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:245)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:242)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1394)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:248)
	... 10 more


## Parquet Files

列存储格式


In [51]:
peopleDF = spark.read.json("/usr/local/spark/examples/src/main/resources/people.json")
# DataFrames can be saved as Parquet files, maintaining the schema information.
peopleDF.write.parquet("data/people.parquet")

In [53]:
# Parquet files are self-describing so the schema is preserved.
# The result of loading a parquet file is also a DataFrame.
parquetFile = spark.read.parquet("data/people.parquet")

In [55]:
parquetFile.createOrReplaceTempView("parquetFile")
teenagers = spark.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19")
teenagers.show()

+------+
|  name|
+------+
|Justin|
+------+



In [56]:
## Schema Merging
from pyspark.sql import Row

In [57]:
squaresDF = spark.createDataFrame(sc.parallelize(range(1, 6))
                                  .map(lambda i: Row(single=i, doule= i ** 2)))

In [58]:
squaresDF.write.parquet("data/test_table/key=1")

In [59]:
# Create another DataFrame in a new partition directory,
# adding a new column and dropping an existing column
cubesDF = spark.createDataFrame(sc.parallelize(range(6, 11))
                                .map(lambda i: Row(single=i, triple=i ** 3)))

In [60]:
cubesDF.write.parquet('data/test_table/key=2')

- option(key, value)

    Adds an input option for the underlying data source.

In [23]:
# Read the partitioned table
mergedDF = spark.read.option("mergeSchema", "true").parquet("data/test_table")
mergedDF.printSchema()

root
 |-- doule: long (nullable = true)
 |-- single: long (nullable = true)
 |-- triple: long (nullable = true)
 |-- key: integer (nullable = true)



In [24]:
mergedDF.show()

+-----+------+------+---+
|doule|single|triple|key|
+-----+------+------+---+
| null|     6|   216|  2|
| null|     7|   343|  2|
| null|     8|   512|  2|
| null|     9|   729|  2|
| null|    10|  1000|  2|
|    1|     1|  null|  1|
|    4|     2|  null|  1|
|    9|     3|  null|  1|
|   16|     4|  null|  1|
|   25|     5|  null|  1|
+-----+------+------+---+



In [None]:
# Metadata Refreshing
spark.catalog.refreshTable("my_table")

In [25]:
spark.sql("show tables").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        |   people|       true|
|        |  people1|       true|
+--------+---------+-----------+



In [26]:
spark.conf.set("spark.sql.orc.enableVectorizedReader", "true")

In [66]:
# A JSON dataset is pointed to by path.
# The path can be either a single text file or a directory storing text files
path = "/usr/local/spark/examples/src/main/resources/people.json"
peopleDF = spark.read.json(path)

In [67]:
# The inferred schema can be visualized using the printSchema() method
peopleDF.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [68]:
# Creates a temporary view using the DataFrame
peopleDF.createOrReplaceTempView("people")

In [69]:
# SQL statements can be run by using the sql methods provided by spark
teenagerNamesDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19")
teenagerNamesDF.show()

+------+
|  name|
+------+
|Justin|
+------+



In [75]:
import json
d1 = {'name':1, 'age':2}
josn_str = json.dumps(d1)
josn_str

'{"name": 1, "age": 2}'

In [78]:
# Alternatively, a DataFrame can be created for a JSON dataset represented by
# an RDD[String] storing one JSON object per string
# jsonStrings = ['{"name":"Yin","address":{"city":"Columbus","state":"Ohio"}}']
otherPeopleRDD = sc.parallelize([josn_str])

In [79]:
otherPeople = spark.read.json(otherPeopleRDD)
otherPeople.show()

+---+----+
|age|name|
+---+----+
|  2|   1|
+---+----+



## Hive Tables

In [27]:
sc.stop()

In [28]:
from os.path import expanduser, join, abspath

from pyspark.sql import SparkSession
from pyspark.sql import Row


In [29]:
warehouse_location = abspath('spark-warehouse')

In [30]:
warehouse_location

'/mnt/data1/workspace/data_analysis_mining/spark/tutorial/spark-warehouse'

In [31]:
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .enableHiveSupport() \
    .getOrCreate()

In [32]:
# spark is an existing SparkSession
spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING) USING hive")
# CREATE TABLE src(id int) USING hive OPTIONS(fileFormat 'parquet')
spark.sql("load data local inpath '/usr/local/spark/examples/src/main/resources/kv1.txt' into table src")

DataFrame[]

In [33]:
spark.sql("select count(*) from src").show()

+--------+
|count(1)|
+--------+
|    1000|
+--------+



In [34]:
results = spark.sql("select key, value from src where key < 10 order by key")
results.show()

+---+-----+
|key|value|
+---+-----+
|  0|val_0|
|  0|val_0|
|  0|val_0|
|  0|val_0|
|  0|val_0|
|  0|val_0|
|  2|val_2|
|  2|val_2|
|  4|val_4|
|  4|val_4|
|  5|val_5|
|  5|val_5|
|  5|val_5|
|  5|val_5|
|  5|val_5|
|  5|val_5|
|  8|val_8|
|  8|val_8|
|  9|val_9|
|  9|val_9|
+---+-----+



In [96]:
# The items in DataFrames are of type Row, which allows you to access each column by ordinal.
stringDS = results.rdd.map(lambda row: "Key: {}, Value: {}".format(row.key, row.value))
stringDS.take(2)

['Key: 0, Value: val_0', 'Key: 0, Value: val_0']

In [97]:
for record in stringDS.collect():
    print(record)

Key: 0, Value: val_0
Key: 0, Value: val_0
Key: 0, Value: val_0
Key: 2, Value: val_2
Key: 4, Value: val_4
Key: 5, Value: val_5
Key: 5, Value: val_5
Key: 5, Value: val_5
Key: 8, Value: val_8
Key: 9, Value: val_9


In [35]:
# You can also use DataFrames to create temporary views within a SparkSession.
Record = Row('key', 'Value')
recordsDF = spark.createDataFrame([Record(i, "val_" + str(i)) for i in range(1, 101)])
recordsDF

DataFrame[key: bigint, Value: string]

In [36]:
recordsDF.createOrReplaceTempView("records")
# Queries can then join DataFrame data with data stored in Hive.
spark.sql("SELECT * FROM records r JOIN src s ON r.key = s.key").show()

+---+------+---+------+
|key| Value|key| value|
+---+------+---+------+
|  2| val_2|  2| val_2|
|  2| val_2|  2| val_2|
|  4| val_4|  4| val_4|
|  4| val_4|  4| val_4|
|  5| val_5|  5| val_5|
|  5| val_5|  5| val_5|
|  5| val_5|  5| val_5|
|  5| val_5|  5| val_5|
|  5| val_5|  5| val_5|
|  5| val_5|  5| val_5|
|  8| val_8|  8| val_8|
|  8| val_8|  8| val_8|
|  9| val_9|  9| val_9|
|  9| val_9|  9| val_9|
| 10|val_10| 10|val_10|
| 10|val_10| 10|val_10|
| 11|val_11| 11|val_11|
| 11|val_11| 11|val_11|
| 12|val_12| 12|val_12|
| 12|val_12| 12|val_12|
+---+------+---+------+
only showing top 20 rows



## Apache Avro Data

spark-submit --packages org.apache.spark:spark-avro_2.12:2.4.6

spark-shell --packages org.apache.spark:spark-avro_2.12:2.4.6

In [102]:
df = spark.read.format("avro").load("/usr/local/spark/examples/src/main/resources/users.avro")
df.select("name", "favorite_color").write.format("avro").save("namesAndFavColors.avro")

AnalysisException: 'Failed to find data source: avro. Avro is built-in but external data source module since Spark 2.4. Please deploy the application as per the deployment section of "Apache Avro Data Source Guide".;'