# This notebook shows how to save DataFrames to HDFS/Hive
The example shows:
- saving directly to HDFS
- saving as a Hive table using the metastore

In [1]:
# Display the SQL context used throughout this notebook. It is not created in
# any cell shown here (presumably the PySpark kernel provides it); the output
# below shows it is a HiveContext.
sqlContext

<pyspark.sql.context.HiveContext at 0x7fc9a5beaa50>

In [2]:
# Show the Spark version of the running SparkContext (`sc` is likewise not
# created in any cell shown here).
sc.version

u'1.5.2'

### Create a test DataFrame.

In [1]:
# Distribute the integers 0..99 as an RDD split into 5 slices.
# `range` replaces the Python-2-only `xrange` so this cell also runs on a
# Python 3 kernel; the resulting RDD contents are the same.
nums = sc.parallelize(range(0, 100), 5)

In [2]:
from pyspark.sql import Row

# Wrap every integer in a Row carrying a single field named `num`.
numsRDD = nums.map(lambda value: Row(num=value))

In [3]:
# Peek at one element to confirm the items are now Row objects.
numsRDD.take(1)

[Row(num=0)]

In [4]:
# Turn the RDD of Row objects into a DataFrame; `df` is what the save cells
# below write out.
df = sqlContext.createDataFrame(numsRDD)

### Save the DataFrame to HDFS in Parquet format.

In [7]:
# Write the DataFrame to the given HDFS path, replacing whatever is already
# there ("overwrite" mode). The Parquet format is requested explicitly so the
# output matches the heading above even if the session's default data source
# (spark.sql.sources.default) has been changed.
df.write.mode("overwrite").format("parquet").save("hdfs://hadoop:9000/user/hive/warehouse/nums")

### Save the DataFrame as a Hive/HDFS table.

In [5]:
# Save the DataFrame as a table named "bar", using "overwrite" mode so an
# existing table of that name is replaced. The "show tables" output further
# down lists `bar` with isTemporary = false.
df.write.mode("overwrite").saveAsTable("bar")

In [1]:
# List the tables visible to the SQL context; `bar` should be among them once
# the saveAsTable cell has run.
sqlContext.sql("show tables").show()

+---------+-----------+
|tableName|isTemporary|
+---------+-----------+
|      bar|      false|
|      foo|      false|
|      src|      false|
+---------+-----------+



In [6]:
# Read 10 rows back from the saved table to confirm the write worked.
# NOTE: the query has no ORDER BY, so the rows returned are not guaranteed to
# be the lowest values (the recorded output starts at 40, not 0).
sqlContext.sql("Select * from bar limit 10").show()

+---+
|num|
+---+
| 40|
| 41|
| 42|
| 43|
| 44|
| 45|
| 46|
| 47|
| 48|
| 49|
+---+

