# pandas DF

In [1]:
import pandas as pd

data = [['Scott', 50], ['Jeff', 45], ['Thomas', 54], ['Ann', 34]]

# Create the pandas DataFrame
pandasDF = pd.DataFrame(data, columns=['Name', 'Age'])

# print dataframe.
print(pandasDF)

     Name  Age
0   Scott   50
1    Jeff   45
2  Thomas   54
3     Ann   34


# pandas DF 转换成 spark DF

In [2]:
sparkDF = spark.createDataFrame(pandasDF)
sparkDF.printSchema()
sparkDF.show()

root
 |-- Name: string (nullable = true)
 |-- Age: long (nullable = true)

+------+---+
|  Name|Age|
+------+---+
| Scott| 50|
|  Jeff| 45|
|Thomas| 54|
|   Ann| 34|
+------+---+



# pandas DF 转换成 spark DF，并且指定列名

In [7]:
#sparkDF=spark.createDataFrame(pandasDF.astype(str))
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
mySchema = StructType([ StructField("First Name", StringType(), True)\
                       ,StructField("Age", IntegerType(), True)])

sparkDF2 = spark.createDataFrame(pandasDF, schema=mySchema)
sparkDF2.printSchema()
sparkDF2.show()

root
 |-- First Name: string (nullable = true)
 |-- Age: integer (nullable = true)

+----------+---+
|First Name|Age|
+----------+---+
|     Scott| 50|
|      Jeff| 45|
|    Thomas| 54|
|       Ann| 34|
+----------+---+



# spark DF 转换成 pandas DF，并且使用pyarrow

[pyarrow](https://spark.apache.org/docs/3.0.1/sql-pyspark-pandas-with-arrow.html)

Apache Arrow in PySpark
Apache Arrow is an in-memory columnar data format that is used in Spark to efficiently transfer data between JVM and Python processes. This currently is most beneficial to Python users that work with Pandas/NumPy data. Its usage is not automatic and might require some minor changes to configuration or code to take full advantage and ensure compatibility. This guide will give a high-level description of how to use Arrow in Spark and highlight any differences when working with Arrow-enabled data.

In [14]:
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
spark.conf.set("spark.sql.execution.arrow.pyspark.fallback.enabled", "true")

pandasDF2 = sparkDF2.select("*").toPandas

In [15]:
pandasDF2

<bound method PandasConversionMixin.toPandas of DataFrame[First Name: string, Age: int]>

In [16]:
spark.conf.get("spark.sql.execution.arrow.enabled")

'true'

In [17]:
spark.conf.get("spark.sql.execution.arrow.pyspark.fallback.enabled")

'true'