### Create PySpark RDD

In [1]:
from pyspark.sql import SparkSession
# Build the SparkSession used by every later cell, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

In [2]:
# Sample department records as (name, id) tuples.
dept = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]

# Turn the local Python list into an RDD via the session's SparkContext.
rdd = spark.sparkContext.parallelize(dept)

In [6]:
# Show the RDD's string representation and its Python class, one per line.
print(rdd, type(rdd), sep="\n")

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274
<class 'pyspark.rdd.RDD'>


In [4]:
# collect() returns the RDD's elements as a Python list; as the last
# expression in the cell, that list is displayed as the output.
rdd.collect()

[('Finance', 10), ('Marketing', 20), ('Sales', 30), ('IT', 40)]

Comparing with our first notebook: when we convert a DataFrame to an RDD we get a MapPartitionsRDD, whereas when we create an RDD from a collection such as a list we get a ParallelCollectionRDD.

MapPartitionsRDD.collect() gives us a list containing Row-type data.

ParallelCollectionRDD.collect() gives us a list containing simple data types such as str and int (here, tuples of them).


### Convert PySpark RDD to DataFrame

In [9]:
## Using the rdd.toDF() function.
## With no arguments the columns get the default names _1, _2, ...
df1 = rdd.toDF()
df1.show()

## To get named columns, pass a list of column names to toDF().
schemas = ["dept_name", "dept_id"]
df2 = rdd.toDF(schema=schemas)
df2.show()

+---------+---+
|       _1| _2|
+---------+---+
|  Finance| 10|
|Marketing| 20|
|    Sales| 30|
|       IT| 40|
+---------+---+

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|  Finance|     10|
|Marketing|     20|
|    Sales|     30|
|       IT|     40|
+---------+-------+



In [10]:
## Using spark.createDataFrame() with a list of column names.
## Only the names are supplied here; the printed schema shows the column types.
deptDF = spark.createDataFrame(rdd, schema=schemas)

deptDF.printSchema()
deptDF.show(truncate=False)

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: long (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+



In [11]:
## Using createDataFrame() with an explicit StructType schema

from pyspark.sql.types import StringType, StructField, StructType

# Both columns are declared as nullable strings. Note that dept_id therefore
# shows up as a string column in the printed schema, even though the source
# tuples hold integer ids.
deptSchema = StructType([
    StructField('dept_name', StringType(), nullable=True),
    StructField('dept_id', StringType(), nullable=True),
])

deptDF1 = spark.createDataFrame(rdd, schema=deptSchema)

deptDF1.printSchema()
deptDF1.show(truncate=False)

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: string (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+

