Creating DataFrames from an RDD programmatically

Import findspark and initialize it with the Spark installation path,
then import pyspark.

In [1]:
# findspark is initialised before pyspark is imported; the argument passed to
# init() is the Spark installation directory.
# NOTE(review): '/usr/local/spark' is a hardcoded absolute path — adjust it to
# match the local Spark install.
import findspark
findspark.init('/usr/local/spark')
import pyspark

Start a SparkSession

In [2]:
from pyspark.sql import SparkSession

# Obtain the session for this notebook: getOrCreate() returns an already
# running session if there is one, otherwise it starts a new one under the
# given application name.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL example")
    .getOrCreate()
)

In [3]:
# Keep a handle on the session's SparkContext; the RDD calls in this notebook go through `sc`.
sc = spark.sparkContext

Create an RDD from the structured text file

In [4]:
# Read customers.txt into an RDD with one string element per line of the file.
clines = sc.textFile("customers.txt")

In [None]:
# Preview the first four lines of the RDD.
clines.take(4)

Import the type classes from pyspark.sql.types that are needed to build a StructType schema

In [5]:
# Import only the schema classes this notebook uses. A wildcard import would
# pull every public name of pyspark.sql.types into the notebook namespace and
# hide where StructType / StructField / StringType come from.
from pyspark.sql.types import StringType, StructField, StructType

In [6]:
def _split_tab_fields(line):
    """Split one raw text line into its tab-separated fields."""
    return line.split("\t")


def _first_five_fields(fields):
    """Return the first five fields as a tuple (IndexError if fewer are present)."""
    return (fields[0], fields[1], fields[2], fields[3], fields[4])


# Stage 1: each line becomes a list of fields.
cfields = clines.map(_split_tab_fields)
# Stage 2: each field list becomes a fixed-width 5-tuple.
customers = cfields.map(_first_five_fields)

In [None]:
# Bare expression: the notebook displays the RDD object's repr, not its contents.
customers

Encode the schema's column names as a space-separated string.

In [7]:
# Column names of the customer schema, joined into one space-separated string.
schemaString = " ".join(("cid", "cname", "ccity", "cstate", "czip"))

In [8]:
# One nullable string field per column name listed in schemaString.
ccolumns = [
    StructField(name=field_name, dataType=StringType(), nullable=True)
    for field_name in schemaString.split()
]
# Wrap the field list in a StructType so it can be used as a DataFrame schema.
schema = StructType(fields=ccolumns)

In [None]:
# Show the Python type of the field collection.
type(ccolumns)

Apply the schema to the RDD to create the DataFrame

In [9]:
# Pair the RDD of tuples with the explicit schema to build the DataFrame.
customerDF = spark.createDataFrame(customers, schema)

In [10]:
# Persist the DataFrame as Parquet under the path 'mycust'.
# The writer's default save mode raises an error when the target path already
# exists, so re-running this cell (e.g. Restart Kernel -> Run All) would fail.
# "overwrite" replaces any previous output and keeps the cell re-runnable.
customerDF.write.mode("overwrite").parquet('mycust')

In [None]:
# Print the DataFrame's columns with their types and nullability.
customerDF.printSchema()

In [11]:
# Read the Parquet data stored under 'mycust' back into a new DataFrame.
cust = spark.read.parquet('mycust')

In [None]:
# Display only the cname column (show() without arguments prints at most 20 rows).
customerDF.select("cname").show()

In [None]:
# Project the name and city columns by name and display the first 5 rows.
customerDF.select("cname", "ccity").show(5)

In [None]:
# Show the rows whose cstate equals 'CA'.
customerDF.filter(customerDF['cstate'] == 'CA').show()

In [13]:
# Same 'CA' filter, this time on the DataFrame read back from Parquet.
cust.filter(cust['cstate'] == 'CA').show()
# NOTE(review): explain() is called on `cust` itself, so it prints the plan of
# the plain Parquet read, not of the filtered query — confirm that is intended.
cust.explain()

+-----+----------------+---------------+------+-----+
|  cid|           cname|          ccity|cstate| czip|
+-----+----------------+---------------+------+-----+
| 5577|      Mary Smith|        Modesto|    CA|95350|
| 1745|      Mary Smith|Rowland Heights|    CA|91748|
|11444|Kathleen Patrick|      San Diego|    CA|92109|
| 8846|    Thomas Smith|          Indio|    CA|92201|
| 6237|  Bobby Anderson|       El Cajon|    CA|92020|
| 4085|       Mary Carr|  Panorama City|    CA|91402|
| 8705|  Patricia Smith|       Stockton|    CA|95207|
| 3669|       Mary Soto| San Bernardino|    CA|92410|
| 6101|      Mary Smith|    Los Angeles|    CA|90033|
|11697|  Jessica Thomas|  Laguna Niguel|    CA|92677|
| 1295|   Theresa Lopez|       Winnetka|    CA|91306|
| 4814|     Paul Suarez|    Simi Valley|    CA|93065|
| 8530|   William Smith|       Highland|    CA|92346|
| 3846|    Ronald Lewis|        Ontario|    CA|91764|
|10476|     John Hodges|       Cerritos|    CA|90703|
|10243|  Donna Anderson|    

In [None]:
# Count the rows for each distinct cstate value and display the result.
customerDF.groupBy("cstate").count().show()

Create a temp view so that SQL queries can be run

In [None]:
# Register the DataFrame under the view name "customers" so spark.sql() can query it.
customerDF.createOrReplaceTempView("customers")

In [None]:
# Per-state row counts, keeping only the states with at least 50 rows.
cStateCount50 = spark.sql("SELECT cstate, count(*) as sttcount FROM customers GROUP BY cstate HAVING sttcount>=50")

In [None]:
# Display the query result.
cStateCount50.show()

In [None]:
# Print the schema of the aggregated result.
cStateCount50.printSchema()

In [None]:
# Show the Python type of the object returned by spark.sql().
type(cStateCount50)

In [None]:
# Distribute a 20-element list of ints across 3 partitions.
rdd1 = sc.parallelize([1,2,3,4,5,6,7,8,9,10,20,40,30,88,78,66,77,44,84,22], numSlices=3)

In [None]:
# glom() turns each partition into a list, so collect() reveals how the data is split.
rdd1.glom().collect()

In [None]:
# Reduce to 2 partitions and inspect the resulting layout.
rdd1.coalesce(2).glom().collect()

In [None]:
# Number of elements in the RDD.
rdd1.count()

In [None]:
# coalesce() with its default shuffle=False cannot increase the number of
# partitions: rdd1 was created with 3, so asking for 6 leaves the layout at 3.
rdd1.coalesce(6).glom().collect()

In [None]:
# repartition(2) redistributes the data into 2 partitions using a shuffle.
rdd1.repartition(2).glom().collect()

In [None]:
# Shut down the SparkContext.
sc.stop()

In [None]:
# Stop the SparkSession.
# NOTE(review): sc.stop() is called in the preceding cell on this session's
# context, so one of the two stop calls looks redundant — confirm.
spark.stop()