In [None]:
# **** Note: Start the Spark master and slave (worker) before running the code below

    # Start master
        # C:\Spark\bin>spark-class org.apache.spark.deploy.master.Master

    # start Slave
        # C:\Spark\bin>spark-class2.cmd org.apache.spark.deploy.worker.Worker -c 1 -m 4G spark://10.0.0.4:7077

In [1]:
import findspark

# Raw string: in a plain literal '\S' is an invalid escape sequence, which
# Python reports with a warning. The resulting path value is unchanged.
findspark.init(r'C:\Spark')
findspark.find()

# pyspark is imported only after findspark.init() has located the Spark install.
import pyspark
from pyspark.sql import SparkSession

# Create Spark Session.
# NOTE(review): master("local[1]") is a local master URL, so this session does
# not connect to the standalone master/worker mentioned in the note above.
spark = SparkSession.builder.master("local[1]").appName('SparkByExamples.com').getOrCreate()

# Last expression of the cell: displays the session summary.
spark

c:\users\bmi_cims\appdata\local\programs\python\python36\lib\site-packages\numpy\.libs\libopenblas.TXA6YQSD3GCQQC22GEQ54J2UDCXDXHWN.gfortran-win_amd64.dll
c:\users\bmi_cims\appdata\local\programs\python\python36\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll
  stacklevel=1)


### Create DataFrame from SparkSession

In [31]:
# Sample rows and the column names that will label them
data = [
    ("Java", "20000"),
    ("Python", "100000"),
    ("Scala", "3000"),
]
columns = ["language", "users_count"]

# Distribute the local list as an RDD
rdd = spark.sparkContext.parallelize(data)

# toDF() without arguments: column names are generated automatically
df = rdd.toDF()

# toDF(columns): the same conversion, but with explicit column names
df = rdd.toDF(columns)
df.show()

+--------+-----------+
|language|users_count|
+--------+-----------+
|    Java|      20000|
|  Python|     100000|
|   Scala|       3000|
+--------+-----------+



In [3]:
from datetime import datetime, date
import pandas as pd
from pyspark.sql import Row

# Each Row carries an int, a float, a string, a date and a timestamp value;
# the DataFrame is built from these records without an explicit schema.
rows = [
    Row(a=1, b=2., c='string1', d=date(2000, 1, 1), e=datetime(2000, 1, 1, 12, 0)),
    Row(a=2, b=3., c='string2', d=date(2000, 2, 1), e=datetime(2000, 1, 2, 12, 0)),
    Row(a=4, b=5., c='string3', d=date(2000, 3, 1), e=datetime(2000, 1, 3, 12, 0)),
]
df = spark.createDataFrame(rows)

df.show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|
|  4|5.0|string3|2000-03-01|2000-01-03 12:00:00|
+---+---+-------+----------+-------------------+



In [None]:
## Writing entire data to different file formats
# `df` is the Spark DataFrame built in the previous cell (columns a..e).
# `data` is a plain Python list and has no `.write` / `.select`.
# mode('overwrite') keeps the cell re-runnable: the default save mode raises
# an error when the target path already exists.

# CSV
df.write.mode('overwrite').csv('dataset.csv')

# JSON
df.write.mode('overwrite').save('dataset.json', format='json')

# Parquet
df.write.mode('overwrite').save('dataset.parquet', format='parquet')

## Writing selected data to different file formats
# Only a subset of df's columns is exported. Separate output paths are used so
# the full exports written above are not replaced.
selected_columns = ['a', 'b', 'c']

# CSV
df.select(selected_columns).write.mode('overwrite').csv('dataset_selected.csv')

# JSON
df.select(selected_columns).write.mode('overwrite').save('dataset_selected.json', format='json')

# Parquet
df.select(selected_columns).write.mode('overwrite').save('dataset_selected.parquet', format='parquet')

### Create DataFrame from CSV File

In [25]:
# Load the Iris CSV with the reader's default options: no header option is
# set here, so the file's header line is shown as an ordinary data row.
csv_file = 'data/IRIS.csv'
df = spark.read.format('csv').load(csv_file)
df.show(5)

+------------+-----------+------------+-----------+-----------+
|         _c0|        _c1|         _c2|        _c3|        _c4|
+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|    species|
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|          3|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 5 rows



In [15]:
# Read the CSV file again, this time using its first line as the column names
df = spark.read.csv("data/IRIS.csv", header=True)

df.show(5)

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|    species|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|          3|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|           5|        3.6|         1.4|        0.2|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 5 rows



### Create DataFrame with schema

In [6]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType

# Records: (firstname, middlename, lastname, id, gender, salary)
data2 = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "Rose", "", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

# Explicit schema: every field is nullable; only salary is an integer
schema = StructType([
    StructField("firstname", StringType(), True),
    StructField("middlename", StringType(), True),
    StructField("lastname", StringType(), True),
    StructField("id", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("salary", IntegerType(), True),
])

# Build the DataFrame with that schema, then show its structure and contents
df = spark.createDataFrame(data=data2, schema=schema)
df.printSchema()
df.show(truncate=False)


root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|id   |gender|salary|
+---------+----------+--------+-----+------+------+
|James    |          |Smith   |36636|M     |3000  |
|Michael  |Rose      |        |40288|M     |4000  |
|Robert   |          |Williams|42114|M     |4000  |
|Maria    |Anne      |Jones   |39192|F     |4000  |
|Jen      |Mary      |Brown   |     |F     |-1    |
+---------+----------+--------+-----+------+------+



### Convert PySpark DataFrame to Pandas DataFrame

In [12]:
# toPandas() turns the PySpark DataFrame into a Python Pandas DataFrame.
pandasDF = df.toPandas()

# Plain-text rendering of the converted frame
print(pandasDF)

  firstname middlename  lastname     id gender  salary
0     James                Smith  36636      M    3000
1   Michael       Rose            40288      M    4000
2    Robert             Williams  42114      M    4000
3     Maria       Anne     Jones  39192      F    4000
4       Jen       Mary     Brown             F      -1


In [4]:
import pandas as pd
from datetime import datetime, date

# Column-oriented sample data: int, float, string, date and timestamp columns
column_values = {
    'a': [1, 2, 3],
    'b': [2., 3., 4.],
    'c': ['string1', 'string2', 'string3'],
    'd': [date(2000, 1, 1), date(2000, 2, 1), date(2000, 3, 1)],
    'e': [datetime(2000, 1, 1, 12, 0), datetime(2000, 1, 2, 12, 0), datetime(2000, 1, 3, 12, 0)],
}
pandas_df = pd.DataFrame(column_values)

# Hand the pandas frame to Spark and display the resulting DataFrame
df = spark.createDataFrame(pandas_df)
df.show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|
|  3|4.0|string3|2000-03-01|2000-01-03 12:00:00|
+---+---+-------+----------+-------------------+



### Create DataFrame from Data sources

In [None]:

# The same reader handles each source; only the format name and path change.
# Each assignment replaces the previous value of df2.

# 4.1 Creating DataFrame from CSV
df2 = spark.read.format("csv").load("/src/resources/file.csv")

# 4.2 Creating from text (TXT) file
df2 = spark.read.format("text").load("/src/resources/file.txt")

# 4.3 Creating from JSON file
df2 = spark.read.format("json").load("/src/resources/file.json")

### Other sources (Avro, Parquet, ORC, Kafka)

In [None]:
#5 PySpark Read and Write Parquet File

# One path for both the write and the read: previously the write went to
# "/tmp/out/..." while the read looked in "/temp/out/...", a different directory.
parquet_path = "/tmp/out/people.parquet"

# mode("overwrite") lets the cell be re-run; the default save mode raises an
# error when the target path already exists.
df.write.mode("overwrite").parquet(parquet_path)
parDF1=spark.read.parquet(parquet_path)

### Create Empty RDD in PySpark

In [7]:
# Build an RDD with no elements from the session's SparkContext
sc = spark.sparkContext
emptyRDD = sc.emptyRDD()
print(emptyRDD)

EmptyRDD[33] at emptyRDD at NativeMethodAccessorImpl.java:0


In [13]:
from pyspark.sql.types import StructType,StructField, StringType

# Start from an RDD that contains no elements
emptyRDD = spark.sparkContext.emptyRDD()
print(emptyRDD)

# Schema shared by the empty DataFrames below: three nullable string columns
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ('firstname', 'middlename', 'lastname')
])

# 1) Empty DataFrame from the empty RDD plus the schema
df = spark.createDataFrame(emptyRDD, schema)
df.printSchema()

# 2) Same result by converting the empty RDD directly
df1 = emptyRDD.toDF(schema)
df1.printSchema()

# 3) Empty DataFrame from an empty list plus the schema
df2 = spark.createDataFrame([], schema)
df2.printSchema()

# 4) Empty DataFrame with an empty schema (no columns at all)
df3 = spark.createDataFrame([], StructType([]))
df3.printSchema()

EmptyRDD[54] at emptyRDD at NativeMethodAccessorImpl.java:0
root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)

root



### Convert Spark Nested Struct DataFrame to Pandas

In [14]:
# Nested structure elements
from pyspark.sql.types import StructType, StructField, StringType,IntegerType

# Each record: ((firstname, middlename, lastname), dob, gender, salary)
dataStruct = [
    (("James", "", "Smith"), "36636", "M", "3000"),
    (("Michael", "Rose", ""), "40288", "M", "4000"),
    (("Robert", "", "Williams"), "42114", "M", "4000"),
    (("Maria", "Anne", "Jones"), "39192", "F", "4000"),
    (("Jen", "Mary", "Brown"), "", "F", "-1"),
]

# Inner struct describing the three parts of the name
nameStruct = StructType([
    StructField('firstname', StringType(), True),
    StructField('middlename', StringType(), True),
    StructField('lastname', StringType(), True),
])

# Outer schema: the nested name struct followed by three string columns
schemaStruct = StructType([
    StructField('name', nameStruct),
    StructField('dob', StringType(), True),
    StructField('gender', StringType(), True),
    StructField('salary', StringType(), True),
])

df = spark.createDataFrame(data=dataStruct, schema=schemaStruct)
df.printSchema()

# Convert to pandas and print the result, including the nested name column
pandasDF2 = df.toPandas()
print(pandasDF2)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)

                   name    dob gender salary
0      (James, , Smith)  36636      M   3000
1     (Michael, Rose, )  40288      M   4000
2  (Robert, , Williams)  42114      M   4000
3  (Maria, Anne, Jones)  39192      F   4000
4    (Jen, Mary, Brown)             F     -1


In [None]:
# Pyspark Write DataFrame to Parquet file format

# Column names, in the same order as the values in each record below
columns = ["firstname", "middlename", "lastname", "dob", "gender", "salary"]

data = [
    ("James ", "", "Smith", "36636", "M", 3000),
    ("Michael ", "Rose", "", "40288", "M", 4000),
    ("Robert ", "", "Williams", "42114", "M", 4000),
    ("Maria ", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

# Build the DataFrame and persist it in Parquet format
df = spark.createDataFrame(data, columns)
df.write.parquet("/tmp/output/people.parquet")


In [None]:
# Pyspark Read Parquet file into DataFrame
parDF = spark.read.parquet("/tmp/output/people.parquet")

# Append or Overwrite an existing Parquet file:
# first add df's rows to the existing output, then replace the output with df.
df.write.parquet("/tmp/output/people.parquet", mode='append')
df.write.parquet("/tmp/output/people.parquet", mode='overwrite')

In [None]:
# Executing SQL queries DataFrame

# parDF is the DataFrame read back from Parquet in the previous cell
# (the original referenced `parqDF`, a name that is never defined).
parDF.createOrReplaceTempView("ParquetTable")
parkSQL = spark.sql("select * from ParquetTable where salary >= 4000 ")
parkSQL.show(truncate=False)

# Creating a table on Parquet file
# CREATE OR REPLACE keeps the statement re-runnable: a plain CREATE TEMPORARY
# VIEW fails when a view with the same name already exists in the session.
spark.sql('CREATE OR REPLACE TEMPORARY VIEW PERSON USING parquet OPTIONS (path "/tmp/output/people.parquet")')
spark.sql("SELECT * FROM PERSON").show()

# Create Parquet partition file
df.write.partitionBy("gender","salary").mode("overwrite").parquet("/tmp/output/people2.parquet")

# Retrieving from a partitioned Parquet file (only the gender=M directory)
parDF2=spark.read.parquet("/tmp/output/people2.parquet/gender=M")
parDF2.show(truncate=False)

# Creating a table on Partitioned Parquet file (only the gender=F directory)
spark.sql('CREATE OR REPLACE TEMPORARY VIEW PERSON2 USING parquet OPTIONS (path "/tmp/output/people2.parquet/gender=F")')
spark.sql("SELECT * FROM PERSON2" ).show()


In [None]:
# Complete Example of PySpark read and write Parquet file

import pyspark
from pyspark.sql import SparkSession

spark=SparkSession.builder.appName("parquetFile").getOrCreate()

# Sample records; values follow the order of `columns` below
data =[("James ","","Smith","36636","M",3000),("Michael ","Rose","","40288","M",4000),
       ("Robert ","","Williams","42114","M",4000),("Maria ","Anne","Jones","39192","F",4000),
       ("Jen","Mary","Brown","","F",-1)]

columns=["firstname","middlename","lastname","dob","gender","salary"]
df=spark.createDataFrame(data,columns)

# Write the DataFrame as Parquet (replacing any earlier output) and read it back
df.write.mode("overwrite").parquet("/tmp/output/people.parquet")
parDF1=spark.read.parquet("/tmp/output/people.parquet")
parDF1.createOrReplaceTempView("parquetTable")
parDF1.printSchema()
parDF1.show(truncate=False)

# Query the temp view registered above
parkSQL = spark.sql("select * from ParquetTable where salary >= 4000 ")
parkSQL.show(truncate=False)

# CREATE OR REPLACE: a plain CREATE TEMPORARY VIEW fails when a view named
# PERSON is already registered in this session (an earlier cell creates one,
# and so does a re-run of this cell).
spark.sql('CREATE OR REPLACE TEMPORARY VIEW PERSON USING parquet OPTIONS (path "/tmp/output/people.parquet")')
spark.sql("SELECT * FROM PERSON").show()

# Write the same data partitioned by gender and salary
df.write.partitionBy("gender","salary").mode("overwrite").parquet("/tmp/output/people2.parquet")

# Read back only the gender=M partition directory
parDF2=spark.read.parquet("/tmp/output/people2.parquet/gender=M")
parDF2.show(truncate=False)

# View over the gender=F partition directory; OR REPLACE for the same reason as above
spark.sql('CREATE OR REPLACE TEMPORARY VIEW PERSON2 USING parquet OPTIONS (path "/tmp/output/people2.parquet/gender=F")')
spark.sql("SELECT * FROM PERSON2" ).show()
