In [1]:
pip install pyspark


Note: you may need to restart the kernel to use updated packages.


In [2]:
import pyspark
pyspark.__version__


'3.5.1'

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[1]").appName("SparkByExemples").getOrCreate()


In [6]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[1]").appName("Spark_Exemple").getOrCreate()
print(spark)
rdd = spark.sparkContext.parallelize([1,2,3,4,5,6])
print("RDD count:",+rdd.count())

<pyspark.sql.session.SparkSession object at 0x00000217EAB654D0>
RDD count: 6


## Creating PySpark DataFrame

In [7]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[2]').appName("Spark_Exe").getOrCreate()
print(spark)
data = [('James','','Smith','1991-04-01','M',3000),
  ('Michael','Rose','','2000-05-19','M',4000),
  ('Robert','','Williams','1978-09-05','M',4000),
  ('Maria','Anne','Jones','1967-12-01','F',4000),
  ('Jen','Mary','Brown','1980-02-17','F',-1)
]

columns = ["firstname","middlename","lastname","dob","gender","salary"] 
df = spark.createDataFrame(data = data,schema = columns)
df.show() # to get the default 20 rows of data

<pyspark.sql.session.SparkSession object at 0x00000217EAB654D0>
+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



df.printSchema()

### Change the column name

In [8]:
df1= df.withColumnRenamed("lastname","LName")
df1.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|   LName|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



In [9]:
df1.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- LName: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



### Creating DataFrame with nestered columns

In [10]:
from pyspark.sql.types import StringType, StructType, StructField,IntegerType
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Spark_Exe").getOrCreate()
data2 = dataDF = [(('James','','Smith'),'1991-04-01','M',3000),
  (('Michael','Rose',''),'2000-05-19','M',4000),
  (('Robert','','Williams'),'1978-09-05','M',4000),
  (('Maria','Anne','Jones'),'1967-12-01','F',4000),
  (('Jen','Mary','Brown'),'1980-02-17','F',-1)
]
schema2 = StructType([
    StructField("Name", StructType([
        StructField("firstname",StringType()),
        StructField("middlename",StringType()),
        StructField("Lastname",StringType())
    ])),
    StructField("DOB",StringType()),
    StructField("Gender",StringType()),
    StructField("Salary", IntegerType())
])
df2 = spark.createDataFrame(data = data2 , schema = schema2)
df2.printSchema()
df2.show()




root
 |-- Name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- Lastname: string (nullable = true)
 |-- DOB: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Salary: integer (nullable = true)

+--------------------+----------+------+------+
|                Name|       DOB|Gender|Salary|
+--------------------+----------+------+------+
|    {James, , Smith}|1991-04-01|     M|  3000|
|   {Michael, Rose, }|2000-05-19|     M|  4000|
|{Robert, , Williams}|1978-09-05|     M|  4000|
|{Maria, Anne, Jones}|1967-12-01|     F|  4000|
|  {Jen, Mary, Brown}|1980-02-17|     F|    -1|
+--------------------+----------+------+------+



## Using PySpark StructType – To rename a nested column in Dataframe

In [11]:
from pyspark.sql.functions import *
schema3 = StructType([
    StructField("fname",StringType()),
    StructField("middlename",StringType()),
    StructField("lname",StringType())])
df2.select(col("Name").cast(schema3),col("DOB"), col("Gender"), col("Salary")).printSchema()

root
 |-- Name: struct (nullable = true)
 |    |-- fname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lname: string (nullable = true)
 |-- DOB: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Salary: integer (nullable = true)



### Using Select – To rename nested elements.

In [12]:
from pyspark.sql.functions import *
df2.select(col("Name.firstname").alias("fname"),
           col("Name.middlename").alias("mname"),
           col("Name.lastname").alias("lname"),
           col("DOB"),
           col("Gender"),
           col("Salary")).printSchema()

root
 |-- fname: string (nullable = true)
 |-- mname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- DOB: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Salary: integer (nullable = true)



### Using PySpark DataFrame withColumn – To rename nested columns

In [13]:
df3 = df2.withColumn("fname",col("Name.firstname")).withColumn("mname",col("Name.middlename")).withColumn("lname",col("Name.lastname")).drop("Name")
df3.printSchema()


root
 |-- DOB: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- fname: string (nullable = true)
 |-- mname: string (nullable = true)
 |-- lname: string (nullable = true)



In [14]:
df3.show()

+----------+------+------+-------+-----+--------+
|       DOB|Gender|Salary|  fname|mname|   lname|
+----------+------+------+-------+-----+--------+
|1991-04-01|     M|  3000|  James|     |   Smith|
|2000-05-19|     M|  4000|Michael| Rose|        |
|1978-09-05|     M|  4000| Robert|     |Williams|
|1967-12-01|     F|  4000|  Maria| Anne|   Jones|
|1980-02-17|     F|    -1|    Jen| Mary|   Brown|
+----------+------+------+-------+-----+--------+



### Using toDF() – To change all columns in a PySpark DataFrame

In [15]:
new_columns = ['col1','col2','col3','col4']
df2.toDF(*new_columns).printSchema()

root
 |-- col1: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- Lastname: string (nullable = true)
 |-- col2: string (nullable = true)
 |-- col3: string (nullable = true)
 |-- col4: integer (nullable = true)



In [16]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
from pyspark.sql.functions import *

spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

dataDF = [(('James','','Smith'),'1991-04-01','M',3000),
  (('Michael','Rose',''),'2000-05-19','M',4000),
  (('Robert','','Williams'),'1978-09-05','M',4000),
  (('Maria','Anne','Jones'),'1967-12-01','F',4000),
  (('Jen','Mary','Brown'),'1980-02-17','F',-1)
]

schema = StructType([
        StructField('name', StructType([
             StructField('firstname', StringType(), True),
             StructField('middlename', StringType(), True),
             StructField('lastname', StringType(), True)
             ])),
         StructField('dob', StringType(), True),
         StructField('gender', StringType(), True),
         StructField('salary', IntegerType(), True)
         ])

df = spark.createDataFrame(data = dataDF, schema = schema)
df.printSchema()

# Example 1
df.withColumnRenamed("dob","DateOfBirth").printSchema()
# Example 2   
df2 = df.withColumnRenamed("dob","DateOfBirth") \
    .withColumnRenamed("salary","salary_amount")
df2.printSchema()

# Example 3 
schema2 = StructType([
    StructField("fname",StringType()),
    StructField("middlename",StringType()),
    StructField("lname",StringType())])
    
df.select(col("name").cast(schema2),
  col("dob"),
  col("gender"),
  col("salary")) \
    .printSchema()    

# Example 4 
df.select(col("name.firstname").alias("fname"),
  col("name.middlename").alias("mname"),
  col("name.lastname").alias("lname"),
  col("dob"),col("gender"),col("salary")) \
  .printSchema()
  
# Example 5
df4 = df.withColumn("fname",col("name.firstname")) \
      .withColumn("mname",col("name.middlename")) \
      .withColumn("lname",col("name.lastname")) \
      .drop("name")
df4.printSchema()

#Example 7
newColumns = ["newCol1","newCol2","newCol3","newCol4"]
df.toDF(*newColumns).printSchema()

# Example 6
'''
not working
old_columns = Seq("dob","gender","salary","fname","mname","lname")
new_columns = Seq("DateOfBirth","Sex","salary","firstName","middleName","lastName")
columnsList = old_columns.zip(new_columns).map(f=>{col(f._1).as(f._2)})
df5 = df4.select(columnsList:_*)
df5.printSchema()
'''

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- DateOfBirth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- DateOfBirth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary_amount: integer (nullable = true)

root
 |-- name: struct (nullable = true)
 |    |-- fname: string (nullable = true)
 |    |-- middlena

'\nnot working\nold_columns = Seq("dob","gender","salary","fname","mname","lname")\nnew_columns = Seq("DateOfBirth","Sex","salary","firstName","middleName","lastName")\ncolumnsList = old_columns.zip(new_columns).map(f=>{col(f._1).as(f._2)})\ndf5 = df4.select(columnsList:_*)\ndf5.printSchema()\n'

In [17]:
df5= df.withColumnRenamed("dob","DateOfBirth") \
    .withColumnRenamed("salary","salary_amount")
df5.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- DateOfBirth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary_amount: integer (nullable = true)

