In [0]:
from pyspark.sql import SparkSession
# Create the SparkSession used by every cell below, or reuse one that is
# already active for this application name.
spark = (
    SparkSession.builder
    .appName("Spark_Select()")
    .getOrCreate()
)

In [0]:
# Column names for the flat sample DataFrame; each data row follows this order.
columns = ["firstname", "lastname", "country", "state"]
data = [
    ("James", "Smith", "USA", "CA"),
    ("Michael", "Rose", "USA", "NY"),
    ("Robert", "Williams", "USA", "CA"),
    ("Maria", "Jones", "USA", "FL"),
]

# Only column names are supplied as the schema, so the column types come from the data.
df = spark.createDataFrame(data=data, schema=columns)

# Print the resulting schema, then display up to 5 rows.
df.printSchema()
df.show(5)

root
 |-- firstname: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- country: string (nullable = true)
 |-- state: string (nullable = true)

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|    James|   Smith|    USA|   CA|
|  Michael|    Rose|    USA|   NY|
|   Robert|Williams|    USA|   CA|
|    Maria|   Jones|    USA|   FL|
+---------+--------+-------+-----+



### Select Single & Multiple Columns From a PySpark DataFrame

In [0]:
# Select a single column using attribute access on the DataFrame.
# Equivalent form using the column name as a string:
#   df.select("firstname").show()
df.select(df.firstname).show()

+---------+
|firstname|
+---------+
|    James|
|  Michael|
|   Robert|
|    Maria|
+---------+



In [0]:
# Select several columns by passing one DataFrame attribute per column.
# Equivalent form using column-name strings:
#   df.select("firstname","lastname").show()
df.select(df.firstname,df.lastname).show()

+---------+--------+
|firstname|lastname|
+---------+--------+
|    James|   Smith|
|  Michael|    Rose|
|   Robert|Williams|
|    Maria|   Jones|
+---------+--------+



In [0]:
# Select columns using bracket (item) access on the DataFrame.
# Single-column form of the same syntax:
#   df.select(df["firstname"]).show()
df.select(df["firstname"],df["lastname"]).show()

+---------+--------+
|firstname|lastname|
+---------+--------+
|    James|   Smith|
|  Michael|    Rose|
|   Robert|Williams|
|    Maria|   Jones|
+---------+--------+



In [0]:
# Select columns with the col() function, which builds each column reference
# from its name alone instead of going through the df object.
from pyspark.sql.functions import col
df.select(col("firstname"),col("lastname")).show()

+---------+--------+
|firstname|lastname|
+---------+--------+
|    James|   Smith|
|  Michael|    Rose|
|   Robert|Williams|
|    Maria|   Jones|
+---------+--------+



### Select All Columns

In [0]:
# "*" selects every column of the DataFrame.
df.select("*").show()

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|    James|   Smith|    USA|   CA|
|  Michael|    Rose|    USA|   NY|
|   Robert|Williams|    USA|   CA|
|    Maria|   Jones|    USA|   FL|
+---------+--------+-------+-----+



### Select Columns by Index

In [0]:
# List the DataFrame's column names. The result is a plain Python list, so it
# can be sliced by position; the following cells use such slices to select
# columns by index.
df.columns

Out[15]: ['firstname', 'lastname', 'country', 'state']

In [0]:
# Select the first 3 columns by slicing the column-name list.
# show() is called without a row limit here, so all rows of this small
# DataFrame are displayed.
df.select(df.columns[:3]).show()

+---------+--------+-------+
|firstname|lastname|country|
+---------+--------+-------+
|    James|   Smith|    USA|
|  Michael|    Rose|    USA|
|   Robert|Williams|    USA|
|    Maria|   Jones|    USA|
+---------+--------+-------+



In [0]:
# The slice [2:4] picks the column names at positions 2 and 3 (the 3rd and 4th
# columns); the display is limited to the top 3 rows.
df.select(*df.columns[2:4]).show(n=3)

+-------+-----+
|country|state|
+-------+-----+
|    USA|   CA|
|    USA|   NY|
|    USA|   CA|
+-------+-----+
only showing top 3 rows



## Select Nested Struct Columns from PySpark

In [0]:
from pyspark.sql.types import StructType,StructField,StringType

# Schema with a nested "name" struct (three string fields) followed by two
# flat string columns. Every field is nullable.
# NOTE(review): 'middelname' looks like a typo for 'middlename'; the spelling is
# kept because it is the field name that appears in the recorded outputs.
schema = (
    StructType()
    .add(
        "name",
        StructType()
        .add('firstname', StringType(), True)
        .add('middelname', StringType(), True)
        .add('lastname', StringType(), True),
    )
    .add("state", StringType(), True)
    .add("gender", StringType(), True)
)

# Each row is (name, state, gender), where name is itself a 3-tuple matching
# the nested struct's fields. None and "" are both used for missing name parts.
data = [
    (("James", None, "Smith"), "OH", "M"),
    (("Anna", "Rose", ""), "NY", "F"),
    (("Julia", "", "Williams"), "OH", "F"),
    (("Maria", "Anne", "Jones"), "NY", "M"),
    (("Jen", "Mary", "Brown"), "NY", "M"),
    (("Mike", "Mary", "Williams"), "OH", "M"),
]

# Rebinds df: from here on it refers to the nested-struct DataFrame.
df = spark.createDataFrame(data=data, schema=schema)
df.printSchema()
df.show()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middelname: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)

+--------------------+-----+------+
|                name|state|gender|
+--------------------+-----+------+
|{James, null, Smith}|   OH|     M|
|      {Anna, Rose, }|   NY|     F|
| {Julia, , Williams}|   OH|     F|
|{Maria, Anne, Jones}|   NY|     M|
|  {Jen, Mary, Brown}|   NY|     M|
|{Mike, Mary, Will...|   OH|     M|
+--------------------+-----+------+



In [0]:
# Selecting the struct column by name returns the whole struct as one column.
df.select("name").show()

+--------------------+
|                name|
+--------------------+
|{James, null, Smith}|
|      {Anna, Rose, }|
| {Julia, , Williams}|
|{Maria, Anne, Jones}|
|  {Jen, Mary, Brown}|
|{Mike, Mary, Will...|
+--------------------+



In [0]:
# Select a single field of the nested struct using a dotted path with col().
# Alternative forms for the same selection:
#   df.select("name.lastname").show()
#   df.select(df.name.lastname).show()
#   df.select(df["name"]["lastname"]).show()
df.select(col("name.lastname")).show()

+--------+
|lastname|
+--------+
|   Smith|
|        |
|Williams|
|   Jones|
|   Brown|
|Williams|
+--------+



In [0]:
# "name.*" expands the struct into one top-level column per nested field.
df.select("name.*").show()

+---------+----------+--------+
|firstname|middelname|lastname|
+---------+----------+--------+
|    James|      null|   Smith|
|     Anna|      Rose|        |
|    Julia|          |Williams|
|    Maria|      Anne|   Jones|
|      Jen|      Mary|   Brown|
|     Mike|      Mary|Williams|
+---------+----------+--------+

