In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('Column APi Methods').getOrCreate()

In [31]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType,DateType

schema = StructType([
    StructField('CustomerKey',StringType(),True),
    StructField('Prefix',StringType(),True),
    StructField('FirstName',StringType(),True),
    StructField('LastName',StringType(),True),
    StructField('BirthDate',DateType(),True),
    StructField('MaritalStatus',StringType(),True),
    StructField('Gender',StringType(),True),
    StructField('EmailAddress',StringType(),True),
    StructField('AnnualIncome',StringType(),True),
    StructField('EducationLevel',StringType(),True),
    StructField('Occupation',StringType(),True),
    StructField('HomeOwner',StringType(),True),
])

customers = spark.read.option('header','true').csv('customers.csv',schema = schema)
customers.printSchema()

root
 |-- CustomerKey: string (nullable = true)
 |-- Prefix: string (nullable = true)
 |-- FirstName: string (nullable = true)
 |-- LastName: string (nullable = true)
 |-- BirthDate: date (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- EmailAddress: string (nullable = true)
 |-- AnnualIncome: string (nullable = true)
 |-- EducationLevel: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- HomeOwner: string (nullable = true)



In [7]:
customers.show(5)

+-----------+------+---------+--------+---------+-------------+------+--------------------+------------+-------------+--------------+------------+---------+
|CustomerKey|Prefix|FirstName|LastName|BirthDate|MaritalStatus|Gender|        EmailAddress|AnnualIncome|TotalChildren|EducationLevel|  Occupation|HomeOwner|
+-----------+------+---------+--------+---------+-------------+------+--------------------+------------+-------------+--------------+------------+---------+
|      11000|   MR.|      JON|    YANG| 4/8/1966|            M|     M|jon24@adventure-w...|    $90,000 |            2|     Bachelors|Professional|        Y|
|      11001|   MR.|   EUGENE|   HUANG|5/14/1965|            S|     M|eugene10@adventur...|    $60,000 |            3|     Bachelors|Professional|        N|
|      11002|   MR.|    RUBEN|  TORRES|8/12/1965|            M|     M|ruben35@adventure...|    $60,000 |            3|     Bachelors|Professional|        Y|
|      11003|   MS.|  CHRISTY|     ZHU|2/15/1968|         

In [32]:
# Returns this column aliased with a new name or names 
# (in the case of expressions that return more than one column, such as explode)

alias_column = customers.select(customers.Gender.alias('GEN'))
alias_column.show(2)

+---+
|GEN|
+---+
|  M|
|  M|
+---+
only showing top 2 rows



In [33]:
customers.select(customers.FirstName).orderBy(customers.FirstName.asc()).show(4)

+---------+
|FirstName|
+---------+
|    AARON|
|    AARON|
|    AARON|
|    AARON|
+---------+
only showing top 4 rows



In [22]:
customers.select(customers.BirthDate.cast('date')).printSchema()

root
 |-- BirthDate: date (nullable = true)

