# WITHCOLUMN() + WITHCOLUMNRENAMED()

In [33]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Start the Spark session used by every cell below, or reuse one that already exists.
spark = SparkSession.builder.appName("example-withcolumn-2").getOrCreate()


In [29]:
# Employee sample used by the withColumn()/withColumnRenamed() examples.
# dob is kept as a plain string and some name parts are empty strings.
columns = ["firstname", "middlename", "lastname", "dob", "gender", "salary"]

data = [
    ('James', '', 'Smith', '1991-04-01', 'M', 3000),
    ('Michael', 'Rose', '', '2000-05-19', 'M', 4000),
    ('Robert', '', 'Williams', '1978-09-05', 'M', 4000),
    ('Maria', 'Anne', 'Jones', '1967-12-01', 'F', 4000),
    ('Jen', 'Mary', 'Brown', '1980-02-17', 'F', -1),
]

# Only column names are supplied, so the column types come from the Python values.
df = spark.createDataFrame(data=data, schema=columns)

In [3]:
# Change a column's data type: passing an existing column name to withColumn()
# replaces that column, here with salary cast to Integer.
(
    df
    .withColumn("salary", col("salary").cast("Integer"))
    .show()
)

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



In [4]:
# Update the value of an existing column: salary is replaced by ten times its
# value. The result is only displayed; df itself is left untouched.
(
    df
    .withColumn("salary", col("salary") * 10)
    .show()
)

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M| 30000|
|  Michael|      Rose|        |2000-05-19|     M| 40000|
|   Robert|          |Williams|1978-09-05|     M| 40000|
|    Maria|      Anne|   Jones|1967-12-01|     F| 40000|
|      Jen|      Mary|   Brown|1980-02-17|     F|   -10|
+---------+----------+--------+----------+------+------+



In [5]:
# Add a derived column: new_salary holds salary squared, while the existing
# salary column is kept as it is (withColumn() with a new name appends a column).
(
    df
    .withColumn("new_salary", pow(col("salary"), lit(2)))
    .show()
)

+---------+----------+--------+----------+------+------+----------+
|firstname|middlename|lastname|       dob|gender|salary|new_salary|
+---------+----------+--------+----------+------+------+----------+
|    James|          |   Smith|1991-04-01|     M|  3000| 9000000.0|
|  Michael|      Rose|        |2000-05-19|     M|  4000|     1.6E7|
|   Robert|          |Williams|1978-09-05|     M|  4000|     1.6E7|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|     1.6E7|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|       1.0|
+---------+----------+--------+----------+------+------+----------+



In [6]:
# Rename a column: gender becomes sex; all other columns are unchanged.
# truncate=False prints the cell values in full.
(
    df
    .withColumnRenamed("gender", "sex")
    .show(truncate=False)
)

+---------+----------+--------+----------+---+------+
|firstname|middlename|lastname|dob       |sex|salary|
+---------+----------+--------+----------+---+------+
|James    |          |Smith   |1991-04-01|M  |3000  |
|Michael  |Rose      |        |2000-05-19|M  |4000  |
|Robert   |          |Williams|1978-09-05|M  |4000  |
|Maria    |Anne      |Jones   |1967-12-01|F  |4000  |
|Jen      |Mary      |Brown   |1980-02-17|F  |-1    |
+---------+----------+--------+----------+---+------+



In [None]:
# Drop Column From PySpark DataFrame
# `df` itself has no "new_salary" column: the earlier withColumn() results were
# only displayed, never assigned. drop() silently ignores names that are not in
# the schema, so the column is added first to make the drop observable.
(
    df
    .withColumn("new_salary", pow(col("salary"), lit(2)))
    .drop("new_salary")
    .show()
)

In [None]:
# Chain two withColumn() calls: the second expression can already refer to
# salary_increment because it is evaluated on the result of the first call.
(
    df
    .withColumn("salary_increment", expr("salary * 0.15"))
    .withColumn("new_salary", expr("salary + salary_increment"))
    .show()
)


Which of the following code blocks returns a DataFrame with an added column to DataFrame transactionsDf that shows the unix epoch timestamps in column transactionDate as strings in the format month/day/year in column transactionDateFormatted?
>

- `+-------------+---------+-----+-------+---------+----+---------------+`
- `|transactionId|predError|value|storeId|productId|f   |transactionDate|`
- `+-------------+---------+-----+-------+---------+----+---------------+`
- `|1            |3        |4    |25     |1        |null|1587915332     |`
- `|2            |6        |7    |2      |2        |null|1586815312     |`
- `|3            |3        |null |25     |3        |null|1585824821     |`
- `|4            |null     |null |3      |2        |null|1583244275     |`
- `|5            |null     |null |null   |2        |null|1575285427     |`
- `|6            |3        |2    |25     |2        |null|1572733275     |`
- `+-------------+---------+-----+-------+---------+----+---------------+`

>
- `transactionsDf.withColumn("transactionDateFormatted", from_unixtime("transactionDate", format="dd/MM/yyyy"))`
- `transactionsDf.withColumnRenamed("transactionDate", "transactionDateFormatted", from_unixtime("transactionDateFormatted", format="MM/dd/yyyy"))`
- `transactionsDf.apply(from_unixtime(format="MM/dd/yyyy")).asColumn("transactionDateFormatted")`
- `transactionsDf.withColumn("transactionDateFormatted", from_unixtime("transactionDate", format="MM/dd/yyyy"))`
- `transactionsDf.withColumn("transactionDateFormatted", from_unixtime("transactionDate"))`

In [7]:
# Rows for the transactionsDf quiz examples; None values show up as null.
data = [
    (1, 3, 4, 25, 1, None, 1587915332),
    (2, 6, 7, 2, 2, None, 1586815312),
    (3, 3, None, 25, 3, None, 1585824821),
    (4, None, None, 3, 2, None, 1583244275),
    (5, None, None, None, 2, None, 1575285427),
    (6, 3, 2, 25, 2, None, 1572733275),
]

# Explicit schema: six nullable integer columns followed by transactionDate,
# which carries the unix epoch timestamps and is declared as long.
schema = StructType(
    [
        StructField(name, IntegerType(), True)
        for name in ('transactionId', 'predError', 'value', 'storeId', 'productId', 'f')
    ]
    + [StructField('transactionDate', LongType(), True)]
)

transactionsDf = spark.createDataFrame(data=data, schema=schema)

In [8]:
# Confirm the declared column types, then display the rows.
transactionsDf.printSchema()
transactionsDf.show()

root
 |-- transactionId: integer (nullable = true)
 |-- predError: integer (nullable = true)
 |-- value: integer (nullable = true)
 |-- storeId: integer (nullable = true)
 |-- productId: integer (nullable = true)
 |-- f: integer (nullable = true)
 |-- transactionDate: long (nullable = true)

+-------------+---------+-----+-------+---------+----+---------------+
|transactionId|predError|value|storeId|productId|   f|transactionDate|
+-------------+---------+-----+-------+---------+----+---------------+
|            1|        3|    4|     25|        1|null|     1587915332|
|            2|        6|    7|      2|        2|null|     1586815312|
|            3|        3| null|     25|        3|null|     1585824821|
|            4|     null| null|      3|        2|null|     1583244275|
|            5|     null| null|   null|        2|null|     1575285427|
|            6|        3|    2|     25|        2|null|     1572733275|
+-------------+---------+-----+-------+---------+----+---------------+

In [9]:
# Option 1: runs, but formats as day/month/year, not the month/day/year
# format the question asks for.
transactionsDf.withColumn("transactionDateFormatted", from_unixtime("transactionDate", format="dd/MM/yyyy")).show()

+-------------+---------+-----+-------+---------+----+---------------+------------------------+
|transactionId|predError|value|storeId|productId|   f|transactionDate|transactionDateFormatted|
+-------------+---------+-----+-------+---------+----+---------------+------------------------+
|            1|        3|    4|     25|        1|null|     1587915332|              26/04/2020|
|            2|        6|    7|      2|        2|null|     1586815312|              13/04/2020|
|            3|        3| null|     25|        3|null|     1585824821|              02/04/2020|
|            4|     null| null|      3|        2|null|     1583244275|              03/03/2020|
|            5|     null| null|   null|        2|null|     1575285427|              02/12/2019|
|            6|        3|    2|     25|        2|null|     1572733275|              02/11/2019|
+-------------+---------+-----+-------+---------+----+---------------+------------------------+



In [10]:
# Option 4 (correct answer): adds transactionDateFormatted as a month/day/year string.
transactionsDf.withColumn("transactionDateFormatted", from_unixtime("transactionDate", format="MM/dd/yyyy")).show()

+-------------+---------+-----+-------+---------+----+---------------+------------------------+
|transactionId|predError|value|storeId|productId|   f|transactionDate|transactionDateFormatted|
+-------------+---------+-----+-------+---------+----+---------------+------------------------+
|            1|        3|    4|     25|        1|null|     1587915332|              04/26/2020|
|            2|        6|    7|      2|        2|null|     1586815312|              04/13/2020|
|            3|        3| null|     25|        3|null|     1585824821|              04/02/2020|
|            4|     null| null|      3|        2|null|     1583244275|              03/03/2020|
|            5|     null| null|   null|        2|null|     1575285427|              12/02/2019|
|            6|        3|    2|     25|        2|null|     1572733275|              11/02/2019|
+-------------+---------+-----+-------+---------+----+---------------+------------------------+



In [11]:
# Option 5: without a format argument the result is a full
# "yyyy-MM-dd HH:mm:ss" string (see output), not month/day/year.
# NOTE(review): the rendered times depend on the Spark session time zone;
# the recorded values are three hours behind UTC.
transactionsDf.withColumn("transactionDateFormatted", from_unixtime("transactionDate")).show()

+-------------+---------+-----+-------+---------+----+---------------+------------------------+
|transactionId|predError|value|storeId|productId|   f|transactionDate|transactionDateFormatted|
+-------------+---------+-----+-------+---------+----+---------------+------------------------+
|            1|        3|    4|     25|        1|null|     1587915332|     2020-04-26 12:35:32|
|            2|        6|    7|      2|        2|null|     1586815312|     2020-04-13 19:01:52|
|            3|        3| null|     25|        3|null|     1585824821|     2020-04-02 07:53:41|
|            4|     null| null|      3|        2|null|     1583244275|     2020-03-03 11:04:35|
|            5|     null| null|   null|        2|null|     1575285427|     2019-12-02 09:17:07|
|            6|        3|    2|     25|        2|null|     1572733275|     2019-11-02 19:21:15|
+-------------+---------+-----+-------+---------+----+---------------+------------------------+



In [12]:
# The two remaining answer options both fail. Execution stops at the first
# exception, so only the first error appears in the recorded output.

# TypeError: withColumnRenamed() takes 3 positional arguments but 4 were given
transactionsDf.withColumnRenamed("transactionDate", "transactionDateFormatted", from_unixtime("transactionDateFormatted", format="MM/dd/yyyy")).show()

# AttributeError: 'DataFrame' object has no attribute 'apply'
transactionsDf.apply(from_unixtime(format="MM/dd/yyyy")).asColumn("transactionDateFormatted").show()

TypeError: withColumnRenamed() takes 3 positional arguments but 4 were given

The code block displayed below contains an error. The code block should use Python method find_most_freq_letter to find the letter present most in column itemName of DataFrame itemsDf and return it in a new column most_frequent_letter. Find the error.
>
Code block:
>
- `find_most_freq_letter_udf = udf(find_most_freq_letter)`
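- `itemsDf.withColumn("most_frequent_letter", find_most_freq_letter("itemName"))`
>
Answer: the UDF `find_most_freq_letter_udf` is created but never used. The second line calls the plain Python function `find_most_freq_letter` on the string `"itemName"` instead of applying `find_most_freq_letter_udf` to the column.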

The code block shown below should return a copy of DataFrame transactionsDf without columns value and productId and with an additional column associateId that has the value 5. Choose the answer that correctly fills the blanks in the code block to accomplish this.
>
- `transactionsDf.__1__(__2__, __3__).__4__(__5__, 'value')`
>
- `1. withColumn 2. 'associateId' 3. 5 4. remove 5. 'productId'`
- `1. withNewColumn 2. associateId 3. lit(5) 4. drop 5. productId`
- `1. withColumn 2. 'associateId' 3. lit(5) 4. drop 5. 'productId'`
- `1. withColumnRenamed 2. 'associateId' 3. 5 4. drop 5. 'productId'`
- `1. withColumn 2. col(associateId) 3. lit(5) 4. drop 5. col(productId)`

In [13]:
# Correct answer (option 3): lit(5) supplies the constant as a Column, and
# drop() removes both productId and value in one call.
transactionsDf.withColumn('associateId', lit(5)).drop('productId', 'value').show()

+-------------+---------+-------+----+---------------+-----------+
|transactionId|predError|storeId|   f|transactionDate|associateId|
+-------------+---------+-------+----+---------------+-----------+
|            1|        3|     25|null|     1587915332|          5|
|            2|        6|      2|null|     1586815312|          5|
|            3|        3|     25|null|     1585824821|          5|
|            4|     null|      3|null|     1583244275|          5|
|            5|     null|   null|null|     1575285427|          5|
|            6|        3|     25|null|     1572733275|          5|
+-------------+---------+-------+----+---------------+-----------+



In [14]:
# The four wrong answer options. Each statement fails on its own; execution
# stops at the first exception, so only that error shows in the recorded output.

# "col should be Column": the second argument must be a Column (e.g. lit(5)),
# not a bare Python value. Recorded as TypeError in this run; the exception
# class may differ between PySpark versions. `remove` is not a DataFrame method either.
transactionsDf.withColumn('associateId', 5).remove('productId', 'value')

# AttributeError: 'DataFrame' object has no attribute 'withNewColumn'
transactionsDf.withNewColumn('associateId', lit(5)).drop(productId, 'value')

# Py4JError: An error occurred while calling o133.withColumnRenamed
# (the new column name must be a string, not the number 5)
transactionsDf.withColumnRenamed('associateId', 5).drop('productId', 'value').show()

# NameError: associateId and productId are bare Python names that are never
# defined in this notebook; column names have to be passed as quoted strings.
transactionsDf.withColumn(col(associateId), lit(5)).drop(col(productId), 'value').show()

TypeError: col should be Column

Which of the following code blocks adds a column predErrorSqrt to DataFrame transactionsDf that is the square root of column predError?

In [15]:
# Correct answer: adds predErrorSqrt next to the existing columns; rows with
# a null predError get a null square root (see output).
transactionsDf.withColumn("predErrorSqrt", sqrt(col("predError"))).show()


+-------------+---------+-----+-------+---------+----+---------------+------------------+
|transactionId|predError|value|storeId|productId|   f|transactionDate|     predErrorSqrt|
+-------------+---------+-----+-------+---------+----+---------------+------------------+
|            1|        3|    4|     25|        1|null|     1587915332|1.7320508075688772|
|            2|        6|    7|      2|        2|null|     1586815312| 2.449489742783178|
|            3|        3| null|     25|        3|null|     1585824821|1.7320508075688772|
|            4|     null| null|      3|        2|null|     1583244275|              null|
|            5|     null| null|   null|        2|null|     1575285427|              null|
|            6|        3|    2|     25|        2|null|     1572733275|1.7320508075688772|
+-------------+---------+-----+-------+---------+----+---------------+------------------+



In [16]:
# Not an answer to the question: select() returns only the computed column
# (auto-named SQRT(predError)) instead of adding a column to transactionsDf.
transactionsDf.select(sqrt("predError")).show()

+------------------+
|   SQRT(predError)|
+------------------+
|1.7320508075688772|
| 2.449489742783178|
|1.7320508075688772|
|              null|
|              null|
|1.7320508075688772|
+------------------+



In [None]:
# Wrong answer options. Each statement fails on its own; when the cell is run
# as a whole, execution stops at the first exception.

# NameError: name 'predError' is not defined
transactionsDf.withColumn("predErrorSqrt", sqrt(predError))

# NameError: name 'predError' is not defined
transactionsDf.select(sqrt(predError))

# TypeError: 'Column' object is not callable
transactionsDf.withColumn("predErrorSqrt", col("predError").sqrt())


Which of the following code blocks returns a copy of DataFrame transactionsDf where the column storeId has been converted to string type?

>

- `transactionsDf.withColumn("storeId", convert("storeId", "string"))`
- `transactionsDf.withColumn("storeId", col("storeId", "string"))`
- `transactionsDf.withColumn("storeId", col("storeId").convert("string"))`
- `transactionsDf.withColumn("storeId", col("storeId").cast("string"))`
- `transactionsDf.withColumn("storeId", convert("storeId").as("string"))`

In [17]:
# Correct answer (option 4): Column.cast("string") converts storeId; the
# printed schema confirms the new type while the other columns are unchanged.
transactionsDf.withColumn("storeId", col("storeId").cast("string")).printSchema()


root
 |-- transactionId: integer (nullable = true)
 |-- predError: integer (nullable = true)
 |-- value: integer (nullable = true)
 |-- storeId: string (nullable = true)
 |-- productId: integer (nullable = true)
 |-- f: integer (nullable = true)
 |-- transactionDate: long (nullable = true)



In [None]:
# Wrong answer options, with the error each one produces when run by itself.
# NOTE: the last statement is not valid Python (`as` is a reserved keyword),
# so running this cell as a whole fails with that SyntaxError before any
# other statement executes.

# NameError: name 'convert' is not defined
transactionsDf.withColumn("storeId", convert("storeId", "string"))

# TypeError: _() takes 1 positional argument but 2 were given
transactionsDf.withColumn("storeId", col("storeId", "string"))

# TypeError: 'Column' object is not callable
transactionsDf.withColumn("storeId", col("storeId").convert("string"))

# SyntaxError: invalid syntax
transactionsDf.withColumn("storeId", convert("storeId").as("string"))

Which of the following code blocks return a DataFrame with a new column named salary_increment and all previously existing columns?
>
- `df.selectExpr("*", "salary * 0.15")`
- `df.withColumn("salary_increment", expr("salary * 0.15"))`
- `df.withColumn("salary_increment", "salary * 0.15")`
- `df.selectExpr("*", "salary * 0.15 as salary_increment")`

In [18]:
# Option 1: keeps all columns and adds the computed value, but without an
# alias the new column is named "(salary * 0.15)", not salary_increment.
df.selectExpr("*", "salary * 0.15").show()


+---------+----------+--------+----------+------+------+---------------+
|firstname|middlename|lastname|       dob|gender|salary|(salary * 0.15)|
+---------+----------+--------+----------+------+------+---------------+
|    James|          |   Smith|1991-04-01|     M|  3000|         450.00|
|  Michael|      Rose|        |2000-05-19|     M|  4000|         600.00|
|   Robert|          |Williams|1978-09-05|     M|  4000|         600.00|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|         600.00|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|          -0.15|
+---------+----------+--------+----------+------+------+---------------+



In [19]:
# Option 2 (works): expr() turns the SQL string into a Column, which
# withColumn() adds under the name salary_increment.
df.withColumn("salary_increment", expr("salary * 0.15")).show()


+---------+----------+--------+----------+------+------+----------------+
|firstname|middlename|lastname|       dob|gender|salary|salary_increment|
+---------+----------+--------+----------+------+------+----------------+
|    James|          |   Smith|1991-04-01|     M|  3000|          450.00|
|  Michael|      Rose|        |2000-05-19|     M|  4000|          600.00|
|   Robert|          |Williams|1978-09-05|     M|  4000|          600.00|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|          600.00|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|           -0.15|
+---------+----------+--------+----------+------+------+----------------+



In [20]:
# Option 4 (works): same result via selectExpr, naming the column with `as`.
df.selectExpr("*", "salary * 0.15 as salary_increment").show()

+---------+----------+--------+----------+------+------+----------------+
|firstname|middlename|lastname|       dob|gender|salary|salary_increment|
+---------+----------+--------+----------+------+------+----------------+
|    James|          |   Smith|1991-04-01|     M|  3000|          450.00|
|  Michael|      Rose|        |2000-05-19|     M|  4000|          600.00|
|   Robert|          |Williams|1978-09-05|     M|  4000|          600.00|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|          600.00|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|           -0.15|
+---------+----------+--------+----------+------+------+----------------+



In [None]:
# Option 3 fails with "col should be Column": withColumn() needs a Column
# expression (e.g. expr("salary * 0.15")), not a plain SQL string. The same
# message was recorded as a TypeError for withColumn('associateId', 5) earlier
# in this notebook; other PySpark versions may report it as AssertionError.
df.withColumn("salary_increment", "salary * 0.15")


You are given a DataFrame as shown below.
>
- `+-------+----+----------+`
- `|BatchID|Year|CourseName|`
- `+-------+----+----------+`
- `|X1     |2020|Scala     |`
- `|X2     |2020|Python    |`
- `|X3     |null|Java      |`
- `|X4     |2021|Scala     |`
- `|X5     |null|Python    |`
- `|X6     |2021|Spark     |`
- `+-------+----+----------+`
>
You want to transform the Year column and replace all nulls with the value 2021. Choose the correct option from the given code blocks.
>
- `df.withColumn("Year", coalesce(col("Year"), "2021"))`
- `df.withColumn("Year", coalesce(col("Year"), lit("2021")))`
- `df.withColumn("Year", coalesce(col("Year"), col("2021")))`
- `df.withColumn("Year", coalesce(col("Year"), expr("2021")))`

In [34]:
# Course batches for the null-replacement question; Year is None for X3 and X5.
# NOTE: this rebinds `data` and `df`. From here on `df` is the course
# DataFrame, no longer the employee DataFrame created at the top.
data = [
    ('X1', 2020, 'Scala'),
    ('X2', 2020, 'Python'),
    ('X3', None, 'Java'),
    ('X4', 2021, 'Scala'),
    ('X5', None, 'Python'),
    ('X6', 2021, 'Spark'),
]

df = spark.createDataFrame(data=data, schema=['BatchID', 'Year', 'CourseName'])

# Year is inferred as long (see the printed schema).
df.printSchema()
df.show()

root
 |-- BatchID: string (nullable = true)
 |-- Year: long (nullable = true)
 |-- CourseName: string (nullable = true)

+-------+----+----------+
|BatchID|Year|CourseName|
+-------+----+----------+
|     X1|2020|     Scala|
|     X2|2020|    Python|
|     X3|null|      Java|
|     X4|2021|     Scala|
|     X5|null|    Python|
|     X6|2021|     Spark|
+-------+----+----------+



In [22]:
# The fix written as a SQL expression: coalesce() yields its first non-null
# argument, so null years become 2021.
# NOTE(review): Year is long but the fallback '2021' is a string literal;
# check printSchema() on the result if the column type matters.
df.withColumn("Year", expr("coalesce(Year, '2021')")).show()

+-------+----+----------+
|BatchID|Year|CourseName|
+-------+----+----------+
|     X1|2020|     Scala|
|     X2|2020|    Python|
|     X3|2021|      Java|
|     X4|2021|     Scala|
|     X5|2021|    Python|
|     X6|2021|     Spark|
+-------+----+----------+



In [27]:
# The two answer options that run: lit("2021") wraps the value in a literal
# Column, and expr("2021") parses the text as a SQL literal.
# NOTE(review): lit("2021") is a string while Year is long; check
# printSchema() on the result if the column type matters.
df.withColumn("Year", coalesce(col("Year"), lit("2021"))).show()
df.withColumn("Year", coalesce(col("Year"), expr("2021"))).show()

+-------+----+----------+
|BatchID|Year|CourseName|
+-------+----+----------+
|     X1|2020|     Scala|
|     X2|2020|    Python|
|     X3|2021|      Java|
|     X4|2021|     Scala|
|     X5|2021|    Python|
|     X6|2021|     Spark|
+-------+----+----------+

+-------+----+----------+
|BatchID|Year|CourseName|
+-------+----+----------+
|     X1|2020|     Scala|
|     X2|2020|    Python|
|     X3|2021|      Java|
|     X4|2021|     Scala|
|     X5|2021|    Python|
|     X6|2021|     Spark|
+-------+----+----------+



In [40]:
# SQL ifnull(a, b) returns b when a is null; the output matches the coalesce
# variants.
df.withColumn("Year", expr("ifnull(Year, '2021')")).show()
# Disabled: the same call written with a Python-level ifnull() function.
# NOTE(review): presumably `ifnull` was not importable from
# pyspark.sql.functions in this environment - TODO confirm.
#df.withColumn("Year", ifnull(col("Year"), "2021")).show()

+-------+----+----------+
|BatchID|Year|CourseName|
+-------+----+----------+
|     X1|2020|     Scala|
|     X2|2020|    Python|
|     X3|2021|      Java|
|     X4|2021|     Scala|
|     X5|2021|    Python|
|     X6|2021|     Spark|
+-------+----+----------+



In [None]:
# Repeats the expr("2021") option already demonstrated earlier in this notebook.
df.withColumn("Year", coalesce(col("Year"), expr("2021"))).show()

In [None]:
# The two options that fail: a plain string and col("2021") are both treated
# as a column name, and no column called 2021 exists. When the cell is run as
# a whole, execution stops at the first exception.

# AnalysisException: cannot resolve '`2021`' given input columns: [BatchID, CourseName, Year]
df.withColumn("Year", coalesce(col("Year"), "2021"))

# AnalysisException: cannot resolve '`2021`' given input columns: [BatchID, CourseName, Year]
df.withColumn("Year", coalesce(col("Year"), col("2021")))


Which function will you use to add a new field in your DataFrame with a current timestamp?

In [30]:
# Answer: current_timestamp() adds a timestamp column; every row receives the
# same value (see output).
# NOTE(review): the recorded output was produced while `df` was the employee
# DataFrame. Run top to bottom, `df` has been rebound to the course DataFrame
# by this point, so the columns shown will differ.
df.withColumn("now", current_timestamp()).show(truncate=False)

+---------+----------+--------+----------+------+------+-----------------------+
|firstname|middlename|lastname|dob       |gender|salary|now                    |
+---------+----------+--------+----------+------+------+-----------------------+
|James    |          |Smith   |1991-04-01|M     |3000  |2021-11-25 14:27:15.822|
|Michael  |Rose      |        |2000-05-19|M     |4000  |2021-11-25 14:27:15.822|
|Robert   |          |Williams|1978-09-05|M     |4000  |2021-11-25 14:27:15.822|
|Maria    |Anne      |Jones   |1967-12-01|F     |4000  |2021-11-25 14:27:15.822|
|Jen      |Mary      |Brown   |1980-02-17|F     |-1    |2021-11-25 14:27:15.822|
+---------+----------+--------+----------+------+------+-----------------------+



In [32]:
# Wrong option: no function called `now` was in scope in the recorded run
# (NameError below); current_timestamp() is the function that works.
df.withColumn("now", now())

NameError: name 'now' is not defined

In [31]:
# NOTE(review): out-of-order leftover. In the recorded run `df` was still the
# employee DataFrame (see the column list in the error), which has no Year
# column, hence the AnalysisException. Run top to bottom, `df` is the course
# DataFrame here and this simply repeats the ifnull example.
df.withColumn("Year", expr("ifnull(Year, '2021')")).show()

AnalysisException: cannot resolve 'Year' given input columns: [dob, firstname, gender, lastname, middlename, salary]; line 1 pos 7;
'Project [firstname#644, middlename#645, lastname#646, dob#647, gender#648, salary#649L, 'ifnull('Year, 2021) AS Year#693]
+- LogicalRDD [firstname#644, middlename#645, lastname#646, dob#647, gender#648, salary#649L], false
