# WITHCOLUMN() + WITHCOLUMNRENAMED()

In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = (
    SparkSession.builder
    .appName("example-withcolumn")
    .getOrCreate()
)

In [3]:
data = [('James','','Smith','1991-04-01','M',3000),
        ('Michael','Rose','','2000-05-19','M',4000),
        ('Robert','','Williams','1978-09-05','M',4000),
        ('Maria','Anne','Jones','1967-12-01','F',4000),
        ('Jen','Mary','Brown','1980-02-17','F',-1)]

columns = ["firstname", "middlename", "lastname", "dob", "gender", "salary"]

df = spark.createDataFrame(data=data, schema = columns)

In [4]:
# Change DataType using PySpark withColumn()
df.withColumn("salary",col("salary").cast("Integer")).show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



In [5]:
# Update The Value of an Existing Column
df.withColumn("salary", col("salary") * 10).show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M| 30000|
|  Michael|      Rose|        |2000-05-19|     M| 40000|
|   Robert|          |Williams|1978-09-05|     M| 40000|
|    Maria|      Anne|   Jones|1967-12-01|     F| 40000|
|      Jen|      Mary|   Brown|1980-02-17|     F|   -10|
+---------+----------+--------+----------+------+------+



In [6]:
# Update The Value of an Existing Column - squared value of column salary
df.withColumn("new_salary", pow(col("salary"), lit(2))).show()

+---------+----------+--------+----------+------+------+----------+
|firstname|middlename|lastname|       dob|gender|salary|new_salary|
+---------+----------+--------+----------+------+------+----------+
|    James|          |   Smith|1991-04-01|     M|  3000| 9000000.0|
|  Michael|      Rose|        |2000-05-19|     M|  4000|     1.6E7|
|   Robert|          |Williams|1978-09-05|     M|  4000|     1.6E7|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|     1.6E7|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|       1.0|
+---------+----------+--------+----------+------+------+----------+



In [7]:
# Rename Column Name
df.withColumnRenamed("gender","sex").show(truncate=False) 

+---------+----------+--------+----------+---+------+
|firstname|middlename|lastname|dob       |sex|salary|
+---------+----------+--------+----------+---+------+
|James    |          |Smith   |1991-04-01|M  |3000  |
|Michael  |Rose      |        |2000-05-19|M  |4000  |
|Robert   |          |Williams|1978-09-05|M  |4000  |
|Maria    |Anne      |Jones   |1967-12-01|F  |4000  |
|Jen      |Mary      |Brown   |1980-02-17|F  |-1    |
+---------+----------+--------+----------+---+------+



In [8]:
# Drop Column From PySpark DataFrame
df.drop("new_salary").show() 

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



Which of the following code blocks returns a DataFrame with an added column to DataFrame transactionsDf that shows the unix epoch timestamps in column transactionDate as strings in the format month/day/year in column transactionDateFormatted?
>

- `+————————————-+—————————+—–———+——————-+—————————+———-+———————————————+`
- `|transactionId|predError|value|storeId|productId| f  |transactionDate|`
- `+————————————-+—————————+————–+——————-+—————————+———-+———————————————+`
- `| 1           | 3       | 4   | 25    | 1       |null| 1587915332    |`
- `| 2           | 6       | 7   | 2     | 2       |null| 1586815312    |`
- `| 3           | 3       | null| 25    | 3       |null| 1585824821    |`
- `| 4           | null    | null| 3     | 2       |null| 1583244275    |`
- `| 5           | null    | null| null  | 2       |null| 1575285427    |`
- `| 6           | 3       | 2   | 25    | 2       |null| 1572733275    |`
- `+————————————-+—————————+—–———+——————-+—————————+———-+———————————————+`

>
- `transactionsDf.withColumn("transactionDateFormatted", from_unixtime("transactionDate", format="dd/MM/yyyy"))`
- `transactionsDf.withColumnRenamed("transactionDate", "transactionDateFormatted", from_unixtime("transactionDateFormatted", format="MM/dd/yyyy"))`
- `transactionsDf.apply(from_unixtime(format="MM/dd/yyyy")).asColumn("transactionDateFormatted")`
- `transactionsDf.withColumn("transactionDateFormatted", from_unixtime("transactionDate", format="MM/dd/yyyy"))`
- `transactionsDf.withColumn("transactionDateFormatted", from_unixtime("transactionDate"))`

In [9]:
data = [(1, 3, 4, 25, 1, None, 1587915332),
         (2, 6, 7, 2, 2, None, 1586815312),
         (3, 3, None, 25, 3, None, 1585824821),
         (4, None, None, 3, 2, None, 1583244275),
         (5, None, None, None, 2, None, 1575285427),
         (6, 3, 2, 25, 2, None, 1572733275)]

schema = StructType([StructField('transactionId', IntegerType(), True),
                     StructField('predError', IntegerType(), True),
                     StructField('value', IntegerType(), True),
                     StructField('storeId', IntegerType(), True),
                     StructField('productId', IntegerType(), True),
                     StructField('f', IntegerType(), True),
                     StructField('transactionDate', LongType(), True)])

transactionsDf = spark.createDataFrame(data=data, schema=schema)

In [10]:
transactionsDf.printSchema()
transactionsDf.show()

root
 |-- transactionId: integer (nullable = true)
 |-- predError: integer (nullable = true)
 |-- value: integer (nullable = true)
 |-- storeId: integer (nullable = true)
 |-- productId: integer (nullable = true)
 |-- f: integer (nullable = true)
 |-- transactionDate: long (nullable = true)

+-------------+---------+-----+-------+---------+----+---------------+
|transactionId|predError|value|storeId|productId|   f|transactionDate|
+-------------+---------+-----+-------+---------+----+---------------+
|            1|        3|    4|     25|        1|null|     1587915332|
|            2|        6|    7|      2|        2|null|     1586815312|
|            3|        3| null|     25|        3|null|     1585824821|
|            4|     null| null|      3|        2|null|     1583244275|
|            5|     null| null|   null|        2|null|     1575285427|
|            6|        3|    2|     25|        2|null|     1572733275|
+-------------+---------+-----+-------+---------+----+--------------

In [11]:
transactionsDf.withColumn("transactionDateFormatted", from_unixtime("transactionDate", format="dd/MM/yyyy")).show()

+-------------+---------+-----+-------+---------+----+---------------+------------------------+
|transactionId|predError|value|storeId|productId|   f|transactionDate|transactionDateFormatted|
+-------------+---------+-----+-------+---------+----+---------------+------------------------+
|            1|        3|    4|     25|        1|null|     1587915332|              26/04/2020|
|            2|        6|    7|      2|        2|null|     1586815312|              13/04/2020|
|            3|        3| null|     25|        3|null|     1585824821|              02/04/2020|
|            4|     null| null|      3|        2|null|     1583244275|              03/03/2020|
|            5|     null| null|   null|        2|null|     1575285427|              02/12/2019|
|            6|        3|    2|     25|        2|null|     1572733275|              02/11/2019|
+-------------+---------+-----+-------+---------+----+---------------+------------------------+



In [12]:
transactionsDf.withColumn("transactionDateFormatted", from_unixtime("transactionDate", format="MM/dd/yyyy")).show()

+-------------+---------+-----+-------+---------+----+---------------+------------------------+
|transactionId|predError|value|storeId|productId|   f|transactionDate|transactionDateFormatted|
+-------------+---------+-----+-------+---------+----+---------------+------------------------+
|            1|        3|    4|     25|        1|null|     1587915332|              04/26/2020|
|            2|        6|    7|      2|        2|null|     1586815312|              04/13/2020|
|            3|        3| null|     25|        3|null|     1585824821|              04/02/2020|
|            4|     null| null|      3|        2|null|     1583244275|              03/03/2020|
|            5|     null| null|   null|        2|null|     1575285427|              12/02/2019|
|            6|        3|    2|     25|        2|null|     1572733275|              11/02/2019|
+-------------+---------+-----+-------+---------+----+---------------+------------------------+



In [13]:
transactionsDf.withColumn("transactionDateFormatted", from_unixtime("transactionDate")).show()

+-------------+---------+-----+-------+---------+----+---------------+------------------------+
|transactionId|predError|value|storeId|productId|   f|transactionDate|transactionDateFormatted|
+-------------+---------+-----+-------+---------+----+---------------+------------------------+
|            1|        3|    4|     25|        1|null|     1587915332|     2020-04-26 12:35:32|
|            2|        6|    7|      2|        2|null|     1586815312|     2020-04-13 19:01:52|
|            3|        3| null|     25|        3|null|     1585824821|     2020-04-02 07:53:41|
|            4|     null| null|      3|        2|null|     1583244275|     2020-03-03 11:04:35|
|            5|     null| null|   null|        2|null|     1575285427|     2019-12-02 09:17:07|
|            6|        3|    2|     25|        2|null|     1572733275|     2019-11-02 19:21:15|
+-------------+---------+-----+-------+---------+----+---------------+------------------------+



In [14]:
# TypeError: withColumnRenamed() takes 3 positional arguments but 4 were given
transactionsDf.withColumnRenamed("transactionDate", "transactionDateFormatted", from_unixtime("transactionDateFormatted", format="MM/dd/yyyy")).show()

# AttributeError: 'DataFrame' object has no attribute 'apply'
transactionsDf.apply(from_unixtime(format="MM/dd/yyyy")).asColumn("transactionDateFormatted").show()

TypeError: withColumnRenamed() takes 3 positional arguments but 4 were given

The code block displayed below contains an error. The code block should use Python method find_most_freq_letter to find the letter present most in column itemName of DataFrame itemsDf and return it in a new column most_frequent_letter. Find the error.
>
Code block:
>
- `find_most_freq_letter_udf = udf(find_most_freq_letter)`
- `itemsDf.withColumn(“most_frequent_letter”, find_most_freq_letter(“itemName”))`