In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Start (or reuse) the Spark session that the cells below rely on.
spark = (
    SparkSession.builder
    .appName("PySpark_With_Column")
    .getOrCreate()
)

# Sample rows: first / middle / last name, date of birth (kept as text),
# gender and salary. Some name parts are empty strings.
data = [
    ('James', '', 'Smith', '1991-04-01', 'M', 3000),
    ('Michael', 'Rose', '', '2000-05-19', 'M', 4000),
    ('Robert', '', 'Williams', '1978-09-05', 'M', 4000),
    ('Maria', 'Anne', 'Jones', '1967-12-01', 'F', 4000),
    ('Jen', 'Mary', 'Brown', '1980-02-17', 'F', -1),
]

# Only column labels are supplied; the column types are inferred from the
# Python values in `data`.
columns = ["First Name", "Middle Name", "Last Name", "DOB", "Gender", "Salary"]

df = spark.createDataFrame(data, columns)
df.printSchema()
df.show(truncate=False)



root
 |-- First Name: string (nullable = true)
 |-- Middle Name: string (nullable = true)
 |-- Last Name: string (nullable = true)
 |-- DOB: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Salary: long (nullable = true)

+----------+-----------+---------+----------+------+------+
|First Name|Middle Name|Last Name|DOB       |Gender|Salary|
+----------+-----------+---------+----------+------+------+
|James     |           |Smith    |1991-04-01|M     |3000  |
|Michael   |Rose       |         |2000-05-19|M     |4000  |
|Robert    |           |Williams |1978-09-05|M     |4000  |
|Maria     |Anne       |Jones    |1967-12-01|F     |4000  |
|Jen       |Mary       |Brown    |1980-02-17|F     |-1    |
+----------+-----------+---------+----------+------+------+



In [None]:
# Cast Salary to float.
# The target name "salary" differs from the existing "Salary" column only by
# case. Under Spark's default case-insensitive column resolution the existing
# column is replaced (and takes the new lowercase name) instead of a second
# column being added -- the printed schema confirms this.
df2 = df.withColumn("salary", col("Salary").cast("float"))
df2.printSchema()
df2.show(truncate=False)

root
 |-- First Name: string (nullable = true)
 |-- Middle Name: string (nullable = true)
 |-- Last Name: string (nullable = true)
 |-- DOB: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- salary: float (nullable = true)

+----------+-----------+---------+----------+------+------+
|First Name|Middle Name|Last Name|DOB       |Gender|salary|
+----------+-----------+---------+----------+------+------+
|James     |           |Smith    |1991-04-01|M     |3000.0|
|Michael   |Rose       |         |2000-05-19|M     |4000.0|
|Robert    |           |Williams |1978-09-05|M     |4000.0|
|Maria     |Anne       |Jones    |1967-12-01|F     |4000.0|
|Jen       |Mary       |Brown    |1980-02-17|F     |-1.0  |
+----------+-----------+---------+----------+------+------+



In [None]:
# Scale every salary by 100. Writing to "salary" overwrites the existing
# "Salary" column (see the printed schema), so no extra column appears.
salary_x100 = col("Salary") * 100
df3 = df.withColumn("salary", salary_x100)
df3.printSchema()
df3.show(truncate=False)

root
 |-- First Name: string (nullable = true)
 |-- Middle Name: string (nullable = true)
 |-- Last Name: string (nullable = true)
 |-- DOB: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- salary: long (nullable = true)

+----------+-----------+---------+----------+------+------+
|First Name|Middle Name|Last Name|DOB       |Gender|salary|
+----------+-----------+---------+----------+------+------+
|James     |           |Smith    |1991-04-01|M     |300000|
|Michael   |Rose       |         |2000-05-19|M     |400000|
|Robert    |           |Williams |1978-09-05|M     |400000|
|Maria     |Anne       |Jones    |1967-12-01|F     |400000|
|Jen       |Mary       |Brown    |1980-02-17|F     |-100  |
+----------+-----------+---------+----------+------+------+



In [None]:
# Append a new column derived from Salary. Despite its name, the column holds
# the sign-flipped salary (Salary * -1), not a plain copy.
negated_salary = col("Salary") * -1
df4 = df.withColumn("Copied Column", negated_salary)
df4.printSchema()
df4.show()

root
 |-- First Name: string (nullable = true)
 |-- Middle Name: string (nullable = true)
 |-- Last Name: string (nullable = true)
 |-- DOB: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Salary: long (nullable = true)
 |-- Copied Column: long (nullable = true)

+----------+-----------+---------+----------+------+------+-------------+
|First Name|Middle Name|Last Name|       DOB|Gender|Salary|Copied Column|
+----------+-----------+---------+----------+------+------+-------------+
|     James|           |    Smith|1991-04-01|     M|  3000|        -3000|
|   Michael|       Rose|         |2000-05-19|     M|  4000|        -4000|
|    Robert|           | Williams|1978-09-05|     M|  4000|        -4000|
|     Maria|       Anne|    Jones|1967-12-01|     F|  4000|        -4000|
|       Jen|       Mary|    Brown|1980-02-17|     F|    -1|            1|
+----------+-----------+---------+----------+------+------+-------------+



In [None]:
# Append a constant column: lit() turns the Python string into a column
# expression, so every row receives the same value.
country_literal = lit("USA")
df5 = df.withColumn("Country", country_literal)
df5.printSchema()
df5.show()

root
 |-- First Name: string (nullable = true)
 |-- Middle Name: string (nullable = true)
 |-- Last Name: string (nullable = true)
 |-- DOB: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Salary: long (nullable = true)
 |-- Country: string (nullable = false)

+----------+-----------+---------+----------+------+------+-------+
|First Name|Middle Name|Last Name|       DOB|Gender|Salary|Country|
+----------+-----------+---------+----------+------+------+-------+
|     James|           |    Smith|1991-04-01|     M|  3000|    USA|
|   Michael|       Rose|         |2000-05-19|     M|  4000|    USA|
|    Robert|           | Williams|1978-09-05|     M|  4000|    USA|
|     Maria|       Anne|    Jones|1967-12-01|     F|  4000|    USA|
|       Jen|       Mary|    Brown|1980-02-17|     F|    -1|    USA|
+----------+-----------+---------+----------+------+------+-------+



In [None]:
# Chain withColumn calls to append several constant columns in one expression.
df6 = (
    df
    .withColumn("Country", lit("USA"))
    .withColumn("anotherColumn", lit("anotherValue"))
)
df6.printSchema()

root
 |-- First Name: string (nullable = true)
 |-- Middle Name: string (nullable = true)
 |-- Last Name: string (nullable = true)
 |-- DOB: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Salary: long (nullable = true)
 |-- Country: string (nullable = false)
 |-- anotherColumn: string (nullable = false)



In [None]:
# Show the data with the Gender column renamed to "sex"; `df` itself is not
# reassigned, so later cells still see the original column name.
# The existing column is referenced with its exact case ("Gender"):
# withColumnRenamed silently does nothing when the name does not resolve,
# which is what happens to a lowercase "gender" once column resolution is
# case-sensitive (spark.sql.caseSensitive=true).
df.withColumnRenamed("Gender", "sex").show(truncate=False)

+----------+-----------+---------+----------+---+------+
|First Name|Middle Name|Last Name|DOB       |sex|Salary|
+----------+-----------+---------+----------+---+------+
|James     |           |Smith    |1991-04-01|M  |3000  |
|Michael   |Rose       |         |2000-05-19|M  |4000  |
|Robert    |           |Williams |1978-09-05|M  |4000  |
|Maria     |Anne       |Jones    |1967-12-01|F  |4000  |
|Jen       |Mary       |Brown    |1980-02-17|F  |-1    |
+----------+-----------+---------+----------+---+------+



In [None]:
# Display df4 without its derived column. drop() returns a new DataFrame, so
# df4 itself still carries "Copied Column" afterwards.
without_copied_column = df4.drop("Copied Column")
without_copied_column.show(truncate=False)

+----------+-----------+---------+----------+------+------+
|First Name|Middle Name|Last Name|DOB       |Gender|Salary|
+----------+-----------+---------+----------+------+------+
|James     |           |Smith    |1991-04-01|M     |3000  |
|Michael   |Rose       |         |2000-05-19|M     |4000  |
|Robert    |           |Williams |1978-09-05|M     |4000  |
|Maria     |Anne       |Jones    |1967-12-01|F     |4000  |
|Jen       |Mary       |Brown    |1980-02-17|F     |-1    |
+----------+-----------+---------+----------+------+------+



In [None]:
# Each row starts with a (first, middle, last) tuple that becomes a nested
# struct column; the remaining values are all plain strings.
dataStruct = [
    (("James", "", "Smith"), "36636", "M", "3000"),
    (("Michael", "Rose", ""), "40288", "M", "4000"),
    (("Robert", "", "Williams"), "42114", "M", "4000"),
    (("Maria", "Anne", "Jones"), "39192", "F", "4000"),
    (("Jen", "Mary", "Brown"), "", "F", "-1"),
]

# Nested struct describing the three parts of a person's name.
nameStruct = StructType([
    StructField("First Name", StringType(), True),
    StructField("Middle Name", StringType(), True),
    StructField("Last Name", StringType(), True),
])

# Explicit schema: one struct column followed by three nullable string columns.
schemaStruct = StructType([
    StructField("name", nameStruct),
    StructField("DOB", StringType(), True),
    StructField("Gender", StringType(), True),
    StructField("Salary", StringType(), True),
])

df7 = spark.createDataFrame(dataStruct, schemaStruct)
df7.printSchema()
df7.show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- First Name: string (nullable = true)
 |    |-- Middle Name: string (nullable = true)
 |    |-- Last Name: string (nullable = true)
 |-- DOB: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Salary: string (nullable = true)

+--------------------+-----+------+------+
|name                |DOB  |Gender|Salary|
+--------------------+-----+------+------+
|{James, , Smith}    |36636|M     |3000  |
|{Michael, Rose, }   |40288|M     |4000  |
|{Robert, , Williams}|42114|M     |4000  |
|{Maria, Anne, Jones}|39192|F     |4000  |
|{Jen, Mary, Brown}  |     |F     |-1    |
+--------------------+-----+------+------+



In [None]:
from pyspark.sql.functions import split

# Two comma-separated text columns that are broken up into individual fields.
columns = ["Name", "Address"]
data = [
    ("Robert, Smith", "Main St, Newark, NJ, 92537"),
    ("Maria, Garcia", "3456 Walnut St, Newark, NJ, 94732"),
]

dfFromData = spark.createDataFrame(data=data, schema=columns)

# split() takes a regular expression. Matching the comma together with any
# whitespace after it keeps the pieces free of leading spaces ("Smith" rather
# than " Smith").
name_parts = split(col("Name"), r",\s*")
address_parts = split(col("Address"), r",\s*")

# City, State and Zipcode come from the Address column (pieces 1-3); reading
# them from Name yields the last name for City and NULL for the other two,
# because a name only has two pieces.
newDF = (
    dfFromData
    .withColumn("First Name", name_parts[0])
    .withColumn("Last Name", name_parts[1])
    .withColumn("Address Line 1", address_parts[0])
    .withColumn("City", address_parts[1])
    .withColumn("State", address_parts[2])
    .withColumn("Zipcode", address_parts[3])
)

finalDF = newDF.select("First Name", "Last Name", "Address Line 1", "City", "State", "Zipcode")
finalDF.printSchema()
finalDF.show(truncate=False)



root
 |-- First Name: string (nullable = true)
 |-- Last Name: string (nullable = true)
 |-- Address Line 1: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Zipcode: string (nullable = true)

+----------+---------+--------------+-------+-----+-------+
|First Name|Last Name|Address Line 1|City   |State|Zipcode|
+----------+---------+--------------+-------+-----+-------+
|Robert    | Smith   |Main St       | Smith |NULL |NULL   |
|Maria     | Garcia  |3456 Walnut St| Garcia|NULL |NULL   |
+----------+---------+--------------+-------+-----+-------+

