In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import col, struct


# Sample records, one tuple per person, in the same order as `columns` below.
# Missing name parts are empty strings, and dob is a plain 'YYYY-MM-DD' string.
data = [
    ('James', '', 'Smith', '1991-04-01', 'M', 3000),
    ('Michael', 'Rose', '', '2000-05-19', 'M', 4000),
    ('Robert', '', 'Williams', '1978-09-05', 'M', 4000),
    ('Maria', 'Anne', 'Jones', '1967-12-01', 'F', 4000),
    ('Jen', 'Mary', 'Brown', '1980-02-17', 'F', -1),
]

# Only column names are supplied here; no explicit types are given, so the
# column types come from the Python values in `data`.
columns = [
    "firstname",
    "middlename",
    "lastname",
    "dob",
    "gender",
    "salary",
]

# `spark` is the SparkSession provided by the notebook runtime.
df = spark.createDataFrame(data, schema=columns)

# Inspect the result three ways: schema tree, plain-text table, and the
# notebook's own table rendering.
# NOTE(review): display() looks like a notebook-runtime helper (e.g. Databricks)
# rather than core PySpark -- confirm before running elsewhere.
df.printSchema()
df.show()
df.display()





root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



firstname,middlename,lastname,dob,gender,salary
James,,Smith,1991-04-01,M,3000
Michael,Rose,,2000-05-19,M,4000
Robert,,Williams,1978-09-05,M,4000
Maria,Anne,Jones,1967-12-01,F,4000
Jen,Mary,Brown,1980-02-17,F,-1


In [0]:
# Extra column functions used by the examples in this cell.
from pyspark.sql.functions import expr, lit

# None of the calls below reassign `df`; each one builds a derived DataFrame
# and prints it straight away.

# Change a column's data type: cast salary to integer and show the new schema.
df.withColumn("salary", col("salary").cast("Integer")).printSchema()

# Change the values of an existing column: reusing the name "salary"
# replaces that column in the result.
df.withColumn("salary", col("salary") * 1000).show()

# Create a new column derived from an existing one.
df.withColumn("CopiedColumn", col("salary") * -1).show()

# Create a new column holding a constant value via lit().
df.withColumn("Country", lit("USA")).show()

# withColumn calls can be chained to add several columns at once.
(
    df
    .withColumn("Country", lit("USA"))
    .withColumn("anotherColumn", lit("anotherValue"))
    .show()
)

# Rename a column. truncate=False makes show() print full string values
# instead of cutting them short.
df.withColumnRenamed("gender", "sex").show(truncate=False)

# Drop a column.
df.drop("gender").show()

# selectExpr() takes SQL expression strings, including "... as alias".
df.selectExpr("salary", "salary + 100 as new_salary").show()

# filter() with expr(): the row predicate is written as a SQL string.
df.filter(expr("salary > 1000")).show()



root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+---------+----------+--------+----------+------+-------+
|firstname|middlename|lastname|       dob|gender| salary|
+---------+----------+--------+----------+------+-------+
|    James|          |   Smith|1991-04-01|     M|3000000|
|  Michael|      Rose|        |2000-05-19|     M|4000000|
|   Robert|          |Williams|1978-09-05|     M|4000000|
|    Maria|      Anne|   Jones|1967-12-01|     F|4000000|
|      Jen|      Mary|   Brown|1980-02-17|     F|  -1000|
+---------+----------+--------+----------+------+-------+

+---------+----------+--------+----------+------+------+------------+
|firstname|middlename|lastname|       dob|gender|salary|CopiedColumn|
+---------+----------+--------+----------+------+------+------------+
|    James|          |   S