In [2]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session running in local mode with master "local[*]".
spark = SparkSession.builder.master("local[*]").appName("transformation").getOrCreate()


In [None]:
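# Define the schema -- first draft, kept commented out for reference only.
# NOTE: this draft is not valid as written: the outer wrapper must be StructType
# (not StringType) and every field type must be instantiated, e.g. IntegerType().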

# from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

# schema = StringType([
#     StructField("emp_id", IntegerType, True ),
#     StructField("empName", StringType, True),
#     StructField("empGender", StringType, True),
#     StructField("empSalary", FloatType, True),
#     StructField("empCountry",StringType, True)

# ])

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

# Obtain the Spark session used by the rest of the notebook.
# NOTE: getOrCreate() hands back the session that is already active in this
# kernel when there is one, so if the first cell has been run this call reuses
# that session rather than starting a fresh one named "emp_schema_test".
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("emp_schema_test") \
    .getOrCreate()

# Sample employee rows, one tuple per row in the order
# (empId, empName, empGender, empSalary, empCountry).
# The rows for ids 5, 9, 10, 14 and 15 each appear twice, so the list holds
# 20 rows describing 15 distinct employees; the duplicates are what the
# dropDuplicates() / distinct() cells further down remove.
emp_data = [
    (1, "John Doe", "Male", 70000.0, "USA"),
    (2, "Emily Clark", "Female", 72000.0, "Canada"),
    (3, "Michael Smith", "Male", 68000.0, "UK"),
    (4, "Sophia Lee", "Female", 75000.0, "Australia"),
    (5, "Daniel Kim", "Male", 69000.0, "India"),
    (5, "Daniel Kim", "Male", 69000.0, "India"),
    (6, "Olivia Brown", "Female", 71000.0, "USA"),
    (7, "James Wilson", "Male", 70500.0, "Canada"),
    (8, "Ava Johnson", "Female", 69500.0, "Germany"),
    (9, "Ethan Davis", "Male", 73000.0, "UK"),
    (10, "Isabella Garcia", "Female", 74000.0, "USA"),
    (9, "Ethan Davis", "Male", 73000.0, "UK"),
    (10, "Isabella Garcia", "Female", 74000.0, "USA"),
    (11, "Logan Martinez", "Male", 68000.0, "Mexico"),
    (12, "Mia Taylor", "Female", 72000.0, "France"),
    (13, "Lucas Anderson", "Male", 75000.0, "Ireland"),
    (14, "Charlotte Thomas", "Female", 73500.0, "New Zealand"),
    (15, "Jackson White", "Male", 70000.0, "Australia"),
    (14, "Charlotte Thomas", "Female", 73500.0, "New Zealand"),
    (15, "Jackson White", "Male", 70000.0, "Australia")
]

# Working schema: each (name, type) pair below becomes one nullable column.
emp_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("empId", IntegerType()),
        ("empName", StringType()),
        ("empGender", StringType()),
        ("empSalary", FloatType()),
        ("empCountry", StringType()),
    )
])

# Turn the raw tuples into a DataFrame typed by emp_schema.
df1 = spark.createDataFrame(data=emp_data, schema=emp_schema)

# Preview the rows, list the column types, then report how many rows there are.
df1.show()
df1.printSchema()
print("Row count:", df1.count())


+-----+----------------+---------+---------+-----------+
|empId|         empName|empGender|empSalary| empCountry|
+-----+----------------+---------+---------+-----------+
|    1|        John Doe|     Male|  70000.0|        USA|
|    2|     Emily Clark|   Female|  72000.0|     Canada|
|    3|   Michael Smith|     Male|  68000.0|         UK|
|    4|      Sophia Lee|   Female|  75000.0|  Australia|
|    5|      Daniel Kim|     Male|  69000.0|      India|
|    5|      Daniel Kim|     Male|  69000.0|      India|
|    6|    Olivia Brown|   Female|  71000.0|        USA|
|    7|    James Wilson|     Male|  70500.0|     Canada|
|    8|     Ava Johnson|   Female|  69500.0|    Germany|
|    9|     Ethan Davis|     Male|  73000.0|         UK|
|   10| Isabella Garcia|   Female|  74000.0|        USA|
|    9|     Ethan Davis|     Male|  73000.0|         UK|
|   10| Isabella Garcia|   Female|  74000.0|        USA|
|   11|  Logan Martinez|     Male|  68000.0|     Mexico|
|   12|      Mia Taylor|   Fema

In [6]:
# withColumn(): add columns to a DataFrame.
# Two columns are added here:
#   Origin ==> constant column, always "India"
#   tax    ==> derived column, 12% of empSalary

from pyspark.sql.functions import lit

df2 = (
    df1
    .withColumn("Origin", lit("India"))
    .withColumn("tax", df1["empSalary"] * 0.12)
)

df2.show(n=5)


+-----+-------------+---------+---------+----------+------+------+
|empId|      empName|empGender|empSalary|empCountry|Origin|   tax|
+-----+-------------+---------+---------+----------+------+------+
|    1|     John Doe|     Male|  70000.0|       USA| India|8400.0|
|    2|  Emily Clark|   Female|  72000.0|    Canada| India|8640.0|
|    3|Michael Smith|     Male|  68000.0|        UK| India|8160.0|
|    4|   Sophia Lee|   Female|  75000.0| Australia| India|9000.0|
|    5|   Daniel Kim|     Male|  69000.0|     India| India|8280.0|
+-----+-------------+---------+---------+----------+------+------+
only showing top 5 rows



In [7]:
# withColumnRenamed(): rename existing columns.
#   Origin ==> empOrigin
#   tax    ==> empTax

df3 = (
    df2
    .withColumnRenamed("Origin", "empOrigin")
    .withColumnRenamed("tax", "empTax")
)

df3.show(n=5)
# df3.printSchema()  # uncomment to check the column data types

+-----+-------------+---------+---------+----------+---------+------+
|empId|      empName|empGender|empSalary|empCountry|empOrigin|empTax|
+-----+-------------+---------+---------+----------+---------+------+
|    1|     John Doe|     Male|  70000.0|       USA|    India|8400.0|
|    2|  Emily Clark|   Female|  72000.0|    Canada|    India|8640.0|
|    3|Michael Smith|     Male|  68000.0|        UK|    India|8160.0|
|    4|   Sophia Lee|   Female|  75000.0| Australia|    India|9000.0|
|    5|   Daniel Kim|     Male|  69000.0|     India|    India|8280.0|
+-----+-------------+---------+---------+----------+---------+------+
only showing top 5 rows



In [8]:
# Selecting specific columns: a column can be referenced by its name as a
# string, as an attribute of the DataFrame, or through col().
from pyspark.sql.functions import col

# Same result using plain strings for the first two columns:
# df3.select("empId", "empName", col("empGender")).show(5)

(
    df3
    .select("empId", df3.empName, col("empGender"))
    .show(n=5)
)


+-----+-------------+---------+
|empId|      empName|empGender|
+-----+-------------+---------+
|    1|     John Doe|     Male|
|    2|  Emily Clark|   Female|
|    3|Michael Smith|     Male|
|    4|   Sophia Lee|   Female|
|    5|   Daniel Kim|     Male|
+-----+-------------+---------+
only showing top 5 rows



In [9]:
# Re-display the first five rows of df3 before applying the gender mapping.
df3.show(n=5)

+-----+-------------+---------+---------+----------+---------+------+
|empId|      empName|empGender|empSalary|empCountry|empOrigin|empTax|
+-----+-------------+---------+---------+----------+---------+------+
|    1|     John Doe|     Male|  70000.0|       USA|    India|8400.0|
|    2|  Emily Clark|   Female|  72000.0|    Canada|    India|8640.0|
|    3|Michael Smith|     Male|  68000.0|        UK|    India|8160.0|
|    4|   Sophia Lee|   Female|  75000.0| Australia|    India|9000.0|
|    5|   Daniel Kim|     Male|  69000.0|     India|    India|8280.0|
+-----+-------------+---------+---------+----------+---------+------+
only showing top 5 rows



In [10]:
# case conditions :

    #male ==> m
    #female ==> f

# df4 = df3.select("empId", 
#                  "empName", 
#                  ("empGender"), 
#                  "empSalary", 
#                  "empCountry", 
#                  "empOrigin", 
#                  "empTax")

# df4.show(5)

+-----+-------------+---------+---------+----------+---------+------+
|empId|      empName|empGender|empSalary|empCountry|empOrigin|empTax|
+-----+-------------+---------+---------+----------+---------+------+
|    1|     John Doe|     Male|  70000.0|       USA|    India|8400.0|
|    2|  Emily Clark|   Female|  72000.0|    Canada|    India|8640.0|
|    3|Michael Smith|     Male|  68000.0|        UK|    India|8160.0|
|    4|   Sophia Lee|   Female|  75000.0| Australia|    India|9000.0|
|    5|   Daniel Kim|     Male|  69000.0|     India|    India|8280.0|
+-----+-------------+---------+---------+----------+---------+------+
only showing top 5 rows



In [11]:
# Case conditions: abbreviate the gender label.
#   "Male"   ==> "M"
#   "Female" ==> "F"
#   any other value (including null) is passed through unchanged

from pyspark.sql.functions import when

df4 = df3.select(
    "empId",
    "empName",
    # Map each known label explicitly. A bare .otherwise('F') would label
    # every non-"Male" value -- nulls and unexpected strings included -- as
    # female, so unknown values fall back to the original column instead.
    when(df3.empGender == "Male", "M")
    .when(df3.empGender == "Female", "F")
    .otherwise(df3.empGender)
    .alias("empGender"),
    "empSalary",
    "empCountry",
    "empOrigin",
    "empTax",
)

df4.show(5)

+-----+-------------+---------+---------+----------+---------+------+
|empId|      empName|empGender|empSalary|empCountry|empOrigin|empTax|
+-----+-------------+---------+---------+----------+---------+------+
|    1|     John Doe|        M|  70000.0|       USA|    India|8400.0|
|    2|  Emily Clark|        F|  72000.0|    Canada|    India|8640.0|
|    3|Michael Smith|        M|  68000.0|        UK|    India|8160.0|
|    4|   Sophia Lee|        F|  75000.0| Australia|    India|9000.0|
|    5|   Daniel Kim|        M|  69000.0|     India|    India|8280.0|
+-----+-------------+---------+---------+----------+---------+------+
only showing top 5 rows



In [12]:
# orderBy() / sort(): sort the rows of a DataFrame.
# The two methods give the same answer, and either one accepts .asc() or
# .desc() on the column to choose the direction, for example:
# df4.orderBy(df4.empSalary.desc()).show(5)

# Lowest salaries first; ascending is the default, spelled out here.
df4.sort(df4["empSalary"].asc()).show(n=5)


+-----+--------------+---------+---------+----------+---------+------+
|empId|       empName|empGender|empSalary|empCountry|empOrigin|empTax|
+-----+--------------+---------+---------+----------+---------+------+
|   11|Logan Martinez|        M|  68000.0|    Mexico|    India|8160.0|
|    3| Michael Smith|        M|  68000.0|        UK|    India|8160.0|
|    5|    Daniel Kim|        M|  69000.0|     India|    India|8280.0|
|    5|    Daniel Kim|        M|  69000.0|     India|    India|8280.0|
|    8|   Ava Johnson|        F|  69500.0|   Germany|    India|8340.0|
+-----+--------------+---------+---------+----------+---------+------+
only showing top 5 rows



In [13]:
# df4 itself is unchanged by the sort above; show its first five rows again.
df4.show(n=5)

+-----+-------------+---------+---------+----------+---------+------+
|empId|      empName|empGender|empSalary|empCountry|empOrigin|empTax|
+-----+-------------+---------+---------+----------+---------+------+
|    1|     John Doe|        M|  70000.0|       USA|    India|8400.0|
|    2|  Emily Clark|        F|  72000.0|    Canada|    India|8640.0|
|    3|Michael Smith|        M|  68000.0|        UK|    India|8160.0|
|    4|   Sophia Lee|        F|  75000.0| Australia|    India|9000.0|
|    5|   Daniel Kim|        M|  69000.0|     India|    India|8280.0|
+-----+-------------+---------+---------+----------+---------+------+
only showing top 5 rows



In [16]:
# dropDuplicates(): remove duplicate rows.

# Unordered variant:
# df4.dropDuplicates().show()

# Show the de-duplicated rows ordered by employee id ...
(
    df4
    .dropDuplicates()
    .orderBy("empId")
    .show()
)
# ... and leave the number of remaining rows as the cell's output value.
df4.dropDuplicates().count()

+-----+----------------+---------+---------+-----------+---------+------+
|empId|         empName|empGender|empSalary| empCountry|empOrigin|empTax|
+-----+----------------+---------+---------+-----------+---------+------+
|    1|        John Doe|        M|  70000.0|        USA|    India|8400.0|
|    2|     Emily Clark|        F|  72000.0|     Canada|    India|8640.0|
|    3|   Michael Smith|        M|  68000.0|         UK|    India|8160.0|
|    4|      Sophia Lee|        F|  75000.0|  Australia|    India|9000.0|
|    5|      Daniel Kim|        M|  69000.0|      India|    India|8280.0|
|    6|    Olivia Brown|        F|  71000.0|        USA|    India|8520.0|
|    7|    James Wilson|        M|  70500.0|     Canada|    India|8460.0|
|    8|     Ava Johnson|        F|  69500.0|    Germany|    India|8340.0|
|    9|     Ethan Davis|        M|  73000.0|         UK|    India|8760.0|
|   10| Isabella Garcia|        F|  74000.0|        USA|    India|8880.0|
|   11|  Logan Martinez|        M|  68

15

In [17]:
# df4 still contains the duplicate rows; show() prints up to 20 rows by default.
df4.show(n=20)

+-----+----------------+---------+---------+-----------+---------+------+
|empId|         empName|empGender|empSalary| empCountry|empOrigin|empTax|
+-----+----------------+---------+---------+-----------+---------+------+
|    1|        John Doe|        M|  70000.0|        USA|    India|8400.0|
|    2|     Emily Clark|        F|  72000.0|     Canada|    India|8640.0|
|    3|   Michael Smith|        M|  68000.0|         UK|    India|8160.0|
|    4|      Sophia Lee|        F|  75000.0|  Australia|    India|9000.0|
|    5|      Daniel Kim|        M|  69000.0|      India|    India|8280.0|
|    5|      Daniel Kim|        M|  69000.0|      India|    India|8280.0|
|    6|    Olivia Brown|        F|  71000.0|        USA|    India|8520.0|
|    7|    James Wilson|        M|  70500.0|     Canada|    India|8460.0|
|    8|     Ava Johnson|        F|  69500.0|    Germany|    India|8340.0|
|    9|     Ethan Davis|        M|  73000.0|         UK|    India|8760.0|
|   10| Isabella Garcia|        F|  74

In [23]:
# distinct(): display only the unique records.

# Unordered variant:
# df4.distinct().show()

(
    df4
    .distinct()
    .orderBy("empId")
    .show()
)

+-----+----------------+---------+---------+-----------+---------+------+
|empId|         empName|empGender|empSalary| empCountry|empOrigin|empTax|
+-----+----------------+---------+---------+-----------+---------+------+
|    1|        John Doe|        M|  70000.0|        USA|    India|8400.0|
|    2|     Emily Clark|        F|  72000.0|     Canada|    India|8640.0|
|    3|   Michael Smith|        M|  68000.0|         UK|    India|8160.0|
|    4|      Sophia Lee|        F|  75000.0|  Australia|    India|9000.0|
|    5|      Daniel Kim|        M|  69000.0|      India|    India|8280.0|
|    6|    Olivia Brown|        F|  71000.0|        USA|    India|8520.0|
|    7|    James Wilson|        M|  70500.0|     Canada|    India|8460.0|
|    8|     Ava Johnson|        F|  69500.0|    Germany|    India|8340.0|
|    9|     Ethan Davis|        M|  73000.0|         UK|    India|8760.0|
|   10| Isabella Garcia|        F|  74000.0|        USA|    India|8880.0|
|   11|  Logan Martinez|        M|  68