In [0]:
# Sample employee records. Each tuple follows the field order listed in `column`.
# NOTE(review): "2024-02-5" is not zero-padded like the other date strings —
# confirm before casting the Date column to a real date type.
data = [
    (1, "Saddam Tamboli", "Data Engineering", 95000, "2021-01-12", 5, "Pune"),
    (2, "Aditya Bhandari", "Business Intelligence", 105000, "2022-04-23", 4, "Mumbai"),
    (3, "Avishkar Dhumal", "Business Analytics", 85000, "2024-02-5", 2, "Pune"),
    (4, "Sanket Gugale", "Pharamcist", 115000, "2020-01-01", 3, "Ahmednagar"),
    (5, "Tanmay Lunia", "Contractor", 195000, "2018-01-22", 1, "Delhi"),
]

# Column names, in the same order as the tuple fields above.
column = [
    "Employee_ID",
    "Employee_Name",
    "Designation",
    "Salary",
    "Date",
    "Manager_ID",
    "Location",
]

# Only names are supplied, so Spark infers each column's type from the values.
df = spark.createDataFrame(data, schema=column)

In [0]:
# Render the employee DataFrame in the notebook output.
display(df)

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Running total of Salary, walking from the lowest salary to the highest.
#
# * The aggregate is reached through the `F` alias so that Python's builtin
#   `sum` is not replaced by the Spark column function for the rest of the
#   notebook.
# * The frame is spelled out as ROWS from the start of the window up to the
#   current row. With only an ORDER BY, Spark uses a RANGE frame, which gives
#   every row sharing the same Salary the same cumulative value instead of a
#   true row-by-row running total.
# * Employee_ID is a secondary sort key so that rows with equal salaries are
#   accumulated in a stable order.
#
# NOTE: a window without partitionBy processes all rows in a single partition;
# acceptable for this small sample.
running_total_window = (
    Window.orderBy("Salary", "Employee_ID")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
df_sum = df.withColumn("cum_sum", F.sum("Salary").over(running_total_window))

df_sum.display()

In [0]:
from pyspark.sql.functions import split, col, lit, get, when, size, element_at, trim

# Break Employee_Name into whitespace-separated tokens. Trimming first and
# splitting on runs of whitespace avoids empty tokens from stray spaces.
name_parts = split(trim(col("Employee_Name")), r"\s+")
part_count = size(name_parts)

# First_Name  : first token.
# Last_Name   : final token, but only when there is more than one token
#               (a single-token name yields null, as before).
# Middle_Name : second token when the name has three or more tokens,
#               otherwise the placeholder "Unknown".
# Previously Last_Name was always token 1, so a three-part name put the middle
# name into Last_Name, and Middle_Name was unconditionally "Unknown".
df_name = (
    df.withColumn("First_Name", name_parts.getItem(0))
    .withColumn("Last_Name", when(part_count > 1, element_at(name_parts, -1)))
    .withColumn(
        "Middle_Name",
        when(part_count > 2, name_parts.getItem(1)).otherwise(lit("Unknown")),
    )
)

df_name.display()


In [0]:
from pyspark.sql.functions import concat_ws, col

# Join first, middle and last name with single spaces into Full_Name.
# withColumn already names the new column, so no extra alias is needed.
name_pieces = [col("First_Name"), col("Middle_Name"), col("Last_Name")]
df_concat = df_name.withColumn("Full_Name", concat_ws(" ", *name_pieces))
display(df_concat)

                               

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank ,dense_rank ,desc

In [0]:
# Employees holding the third-highest salary overall.
#
# dense_rank() is used instead of rank(): rank() leaves gaps after ties
# (e.g. 1, 2, 2, 4), so filtering on rank == 3 could return nothing even though
# a third-highest salary exists. dense_rank() numbers distinct salaries
# consecutively, so 3 always means "third distinct salary from the top".
window_sp = Window.orderBy(desc("Salary"))
df_1 = df.withColumn("rank", dense_rank().over(window_sp))
df_salary = df_1.filter(df_1["rank"] == 3)
df_salary.display()

In [0]:
# Rank employees within each Location by Salary (highest first) and keep the
# rows ranked 1 in their location.
window_spec = Window.partitionBy("Location").orderBy(desc("Salary"))
df_grp = df.withColumn("rank", rank().over(window_spec))
df_grp_1 = df_grp.where(df_grp["rank"] == 1)
display(df_grp_1)

In [0]:
from pyspark.sql.functions import avg, count

# Per-designation statistics: mean salary and number of salary values.
salary_stats = [
    avg("Salary").alias("avg_salary"),
    count("Salary").alias("count"),
]
df_gr = df.groupBy("Designation").agg(*salary_stats)
display(df_gr)

In [0]:
# Sample department records. Each tuple follows the field order in `dept_column`.
dept_data = [
    (1, "Data Engineering", "Vishal Verma", "Bangalore"),
    (2, "Business Intelligence", "Ravi Kumar", "Mumbai"),
    (3, "Business Analytics", "Rajesh Kumar", "Pune"),
    (4, "Pharamcist", "Rakesh Kumar", "Delhi"),
    (5, "Contractor", "Rajesh Kumar", "Ahmednagar"),
]

# Column names, in the same order as the tuple fields above.
dept_column = ["Dept_ID", "Dept_Name", "Manager_Name", "Location"]

department_df = spark.createDataFrame(dept_data, schema=dept_column)
display(department_df)

In [0]:
# Free-text widget: show the employees whose Designation equals the typed value.
# The widget starts out empty, so the result is empty until a value is entered.
dbutils.widgets.text("Designation_text", "")
designation = dbutils.widgets.get("Designation_text")
filter_df = df.where(df["Designation"] == designation)
display(filter_df)



In [0]:
# Dropdown widget: pick exactly one designation and show the matching employees.
dropdown_choices = [
    "Data Engineering",
    "Business Intelligence",
    "Business Analytics",
    "Pharamcist",
    "Contractor",
]
dbutils.widgets.dropdown("Designation_dropdown", "Data Engineering", dropdown_choices)
designation_dropdown = dbutils.widgets.get("Designation_dropdown")
filter_df = df.where(df["Designation"] == designation_dropdown)
display(filter_df)

In [0]:
# Multiselect widget: pick any number of designations and show the employees
# whose Designation is among them.
multiselect_choices = [
    "Data Engineering",
    "Business Intelligence",
    "Business Analytics",
    "Pharamcist",
    "Contractor",
]
dbutils.widgets.multiselect("Designation_multiselect", "Data Engineering", multiselect_choices)
designation_multiselect = dbutils.widgets.get("Designation_multiselect")
# The widget value is one comma-separated string; split it into a list.
designation_list = designation_multiselect.split(",")
filtered_df = df.where(df["Designation"].isin(designation_list))
display(filtered_df)

In [0]:
# Show the unmodified employee DataFrame again before joining it.
display(df)

In [0]:
# Inner-join employees to departments on employee Manager_ID == department Dept_ID.
#
# Both inputs carry a column named "Location". Joining them as-is produces two
# identically named columns, and any later reference to "Location" on the
# result becomes ambiguous. The department-side column is therefore renamed
# to "Dept_Location" before the join.
#
# The join keys are taken from their owning DataFrames rather than via bare
# col(...) lookups, so the condition stays unambiguous even if either side
# later gains a same-named column.
dept_for_join = department_df.withColumnRenamed("Location", "Dept_Location")
df_join_dept = df.join(
    dept_for_join,
    df["Manager_ID"] == dept_for_join["Dept_ID"],
    "inner",
)
df_join_dept.display()

In [0]:
# Persist the employee data as the table "employee", replacing any existing
# contents, then read it back with SQL and display it.
employee_writer = df.write.mode("overwrite")
employee_writer.saveAsTable("employee")

employee_rows = spark.sql("select * from employee")
display(employee_rows)

In [0]:
%sql
-- Show the history of the employee table.
DESCRIBE HISTORY employee;

In [0]:
# Recreate the "employee" table from scratch, this time partitioned by Location:
# drop the existing table (if any) first, then write the DataFrame again.
spark.sql("drop table if exists employee")

partitioned_writer = df.write.partitionBy("Location").mode("overwrite")
partitioned_writer.saveAsTable("employee")


In [0]:
# End the notebook run and pass "Success" to dbutils.notebook.exit as the exit value.
exit_status = "Success"
dbutils.notebook.exit(exit_status)