<a href="https://colab.research.google.com/github/TanishqLambhate/Data-Science-Training/blob/pyspark/Pyspark_Advanced_day_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pyspark

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
spark=SparkSession.builder.appName('Advanced DataFrame Operations').getOrCreate()
data1=[
    (1,'Arjun','IT',75000,'2022-01-15'),
    (2,'Vijay','Finance',85000,'2022-03-12'),
    (3,'Shalini','IT',90000,'2021-06-30')
]
data2=[
    (4,'Sneha','HR',50000,'2022-05-01'),
    (5,'Rahul','Finance',60000,'2022-08-20'),
    (6,'Amit','IT',55000,'2021-12-15')
]
df1=spark.createDataFrame(data1,['emp_id','emp_name','dept','salary','join_date'])
df2=spark.createDataFrame(data2,['emp_id','emp_name','dept','salary','join_date'])
df1.show()
df2.show()

#Union of two DataFrames (remove duplicates)
union_df=df1.union(df2).dropDuplicates()
union_df.show()

#Union of two DataFrames (including duplicates)
union_all_df=df1.union(df2)
union_all_df.show()

from pyspark.sql.functions import col,rank

#Define a window specification to rank employees by salary within each department
window_spec=Window.partitionBy('dept').orderBy(col("Salary").desc())

#Add a rank column to the DataFrame
ranked_df=union_all_df.withColumn('rank',rank().over(window_spec))
ranked_df.show()


+------+--------+-------+------+----------+
|emp_id|emp_name|   dept|salary| join_date|
+------+--------+-------+------+----------+
|     1|   Arjun|     IT| 75000|2022-01-15|
|     2|   Vijay|Finance| 85000|2022-03-12|
|     3| Shalini|     IT| 90000|2021-06-30|
+------+--------+-------+------+----------+

+------+--------+-------+------+----------+
|emp_id|emp_name|   dept|salary| join_date|
+------+--------+-------+------+----------+
|     4|   Sneha|     HR| 50000|2022-05-01|
|     5|   Rahul|Finance| 60000|2022-08-20|
|     6|    Amit|     IT| 55000|2021-12-15|
+------+--------+-------+------+----------+

+------+--------+-------+------+----------+
|emp_id|emp_name|   dept|salary| join_date|
+------+--------+-------+------+----------+
|     1|   Arjun|     IT| 75000|2022-01-15|
|     2|   Vijay|Finance| 85000|2022-03-12|
|     3| Shalini|     IT| 90000|2021-06-30|
|     4|   Sneha|     HR| 50000|2022-05-01|
|     5|   Rahul|Finance| 60000|2022-08-20|
|     6|    Amit|     IT| 5500

In [8]:
#Define a window specification for cumulative sum of salaries within each department
window_spec_sum=Window.partitionBy('dept').orderBy('join_date').rowsBetween(Window.unboundedPreceding ,Window.currentRow)

from pyspark.sql.functions import sum
#Calculate the running total of salries
running_total_df=union_all_df.withColumn("RunningTotal",sum(col('salary')).over(window_spec_sum))
running_total_df.show()


+------+--------+-------+------+----------+------------+
|emp_id|emp_name|   dept|salary| join_date|RunningTotal|
+------+--------+-------+------+----------+------------+
|     2|   Vijay|Finance| 85000|2022-03-12|       85000|
|     5|   Rahul|Finance| 60000|2022-08-20|      145000|
|     4|   Sneha|     HR| 50000|2022-05-01|       50000|
|     3| Shalini|     IT| 90000|2021-06-30|       90000|
|     6|    Amit|     IT| 55000|2021-12-15|      145000|
|     1|   Arjun|     IT| 75000|2022-01-15|      220000|
+------+--------+-------+------+----------+------------+



In [9]:
#Convert JoiningDate from string to data type
date_converted_df=union_all_df.withColumn("join_date",F.to_date(col("join_date"),"yyyy-MM-dd"))
date_converted_df.show()

#calculate the number of years since joining
experience_df=date_converted_df.withColumn("years_since_joining",F.round(F.datediff(F.current_date(),col("join_date"))/365,2))

experience_df.show()


#

+------+--------+-------+------+----------+
|emp_id|emp_name|   dept|salary| join_date|
+------+--------+-------+------+----------+
|     1|   Arjun|     IT| 75000|2022-01-15|
|     2|   Vijay|Finance| 85000|2022-03-12|
|     3| Shalini|     IT| 90000|2021-06-30|
|     4|   Sneha|     HR| 50000|2022-05-01|
|     5|   Rahul|Finance| 60000|2022-08-20|
|     6|    Amit|     IT| 55000|2021-12-15|
+------+--------+-------+------+----------+

+------+--------+-------+------+----------+-------------------+
|emp_id|emp_name|   dept|salary| join_date|years_since_joining|
+------+--------+-------+------+----------+-------------------+
|     1|   Arjun|     IT| 75000|2022-01-15|               2.64|
|     2|   Vijay|Finance| 85000|2022-03-12|               2.48|
|     3| Shalini|     IT| 90000|2021-06-30|               3.18|
|     4|   Sneha|     HR| 50000|2022-05-01|               2.35|
|     5|   Rahul|Finance| 60000|2022-08-20|               2.04|
|     6|    Amit|     IT| 55000|2021-12-15|    

In [10]:
#Add a new column for next evaluation date(one year after joining)
eval_date_df=date_converted_df.withColumn("next_evaluation_date",F.date_add(col("join_date"),365))
eval_date_df.show()

#Calculate average salary per department
avg_salary_df=union_all_df.groupBy("dept").agg(F.round(F.avg("salary"),2).alias("avg_salary"))
avg_salary_df.show()

#Calculate the total number of employees
total_employees_df=union_all_df.agg(F.count("*").alias("total_employees"))
total_employees_df.show()

#Convert employee names to uppercase
upper_name_df=union_all_df.withColumn("emp_name",F.upper(col("emp_name")))
upper_name_df.show()

+------+--------+-------+------+----------+--------------------+
|emp_id|emp_name|   dept|salary| join_date|next_evaluation_date|
+------+--------+-------+------+----------+--------------------+
|     1|   Arjun|     IT| 75000|2022-01-15|          2023-01-15|
|     2|   Vijay|Finance| 85000|2022-03-12|          2023-03-12|
|     3| Shalini|     IT| 90000|2021-06-30|          2022-06-30|
|     4|   Sneha|     HR| 50000|2022-05-01|          2023-05-01|
|     5|   Rahul|Finance| 60000|2022-08-20|          2023-08-20|
|     6|    Amit|     IT| 55000|2021-12-15|          2022-12-15|
+------+--------+-------+------+----------+--------------------+

+-------+----------+
|   dept|avg_salary|
+-------+----------+
|     IT|  73333.33|
|Finance|   72500.0|
|     HR|   50000.0|
+-------+----------+

+---------------+
|total_employees|
+---------------+
|              6|
+---------------+

+------+--------+-------+------+----------+
|emp_id|emp_name|   dept|salary| join_date|
+------+--------+-----