In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=c5bdaec8e5fb6ac051a25e4cd3455a9502008e0576c656c01341cdb6dfca7a2e
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start the Spark session used by the employee exercises
# (getOrCreate hands back an existing session when there is one).
spark = (
    SparkSession.builder
    .appName("Employee Data Analysis")
    .getOrCreate()
)

# Column names, listed in the same order as the fields of each record below.
columns = ['EmployeeID', 'EmployeeName', 'Department', 'Salary']

# Hard-coded sample records: (id, name, department, salary).
data = [
    (1, 'Arjun', 'IT', 75000),
    (2, 'Vijay', 'Finance', 85000),
    (3, 'Shalini', 'IT', 90000),
    (4, 'Sneha', 'HR', 50000),
    (5, 'Rahul', 'Finance', 60000),
    (6, 'Amit', 'IT', 55000),
]

# Build the DataFrame from the records; only column names are supplied here,
# no explicit types.
employee_df = spark.createDataFrame(data, columns)

# Print the table so the starting data is visible in the notebook.
employee_df.show()

+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Arjun|        IT| 75000|
|         2|       Vijay|   Finance| 85000|
|         3|     Shalini|        IT| 90000|
|         4|       Sneha|        HR| 50000|
|         5|       Rahul|   Finance| 60000|
|         6|        Amit|        IT| 55000|
+----------+------------+----------+------+



Tasks

In [None]:
# Task 1: Filter employees by salary
#   Keep the employees whose salary is strictly greater than 60,000 and
#   display the result.
#   Uses `DataFrame.filter`, as the exercise hint asks, instead of indexing
#   the DataFrame with a boolean column; the rows selected are the same.

filtered_Salary_df = employee_df.filter(col("Salary") > 60000)
filtered_Salary_df.show()


# Task 2: Average salary by department
#   Group the employees on `Department` and compute the mean of `Salary`
#   for every group (`groupBy` + `avg`).

average_Salary_df = (
    employee_df
    .groupBy("Department")
    .avg("Salary")
)
average_Salary_df.show()

# Task 3: Sort employees by salary
#   Order the rows on the `Salary` column, highest salary first (`orderBy`).

sorted_Salary_df = employee_df.orderBy("Salary", ascending=False)
sorted_Salary_df.show()

# Task 4: Add a bonus column
#   Append a `Bonus` column equal to 10% of each employee's salary
#   (`withColumn` returns a new DataFrame; `employee_df` is left as it was).

bonus_df = employee_df.withColumn(
    "Bonus",
    employee_df["Salary"] * 0.1,
)
bonus_df.show()

+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Arjun|        IT| 75000|
|         2|       Vijay|   Finance| 85000|
|         3|     Shalini|        IT| 90000|
+----------+------------+----------+------+

+----------+-----------------+
|Department|      avg(Salary)|
+----------+-----------------+
|   Finance|          72500.0|
|        IT|73333.33333333333|
|        HR|          50000.0|
+----------+-----------------+

+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         3|     Shalini|        IT| 90000|
|         2|       Vijay|   Finance| 85000|
|         1|       Arjun|        IT| 75000|
|         5|       Rahul|   Finance| 60000|
|         6|        Amit|        IT| 55000|
|         4|       Sneha|        HR| 50000|
+----------+------------+----------+------+

+----------+------------+-------

Evening Exercises

In [8]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
# NOTE(review): importing `sum` here rebinds the name `sum` in the notebook
# namespace to the Spark column function, hiding Python's built-in `sum`.
# The cells visible in this notebook call `F.sum` rather than the bare name.
from pyspark.sql.functions import rank, col, sum

# Obtain the Spark session for the product-sales exercises
# (getOrCreate hands back an existing session when there is one).
spark = (
    SparkSession.builder
    .appName("Advanced DataFrame Operations - Different Dataset")
    .getOrCreate()
)

# Column names shared by both sales DataFrames.
columns = ['ProductID', 'ProductName', 'Category', 'Price', 'SaleDate']

# First batch of sales records; SaleDate is supplied as a 'YYYY-MM-DD' string.
data1 = [
    (1, 'Product A', 'Electronics', 1200, '2022-05-10'),
    (2, 'Product B', 'Clothing', 500, '2022-07-15'),
    (3, 'Product C', 'Electronics', 1800, '2021-11-05'),
]

# Second batch of sales records, same layout as the first.
data2 = [
    (4, 'Product D', 'Furniture', 3000, '2022-03-25'),
    (5, 'Product E', 'Clothing', 800, '2022-09-12'),
    (6, 'Product F', 'Electronics', 1500, '2021-10-19'),
]

# One DataFrame per batch, both built with the same column names.
sales_df1 = spark.createDataFrame(data1, columns)
sales_df2 = spark.createDataFrame(data2, columns)

# Display both inputs before combining them.
sales_df1.show()
sales_df2.show()

+---------+-----------+-----------+-----+----------+
|ProductID|ProductName|   Category|Price|  SaleDate|
+---------+-----------+-----------+-----+----------+
|        1|  Product A|Electronics| 1200|2022-05-10|
|        2|  Product B|   Clothing|  500|2022-07-15|
|        3|  Product C|Electronics| 1800|2021-11-05|
+---------+-----------+-----------+-----+----------+

+---------+-----------+-----------+-----+----------+
|ProductID|ProductName|   Category|Price|  SaleDate|
+---------+-----------+-----------+-----+----------+
|        4|  Product D|  Furniture| 3000|2022-03-25|
|        5|  Product E|   Clothing|  800|2022-09-12|
|        6|  Product F|Electronics| 1500|2021-10-19|
+---------+-----------+-----------+-----+----------+



Tasks

In [11]:
# Task 1: Union of DataFrames (removing duplicates)
#   Stack `sales_df1` on top of `sales_df2` with `union`, then drop any rows
#   that are exact duplicates of one another.
union_df = (
    sales_df1
    .union(sales_df2)
    .dropDuplicates()
)
print("Union of two dataframes (Remove duplicates): ")
union_df.show()


# Task 2: Union of DataFrames (including duplicates)
#   Combine both DataFrames and keep every row, duplicates included.
#   `DataFrame.unionAll` is deprecated in favour of `union`, which has the
#   same keep-all-rows behaviour, so `union` is called directly.
#   The variable keeps its original name because the tasks below read it.
unionAll_df = sales_df1.union(sales_df2)
print("Union of two dataframes (Include duplicates): ")
unionAll_df.show()



# Task 3: Rank products by price within their category
#   The window groups rows by `Category` and orders each group by `Price`,
#   highest first; `rank` then numbers the rows inside every group.
window_spec = Window.partitionBy("Category").orderBy(F.desc("Price"))
ranked_df = unionAll_df.withColumn(
    "Rank",
    F.rank().over(window_spec),
)
print("Rank products by price within their category: ")
ranked_df.show()


# Task 4: Cumulative price per category
#   Same partitioning and ordering as the ranking window, with an explicit
#   frame running from the first row of the partition up to the current row,
#   so the sum accumulates row by row.
window_spec_cum = (
    Window.partitionBy("Category")
    .orderBy(F.col("Price").desc())
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
cumulative_df = unionAll_df.withColumn(
    "CumulativePrice",
    F.sum(F.col("Price")).over(window_spec_cum),
)
print("Calculate cumulative price per category: ")
cumulative_df.show()

# Task 5: Convert `SaleDate` from string to date type
#   Replaces the string column in place (same column name) by parsing it
#   with the 'yyyy-MM-dd' pattern. Starts from `cumulative_df`, so the
#   CumulativePrice column is carried along.
date_converted_df = cumulative_df.withColumn(
    "SaleDate",
    F.to_date(F.col("SaleDate"), "yyyy-MM-dd"),
)
print("Convert SaleDate from string to date type: ")
date_converted_df.show()

# Task 6: Number of days since each sale
#   Difference between today's date and `SaleDate`; the values therefore
#   depend on the day the notebook is executed.
current_date = F.current_date()
days_since_sale_df = date_converted_df.withColumn(
    "DaysSinceSale",
    F.datediff(current_date, F.col("SaleDate")),
)
print("Calculate the number of days since each sale: ")
days_since_sale_df.show()

# Task 7: Next sale deadline
#   Adds a column holding the date 30 days after `SaleDate`.
next_sale_deadline_df = days_since_sale_df.withColumn(
    "NextSaleDeadline",
    F.date_add(F.col("SaleDate"), 30),
)
print("Add a column for the next sale deadline (30 days after the SaleDate): ")
next_sale_deadline_df.show()

# Task 8: Total revenue and average price per category
#   Aggregates the combined sales (`unionAll_df`) by `Category`, producing
#   the sum of `Price` as TotalRevenue and its mean as AveragePrice.
revenue_avg_df = (
    unionAll_df
    .groupBy("Category")
    .agg(
        F.sum(F.col("Price")).alias("TotalRevenue"),
        F.avg(F.col("Price")).alias("AveragePrice"),
    )
)
print("Calculate total revenue and average price per category: ")
revenue_avg_df.show()

# Task 9: Lowercase product names
#   Adds a `ProductNameLower` column with the lowercased name; the original
#   `ProductName` column is kept unchanged alongside it.
lowercase_names_df = next_sale_deadline_df.withColumn(
    "ProductNameLower",
    F.lower(F.col("ProductName")),
)
print("Convert all product names to lowercase: ")
lowercase_names_df.show()




Union of two dataframes (Remove duplicates): 
+---------+-----------+-----------+-----+----------+
|ProductID|ProductName|   Category|Price|  SaleDate|
+---------+-----------+-----------+-----+----------+
|        1|  Product A|Electronics| 1200|2022-05-10|
|        2|  Product B|   Clothing|  500|2022-07-15|
|        3|  Product C|Electronics| 1800|2021-11-05|
|        4|  Product D|  Furniture| 3000|2022-03-25|
|        6|  Product F|Electronics| 1500|2021-10-19|
|        5|  Product E|   Clothing|  800|2022-09-12|
+---------+-----------+-----------+-----+----------+

Union of two dataframes (Include duplicates): 
+---------+-----------+-----------+-----+----------+
|ProductID|ProductName|   Category|Price|  SaleDate|
+---------+-----------+-----------+-----+----------+
|        1|  Product A|Electronics| 1200|2022-05-10|
|        2|  Product B|   Clothing|  500|2022-07-15|
|        3|  Product C|Electronics| 1800|2021-11-05|
|        4|  Product D|  Furniture| 3000|2022-03-25|
|    