In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=2fc76b7d43c0c558bbba97f4a71851e8750e1594e11503a20ac9c132ae1bc20c
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session used by the RDD examples below.
session_builder = SparkSession.builder.appName("RDD Transformation")
spark = session_builder.getOrCreate()

# The SparkContext is the entry point for the low-level RDD API.
sc = spark.sparkContext

In [None]:
# Distribute the integers 1..10 as an RDD and print them back from the driver.
data = list(range(1, 11))
rdd = sc.parallelize(data)
original_values = rdd.collect()
print("Original rdd:", original_values)

Original rdd: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [None]:
# map() applies the function to every element; here each value is doubled.
rdd2 = rdd.map(lambda value: value * 2)
doubled_values = rdd2.collect()
print("rdd after multiplication:", doubled_values)

rdd after multiplication: [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]


In [None]:
# filter() keeps only the elements for which the predicate is true.
rdd3 = rdd.filter(lambda value: value % 2 == 0)
even_values = rdd3.collect()
print("rdd with only even numbers:", even_values)

rdd with only even numbers: [2, 4, 6, 8, 10]


In [None]:
sentences = [
    "Hello world",
    "Pyspark is great",
    "RDD transformations are explained",
]
rdd4 = sc.parallelize(sentences)

# flatMap() flattens the per-sentence word lists into a single RDD of words.
words_rdd = rdd4.flatMap(lambda text: text.split(" "))
all_words = words_rdd.collect()
print("rdd after flatmap:", all_words)

rdd after flatmap: ['Hello', 'world', 'Pyspark', 'is', 'great', 'RDD', 'transformations', 'are', 'explained']


In [None]:
# collect() is an action: it returns the RDD's elements to the driver as a list.
results = rdd3.collect()
print(results)

[2, 4, 6, 8, 10]


In [None]:
# count() is an action that returns the number of elements in the RDD.
count = rdd3.count()
print("Number of elements:", count)

Number of elements: 5


In [None]:
# reduce() combines the elements pairwise; with addition the result is the sum.
total = rdd.reduce(lambda left, right: left + right)
print("Total sum:", total)

Total sum: 55


In [None]:
#1. Initialize SparkSession and SparkContext:
from pyspark.sql import SparkSession

# getOrCreate() hands back the session already active in this kernel, if any,
# instead of building a second one.
session_builder = SparkSession.builder.appName("Key-Value Pair RDDs Exercise")
spark = session_builder.getOrCreate()

sc = spark.sparkContext

In [None]:
# Task 1: Create an RDD from the Sales Data
# Each record is a (product name, sale amount) pair.
sales_data = [
    ("ProductA", 100), ("ProductB", 150),
    ("ProductA", 200), ("ProductC", 300),
    ("ProductB", 250), ("ProductC", 100),
]

sales_rdd = sc.parallelize(sales_data)

print("Sales RDD:")
sales_records = sales_rdd.collect()
print(sales_records)

Sales RDD:
[('ProductA', 100), ('ProductB', 150), ('ProductA', 200), ('ProductC', 300), ('ProductB', 250), ('ProductC', 100)]


In [None]:
# Task 2: Group Data by Product Name
grouped_sales_rdd = sales_rdd.groupByKey()

# Convert each group's values to a plain list before bringing them to the
# driver, so they print as [100, 200] style lists.
grouped_sales_as_lists = grouped_sales_rdd.mapValues(list)
grouped_sales_data = grouped_sales_as_lists.collect()

print("Grouped Sales RDD:")
for product, sales in grouped_sales_data:
    print(f"{product}: {sales}")


Grouped Sales RDD:
ProductA: [100, 200]
ProductB: [150, 250]
ProductC: [300, 100]


In [None]:
# Task 3: Calculate Total Sales by Product
# reduceByKey() merges all values that share a key with the given function.
total_sales_rdd = sales_rdd.reduceByKey(lambda running, amount: running + amount)

print("Total Sales by Product:")
total_sales = total_sales_rdd.collect()
print(total_sales)


Total Sales by Product:
[('ProductA', 300), ('ProductB', 400), ('ProductC', 400)]


In [None]:
# Task 4: Sort Products by Total Sales
# Sort on the total (second tuple element), largest first.
sorted_sales_rdd = total_sales_rdd.sortBy(lambda pair: pair[1], ascending=False)

print("Sorted Products by Total Sales:")
sorted_sales = sorted_sales_rdd.collect()
print(sorted_sales)


Sorted Products by Total Sales:
[('ProductB', 400), ('ProductC', 400), ('ProductA', 300)]


In [None]:
# Task 5: Filter Products with High Sales
# Keep only the (product, total) pairs whose total exceeds 200.
high_sales_rdd = total_sales_rdd.filter(lambda pair: pair[1] > 200)

print("Products with Sales Greater than 200:")
high_sales = high_sales_rdd.collect()
print(high_sales)


Products with Sales Greater than 200:
[('ProductA', 300), ('ProductB', 400), ('ProductC', 400)]


In [None]:
# Task 6: Combine Regional Sales Data
regional_sales_data = [("ProductA", 50), ("ProductC", 150)]
regional_sales_rdd = sc.parallelize(regional_sales_data)

# union() concatenates the two RDDs; the per-product totals are then
# recomputed over the combined records.
combined_sales_rdd = sales_rdd.union(regional_sales_rdd)
combined_total_sales_rdd = combined_sales_rdd.reduceByKey(
    lambda running, amount: running + amount
)

print("Combined Sales Data:")
combined_totals = combined_total_sales_rdd.collect()
print(combined_totals)


Combined Sales Data:
[('ProductA', 350), ('ProductC', 550), ('ProductB', 400)]


In [None]:
# Task 7: Count the Number of Distinct Products
# combined_total_sales_rdd comes from reduceByKey(), so it holds one record
# per product; counting its records counts the distinct products.
distinct_product_count = combined_total_sales_rdd.count()
print(f"Number of Distinct Products: {distinct_product_count}")


Number of Distinct Products: 3


In [None]:
# Task 8: Identify the Product with Maximum Sales
# Pairwise reduction that keeps whichever (product, total) pair has the
# strictly larger total; on a tie the right-hand pair is kept.
max_sales_product = combined_total_sales_rdd.reduce(
    lambda best, candidate: best if best[1] > candidate[1] else candidate
)
print(f"Product with Maximum Sales: {max_sales_product}")


Product with Maximum Sales: ('ProductC', 550)


In [None]:
# Challenge Task: Calculate the Average Sales per Product
# Carry a (sum, count) pair per record so the partial results can be merged
# with reduceByKey(), which combines values on each partition before the
# shuffle. groupByKey() would instead move every individual sale across the
# cluster just to average it. This also leaves Task 2's grouped_sales_rdd
# untouched and does not depend on the builtin sum(), which a later cell
# shadows with pyspark.sql.functions.sum.
sales_sum_count_rdd = (
    combined_sales_rdd
    .mapValues(lambda amount: (amount, 1))
    .reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
)

# Divide each product's total by its number of sales.
average_sales_rdd = sales_sum_count_rdd.mapValues(
    lambda sum_count: sum_count[0] / sum_count[1]
)
print("Average Sales per Product:")
print(average_sales_rdd.collect())



Average Sales per Product:
[('ProductA', 116.66666666666667), ('ProductC', 183.33333333333334), ('ProductB', 200.0)]


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Initialize a Spark session
spark = SparkSession.builder.appName("Employee Data Analysis").getOrCreate()

# Sample employee data: (id, name, department, salary)
data = [
    (1, 'Arjun', 'IT', 75000), (2, 'Vijay', 'Finance', 85000),
    (3, 'Shalini', 'IT', 90000), (4, 'Sneha', 'HR', 50000),
    (5, 'Rahul', 'Finance', 60000), (6, 'Amit', 'IT', 55000),
]

# Column names, in the same order as the tuple fields above
columns = ['EmployeeID', 'EmployeeName', 'Department', 'Salary']

# Create the DataFrame and display it
employee_df = spark.createDataFrame(data, schema=columns)
employee_df.show()

+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Arjun|        IT| 75000|
|         2|       Vijay|   Finance| 85000|
|         3|     Shalini|        IT| 90000|
|         4|       Sneha|        HR| 50000|
|         5|       Rahul|   Finance| 60000|
|         6|        Amit|        IT| 55000|
+----------+------------+----------+------+



In [None]:
# Task 1: Filter Employees by Salary
# where() is an alias of filter(); keep employees earning more than 60000.
earns_over_60k = col('Salary') > 60000
filtered_df = employee_df.where(earns_over_60k)
filtered_df.show()

+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Arjun|        IT| 75000|
|         2|       Vijay|   Finance| 85000|
|         3|     Shalini|        IT| 90000|
+----------+------------+----------+------+



In [None]:
# Task 2: Calculate the Average Salary by Department
from pyspark.sql.functions import avg

# Group once, then aggregate the mean salary under a readable column name.
employees_by_department = employee_df.groupBy('Department')
avg_salary_df = employees_by_department.agg(
    avg('Salary').alias('AverageSalary')
)
avg_salary_df.show()

+----------+-----------------+
|Department|    AverageSalary|
+----------+-----------------+
|   Finance|          72500.0|
|        IT|73333.33333333333|
|        HR|          50000.0|
+----------+-----------------+



In [None]:
# Task 3: Sort Employees by Salary
# Highest salary first.
salary_descending = col('Salary').desc()
sorted_df = employee_df.orderBy(salary_descending)
sorted_df.show()

+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         3|     Shalini|        IT| 90000|
|         2|       Vijay|   Finance| 85000|
|         1|       Arjun|        IT| 75000|
|         5|       Rahul|   Finance| 60000|
|         6|        Amit|        IT| 55000|
|         4|       Sneha|        HR| 50000|
+----------+------------+----------+------+



In [None]:
# Task 4: Add a Bonus Column
from pyspark.sql.functions import expr  # NOTE(review): not used in this cell

# The bonus is 10% of each employee's salary.
bonus_column = col('Salary') * 0.10
employee_with_bonus_df = employee_df.withColumn('Bonus', bonus_column)
employee_with_bonus_df.show()

+----------+------------+----------+------+------+
|EmployeeID|EmployeeName|Department|Salary| Bonus|
+----------+------------+----------+------+------+
|         1|       Arjun|        IT| 75000|7500.0|
|         2|       Vijay|   Finance| 85000|8500.0|
|         3|     Shalini|        IT| 90000|9000.0|
|         4|       Sneha|        HR| 50000|5000.0|
|         5|       Rahul|   Finance| 60000|6000.0|
|         6|        Amit|        IT| 55000|5500.0|
+----------+------------+----------+------+------+



In [None]:
# Convert the entire DataFrame to CSV
# Spark writes a directory named 'employee_data1.csv' containing part files,
# not a single file. The default save mode raises an error when the target
# path already exists, so without an explicit mode this cell fails the second
# time it is run; "overwrite" makes it safely re-runnable.
employee_df.write.mode("overwrite").csv('employee_data1.csv', header=True)


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

from pyspark.sql import functions as F

spark = SparkSession.builder \
    .appName("Employee Data Handling") \
    .getOrCreate()

# Sample data with a missing name, a missing salary and a missing department
data = [
    (1, 'Arjun', 'IT', 75000),
    (2, 'Vijay', 'Finance', 85000),
    (3, None, 'IT', 90000),
    (4, 'Sneha', 'HR', None),
    (5, 'Rahul', None, 60000),
    (6, 'Amit', 'IT', 55000)
]

columns = ['EmployeeID', 'EmployeeName', 'Department', 'Salary']

employee_df = spark.createDataFrame(data, columns)

# Show the DataFrame
employee_df.show()

# Fill null values in 'EmployeeName' and 'Department' with 'Unknown'
filled_df = employee_df.fillna({'EmployeeName': 'Unknown', 'Department': 'Unknown'})
filled_df.show()

# Drop rows where 'Salary' is null
dropped_null_salary_df = employee_df.dropna(subset=['Salary'])
dropped_null_salary_df.show()

# Fill null values in 'Salary' with 50000
salary_filled_df = employee_df.fillna({'Salary': 50000})
salary_filled_df.show()

# Check for null values in the entire DataFrame: one True/False flag per cell.
null_flags_df = employee_df.select(
    [col(c).isNull().alias(c) for c in employee_df.columns]
)
null_flags_df.show()

# Number of nulls per column. show() returns None, so the DataFrame is
# assigned first and displayed in a separate statement.
null_counts = employee_df.select(
    [F.sum(col(c).isNull().cast('int')).alias(c) for c in employee_df.columns]
)
null_counts.show()

# Replace all null values in the DataFrame with 'N/A'
# na.fill() with a string value only touches string columns, so the numeric
# columns are cast to string first; otherwise the null Salary would remain.
# The resulting DataFrame is for display: every column in it is a string.
na_filled_df = employee_df.select(
    [col(c).cast('string').alias(c) for c in employee_df.columns]
).na.fill('N/A')
na_filled_df.show()



+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Arjun|        IT| 75000|
|         2|       Vijay|   Finance| 85000|
|         3|        NULL|        IT| 90000|
|         4|       Sneha|        HR|  NULL|
|         5|       Rahul|      NULL| 60000|
|         6|        Amit|        IT| 55000|
+----------+------------+----------+------+

+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Arjun|        IT| 75000|
|         2|       Vijay|   Finance| 85000|
|         3|     Unknown|        IT| 90000|
|         4|       Sneha|        HR|  NULL|
|         5|       Rahul|   Unknown| 60000|
|         6|        Amit|        IT| 55000|
+----------+------------+----------+------+

+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+-----

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
# Initialize a Spark session
spark = SparkSession.builder \
    .appName("Advanced DataFrame Operations") \
    .getOrCreate()

# Create two sample DataFrames
data1 = [
    (1, 'Arjun', 'IT', 75000, '2022-01-15'),
    (2, 'Vijay', 'Finance', 85000, '2022-03-12'),
    (3, 'Shalini', 'IT', 90000, '2021-06-30'),
]
data2 = [
    (4, 'Sneha', 'HR', 50000, '2022-05-01'),
    (5, 'Rahul', 'Finance', 60000, '2022-08-20'),
    (6, 'Amit', 'IT', 55000, '2021-12-15'),
]

# Define schema (columns)
columns = ['EmployeeID', 'EmployeeName', 'Department', 'Salary', 'JoiningDate']

# Create DataFrames
employee_df1 = spark.createDataFrame(data1, columns)
employee_df2 = spark.createDataFrame(data2, columns)

# Union of two DataFrames (removes duplicates)
union_df = employee_df1.union(employee_df2).dropDuplicates()
union_df.show()

# Union of two DataFrames (includes duplicates)
union_all_df = employee_df1.union(employee_df2)
union_all_df.show()

# All column functions are reached through the `F` alias imported at the top
# of this cell. That keeps the cell runnable on a fresh kernel (it no longer
# relies on `col` leaking from an earlier cell) and avoids shadowing Python's
# builtin sum() with pyspark.sql.functions.sum.

# Define a window specification to rank employees by salary within each department
window_spec = Window.partitionBy("Department").orderBy(F.col("Salary").desc())

# Add a rank column to the DataFrame
ranked_df = union_all_df.withColumn("Rank", F.rank().over(window_spec))
ranked_df.show()

# Calculate the running total of salaries across all employees, in EmployeeID
# order. The window has no partitionBy, so Spark processes it in a single
# partition; acceptable for this tiny sample only.
window_spec_sum = Window.orderBy("EmployeeID") \
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
running_total_df = union_all_df.withColumn(
    "RunningTotal", F.sum(F.col("Salary")).over(window_spec_sum)
)
running_total_df.show()

# Convert JoiningDate from string to date type
date_converted_df = union_all_df.withColumn(
    "JoiningDate", F.to_date(F.col("JoiningDate"), "yyyy-MM-dd")
)
date_converted_df.show()

# Calculate the number of years since joining (days elapsed / 365, 2 decimals)
experience_df = date_converted_df.withColumn(
    "YearsOfExperience",
    F.round(F.datediff(F.current_date(), F.col("JoiningDate")) / 365, 2),
)
experience_df.show()

# Add a new column for next evaluation date (365 days after joining)
eval_date_df = date_converted_df.withColumn(
    "NextEvaluationDate", F.date_add(F.col("JoiningDate"), 365)
)
eval_date_df.show()

# Calculate average salary per department
avg_salary_df = union_all_df.groupBy("Department").agg(
    F.avg("Salary").alias("AverageSalary")
)
avg_salary_df.show()

# Calculate the total number of employees
total_employees_df = union_all_df.agg(
    F.count("EmployeeID").alias("TotalEmployees")
)
total_employees_df.show()

# Convert employee names to uppercase
upper_name_df = union_all_df.withColumn(
    "EmployeeNameUpper", F.upper(F.col("EmployeeName"))
)
upper_name_df.show()

+----------+------------+----------+------+-----------+
|EmployeeID|EmployeeName|Department|Salary|JoiningDate|
+----------+------------+----------+------+-----------+
|         1|       Arjun|        IT| 75000| 2022-01-15|
|         3|     Shalini|        IT| 90000| 2021-06-30|
|         2|       Vijay|   Finance| 85000| 2022-03-12|
|         4|       Sneha|        HR| 50000| 2022-05-01|
|         5|       Rahul|   Finance| 60000| 2022-08-20|
|         6|        Amit|        IT| 55000| 2021-12-15|
+----------+------------+----------+------+-----------+

+----------+------------+----------+------+-----------+
|EmployeeID|EmployeeName|Department|Salary|JoiningDate|
+----------+------------+----------+------+-----------+
|         1|       Arjun|        IT| 75000| 2022-01-15|
|         2|       Vijay|   Finance| 85000| 2022-03-12|
|         3|     Shalini|        IT| 90000| 2021-06-30|
|         4|       Sneha|        HR| 50000| 2022-05-01|
|         5|       Rahul|   Finance| 60000| 202

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Initialize a Spark session
session_builder = SparkSession.builder.appName(
    "Advanced DataFrame Operations - Different Dataset"
)
spark = session_builder.getOrCreate()

# Two batches of product sales: (id, name, category, price, sale date)
data1 = [
    (1, 'Product A', 'Electronics', 1200, '2022-05-10'),
    (2, 'Product B', 'Clothing', 500, '2022-07-15'),
    (3, 'Product C', 'Electronics', 1800, '2021-11-05'),
]
data2 = [
    (4, 'Product D', 'Furniture', 3000, '2022-03-25'),
    (5, 'Product E', 'Clothing', 800, '2022-09-12'),
    (6, 'Product F', 'Electronics', 1500, '2021-10-19'),
]

# Column names shared by both batches
columns = ['ProductID', 'ProductName', 'Category', 'Price', 'SaleDate']

# One DataFrame per batch
sales_df1 = spark.createDataFrame(data1, schema=columns)
sales_df2 = spark.createDataFrame(data2, schema=columns)

In [None]:
# Task 1: Union of DataFrames and remove duplicates
# union() keeps every row from both inputs; distinct() then drops repeats.
stacked_sales_df = sales_df1.union(sales_df2)
union_df_distinct = stacked_sales_df.distinct()
union_df_distinct.show()


+---------+-----------+-----------+-----+----------+
|ProductID|ProductName|   Category|Price|  SaleDate|
+---------+-----------+-----------+-----+----------+
|        1|  Product A|Electronics| 1200|2022-05-10|
|        2|  Product B|   Clothing|  500|2022-07-15|
|        3|  Product C|Electronics| 1800|2021-11-05|
|        4|  Product D|  Furniture| 3000|2022-03-25|
|        6|  Product F|Electronics| 1500|2021-10-19|
|        5|  Product E|   Clothing|  800|2022-09-12|
+---------+-----------+-----------+-----+----------+



In [None]:
# Task 2: Union of DataFrames including duplicates
# union() matches columns by position and does not de-duplicate rows.
union_df_all = sales_df1.union(sales_df2)
union_df_all.show()


+---------+-----------+-----------+-----+----------+
|ProductID|ProductName|   Category|Price|  SaleDate|
+---------+-----------+-----------+-----+----------+
|        1|  Product A|Electronics| 1200|2022-05-10|
|        2|  Product B|   Clothing|  500|2022-07-15|
|        3|  Product C|Electronics| 1800|2021-11-05|
|        4|  Product D|  Furniture| 3000|2022-03-25|
|        5|  Product E|   Clothing|  800|2022-09-12|
|        6|  Product F|Electronics| 1500|2021-10-19|
+---------+-----------+-----------+-----+----------+



In [None]:
# Task 3: Rank Products by Price Within Their Category
# Within each category, order by price from highest to lowest.
window_spec_rank = Window.partitionBy("Category").orderBy(F.desc("Price"))
price_rank = F.rank().over(window_spec_rank)
ranked_df = union_df_all.withColumn("Rank", price_rank)
ranked_df.show()

+---------+-----------+-----------+-----+----------+----+
|ProductID|ProductName|   Category|Price|  SaleDate|Rank|
+---------+-----------+-----------+-----+----------+----+
|        5|  Product E|   Clothing|  800|2022-09-12|   1|
|        2|  Product B|   Clothing|  500|2022-07-15|   2|
|        3|  Product C|Electronics| 1800|2021-11-05|   1|
|        6|  Product F|Electronics| 1500|2021-10-19|   2|
|        1|  Product A|Electronics| 1200|2022-05-10|   3|
|        4|  Product D|  Furniture| 3000|2022-03-25|   1|
+---------+-----------+-----------+-----+----------+----+



In [None]:
# Task 4: Calculate Cumulative Price per Category
# An ordered window with no explicit frame defaults to a RANGE frame from the
# start of the partition to the current row, which includes every row tied on
# the ordering value: two products with the same Price would both show the
# same total. A ROWS frame gives a true row-by-row running sum, and ProductID
# is added as a tie-breaker so the order of equal-priced rows is deterministic.
window_spec_cum_sum = (
    Window.partitionBy("Category")
    .orderBy("Price", "ProductID")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
cumulative_df = union_df_all.withColumn(
    "CumulativePrice", F.sum("Price").over(window_spec_cum_sum)
)
cumulative_df.show()

+---------+-----------+-----------+-----+----------+---------------+
|ProductID|ProductName|   Category|Price|  SaleDate|CumulativePrice|
+---------+-----------+-----------+-----+----------+---------------+
|        2|  Product B|   Clothing|  500|2022-07-15|            500|
|        5|  Product E|   Clothing|  800|2022-09-12|           1300|
|        1|  Product A|Electronics| 1200|2022-05-10|           1200|
|        6|  Product F|Electronics| 1500|2021-10-19|           2700|
|        3|  Product C|Electronics| 1800|2021-11-05|           4500|
|        4|  Product D|  Furniture| 3000|2022-03-25|           3000|
+---------+-----------+-----------+-----+----------+---------------+



In [None]:
# Task 5: Convert SaleDate from String to Date Type
# Parse the 'yyyy-MM-dd' strings and replace the column in place.
sale_date_parsed = F.to_date(F.col("SaleDate"), "yyyy-MM-dd")
date_converted_df = union_df_all.withColumn("SaleDate", sale_date_parsed)
date_converted_df.show()

+---------+-----------+-----------+-----+----------+
|ProductID|ProductName|   Category|Price|  SaleDate|
+---------+-----------+-----------+-----+----------+
|        1|  Product A|Electronics| 1200|2022-05-10|
|        2|  Product B|   Clothing|  500|2022-07-15|
|        3|  Product C|Electronics| 1800|2021-11-05|
|        4|  Product D|  Furniture| 3000|2022-03-25|
|        5|  Product E|   Clothing|  800|2022-09-12|
|        6|  Product F|Electronics| 1500|2021-10-19|
+---------+-----------+-----------+-----+----------+



In [None]:
# Task 6: Calculate the Number of Days Since Each Sale
# datediff(end, start) returns end - start in days; the value therefore
# depends on the date the cell is run.
days_elapsed = F.datediff(F.current_date(), F.col("SaleDate"))
days_since_sale_df = date_converted_df.withColumn("DaysSinceSale", days_elapsed)
days_since_sale_df.show()

+---------+-----------+-----------+-----+----------+-------------+
|ProductID|ProductName|   Category|Price|  SaleDate|DaysSinceSale|
+---------+-----------+-----------+-----+----------+-------------+
|        1|  Product A|Electronics| 1200|2022-05-10|          848|
|        2|  Product B|   Clothing|  500|2022-07-15|          782|
|        3|  Product C|Electronics| 1800|2021-11-05|         1034|
|        4|  Product D|  Furniture| 3000|2022-03-25|          894|
|        5|  Product E|   Clothing|  800|2022-09-12|          723|
|        6|  Product F|Electronics| 1500|2021-10-19|         1051|
+---------+-----------+-----------+-----+----------+-------------+



In [None]:
# Task 7: Add a Column for the Next Sale Deadline
# The deadline is 30 days after the sale date.
deadline = F.date_add(F.col("SaleDate"), 30)
next_sale_deadline_df = date_converted_df.withColumn("NextSaleDeadline", deadline)
next_sale_deadline_df.show()

+---------+-----------+-----------+-----+----------+----------------+
|ProductID|ProductName|   Category|Price|  SaleDate|NextSaleDeadline|
+---------+-----------+-----------+-----+----------+----------------+
|        1|  Product A|Electronics| 1200|2022-05-10|      2022-06-09|
|        2|  Product B|   Clothing|  500|2022-07-15|      2022-08-14|
|        3|  Product C|Electronics| 1800|2021-11-05|      2021-12-05|
|        4|  Product D|  Furniture| 3000|2022-03-25|      2022-04-24|
|        5|  Product E|   Clothing|  800|2022-09-12|      2022-10-12|
|        6|  Product F|Electronics| 1500|2021-10-19|      2021-11-18|
+---------+-----------+-----------+-----+----------+----------------+



In [None]:
# Task 8: Calculate Total Revenue and Average Price per Category
sales_by_category = union_df_all.groupBy("Category")
revenue_avg_df = sales_by_category.agg(
    F.sum("Price").alias("TotalRevenue"),
    F.avg("Price").alias("AveragePrice"),
)
revenue_avg_df.show()

+-----------+------------+------------+
|   Category|TotalRevenue|AveragePrice|
+-----------+------------+------------+
|Electronics|        4500|      1500.0|
|   Clothing|        1300|       650.0|
|  Furniture|        3000|      3000.0|
+-----------+------------+------------+



In [None]:
# Task 9: Convert All Product Names to Lowercase
# The original ProductName column is kept; the lowercase copy is added beside it.
lowercase_name = F.lower(F.col("ProductName"))
lowercase_names_df = union_df_all.withColumn("ProductNameLower", lowercase_name)
lowercase_names_df.show()

+---------+-----------+-----------+-----+----------+----------------+
|ProductID|ProductName|   Category|Price|  SaleDate|ProductNameLower|
+---------+-----------+-----------+-----+----------+----------------+
|        1|  Product A|Electronics| 1200|2022-05-10|       product a|
|        2|  Product B|   Clothing|  500|2022-07-15|       product b|
|        3|  Product C|Electronics| 1800|2021-11-05|       product c|
|        4|  Product D|  Furniture| 3000|2022-03-25|       product d|
|        5|  Product E|   Clothing|  800|2022-09-12|       product e|
|        6|  Product F|Electronics| 1500|2021-10-19|       product f|
+---------+-----------+-----------+-----+----------+----------------+



In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for the load examples that follow.
spark = SparkSession.builder.appName("SparkSQLExample").getOrCreate()


In [None]:
# Full refresh: Load the entire dataset
# NOTE(review): the incremental-load cell reads
# "/content/sample_data/sales_data.csv"; confirm which location is intended.
df_sales = spark.read.csv(
    "/content/sales_data.csv",
    header=True,
    inferSchema=True,
)

# Apply transformations (if necessary): total_sales = quantity * price
df_transformed = df_sales.withColumn(
    "total_sales", df_sales.quantity * df_sales.price
)

# Full refresh: Partition the data by 'date' and overwrite the existing data
output_path = "/content/sample_data/partitioned_data"
full_refresh_writer = df_transformed.write.partitionBy("date").mode("overwrite")
full_refresh_writer.parquet(output_path)

# Verify partitioned data by reading it back
partitioned_df = spark.read.parquet(output_path)
partitioned_df.show()

+--------------+-----------+--------+--------+-----+-------------------+-----------+----------+
|transaction_id|customer_id| product|quantity|price|         updated_at|total_sales|      date|
+--------------+-----------+--------+--------+-----+-------------------+-----------+----------+
|             1|        101|  Laptop|       1| 1000|2024-09-01 08:00:00|       1000|2024-09-01|
|             2|        102|   Phone|       2|  500|2024-09-01 09:00:00|       1000|2024-09-01|
|             5|        105|Keyboard|       1|   50|2024-09-03 12:00:00|         50|2024-09-03|
|             6|        106|   Mouse|       3|   30|2024-09-03 13:00:00|         90|2024-09-03|
|             3|        103|  Tablet|       1|  300|2024-09-02 10:00:00|        300|2024-09-02|
|             4|        104| Monitor|       2|  200|2024-09-02 11:00:00|        400|2024-09-02|
+--------------+-----------+--------+--------+-----+-------------------+-----------+----------+



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Initialize Spark session
spark = SparkSession.builder.appName("IncrementalLoad").getOrCreate()

# Define the last ETL run timestamp; only rows updated after it are loaded
last_etl_run = '2024-09-01 00:00:00'

# Load only new or updated records since the last ETL run
sales_csv_df = spark.read.csv(
    "/content/sample_data/sales_data.csv",
    header=True,
    inferSchema=True,
)
df_incremental = sales_csv_df.where(F.col("updated_at") > F.lit(last_etl_run))

# Apply transformations (if necessary): total_sales = quantity * price
line_total = F.col("quantity") * F.col("price")
df_transformed_incremental = df_incremental.withColumn("total_sales", line_total)

# Incremental load: Append the new data to the existing partitioned dataset
# NOTE(review): the full-refresh cell writes to
# "/content/sample_data/partitioned_data", not to this path; confirm which
# dataset the append is meant to extend.
# NOTE(review): mode("append") adds the selected rows again on every run of
# this cell, so re-running it duplicates data.
output_path = "/content/sample_data/partitioned_sales_data"
incremental_writer = df_transformed_incremental.write.partitionBy("date").mode("append")
incremental_writer.parquet(output_path)

# Verify partitioned data after incremental load
partitioned_df = spark.read.parquet(output_path)
partitioned_df.show()


+--------------+-----------+--------+--------+-----+-------------------+-----------+----------+
|transaction_id|customer_id| product|quantity|price|         updated_at|total_sales|      date|
+--------------+-----------+--------+--------+-----+-------------------+-----------+----------+
|             1|        101|  Laptop|       1| 1000|2024-09-01 08:00:00|       1000|2024-09-01|
|             2|        102|   Phone|       2|  500|2024-09-01 09:00:00|       1000|2024-09-01|
|             5|        105|Keyboard|       1|   50|2024-09-03 12:00:00|         50|2024-09-03|
|             6|        106|   Mouse|       3|   30|2024-09-03 13:00:00|         90|2024-09-03|
|             3|        103|  Tablet|       1|  300|2024-09-02 10:00:00|        300|2024-09-02|
|             4|        104| Monitor|       2|  200|2024-09-02 11:00:00|        400|2024-09-02|
+--------------+-----------+--------+--------+-----+-------------------+-----------+----------+



In [None]:
! pip install ipywidgets



In [None]:
from pyspark.sql import SparkSession
import ipywidgets as widgets
from IPython.display import display

# Step 1: Initialize a Spark session
spark = SparkSession.builder.appName("PySpark with Widgets Example").getOrCreate()

# Step 2: Create a simple DataFrame of (name, age, gender, salary) rows
data = [
    ("John", 28, "Male", 60000),
    ("Jane", 32, "Female", 72000),
    ("Mike", 45, "Male", 84000),
    ("Emily", 23, "Female", 52000),
    ("Alex", 36, "Male", 67000),
]
df = spark.createDataFrame(data, ["name", "age", "gender", "salary"])

# Show the DataFrame
df.show()

# Step 3: Create widgets
# Dropdown widget to select column for filtering
column_dropdown = widgets.Dropdown(
    description="Filter By:",
    options=["age", "salary"],
    value="age",
)

# Slider widget to choose a value for filtering
slider = widgets.IntSlider(
    description="Threshold:",
    min=20,
    max=100,
    step=5,
    value=30,
    continuous_update=False,
)

# Button to trigger filtering
button = widgets.Button(description="Apply Filter")

# Output area to show the results
output = widgets.Output()

# Display the widgets
display(column_dropdown, slider, button, output)
# Step 4: Define the function to apply filtering based on widget inputs
def apply_filter(b):
    """Filter ``df`` with the current widget values and show the result.

    Reads the column name from ``column_dropdown`` and the threshold from
    ``slider``, keeps the rows where that column is greater than the
    threshold, and prints them inside the ``output`` widget.

    Args:
        b: The button instance passed in by ``on_click``; not used.
    """
    column = column_dropdown.value
    threshold = slider.value
    # NOTE(review): the slider is created with a 20-100 range, so when
    # "salary" is selected every sample row passes the filter.

    # Clear previous output
    output.clear_output()

    # Filter the DataFrame based on widget values
    df_filtered = df.filter(df[column] > threshold)

    # Show the filtered DataFrame
    with output:
        # f-string placeholders need braces; with parentheses the literal
        # text "(column) > (threshold)" was printed instead of the values.
        print(f"Filtering by {column} > {threshold}")
        df_filtered.show()


# Step 5: Attach the function to the button click event
button.on_click(apply_filter)

+-----+---+------+------+
| name|age|gender|salary|
+-----+---+------+------+
| John| 28|  Male| 60000|
| Jane| 32|Female| 72000|
| Mike| 45|  Male| 84000|
|Emily| 23|Female| 52000|
| Alex| 36|  Male| 67000|
+-----+---+------+------+



Dropdown(description='Filter By:', options=('age', 'salary'), value='age')

IntSlider(value=30, continuous_update=False, description='Threshold:', min=20, step=5)

Button(description='Apply Filter', style=ButtonStyle())

Output()