In [None]:
pip install pyspark



In [None]:
# Exercise: Product Sales Analysis
#
# Step 1: Create DataFrames
#
# Create two DataFrames, one for products and one for sales transactions,
# then join them and analyze the combined data.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Obtain the Spark session used for the product sales exercise.
spark = (
    SparkSession.builder
    .appName("Product Sales Analysis")
    .getOrCreate()
)

# Column names, listed in the same order as the fields of the tuples below.
product_columns = ["ProductID", "ProductName", "Category", "Price"]
sales_columns = ["SaleID", "ProductID", "Quantity"]

# Product rows: (ProductID, ProductName, Category, Price).
products = [
    (1, "Laptop", "Electronics", 50000),
    (2, "Smartphone", "Electronics", 30000),
    (3, "Table", "Furniture", 15000),
    (4, "Chair", "Furniture", 5000),
    (5, "Headphones", "Electronics", 2000),
]

# Sales transaction rows: (SaleID, ProductID, Quantity).
sales = [
    (1, 1, 2),
    (2, 2, 1),
    (3, 3, 3),
    (4, 1, 1),
    (5, 4, 5),
    (6, 2, 2),
    (7, 5, 10),
    (8, 3, 1),
]

# Build one DataFrame per dataset from its rows and column names.
product_df = spark.createDataFrame(products, product_columns)
sales_df = spark.createDataFrame(sales, sales_columns)

# Display each DataFrame under its own heading.
print("Products DataFrame:")
product_df.show()

print("Sales DataFrame:")
sales_df.show()

# #### **Step 2: Perform the Following Tasks**

# Task 1: Join the DataFrames
# Pair every sale with its product details by matching rows on ProductID.
product_and_sales_df = product_df.join(
    sales_df,
    on="ProductID",
    how="inner",
)
print("Product and Sales DataFrame:")
product_and_sales_df.show()

# Task 2: Calculate Total Sales Value
# Append a TotalSales column equal to Price * Quantity for each joined row.
total_sales_df = product_and_sales_df.withColumn(
    "TotalSales",
    product_and_sales_df["Price"] * product_and_sales_df["Quantity"],
)
print("Total Sales DataFrame:")
total_sales_df.show()

# Task 3: Find the Total Sales for Each Product Category
# Sum TotalSales within each Category and give the aggregate a readable name.
total_sales_by_category_df = (
    total_sales_df
    .groupBy("Category")
    .sum("TotalSales")
    .withColumnRenamed("sum(TotalSales)", "TotalSalesCategory")
)
print("Total Sales by Product Category:")
total_sales_by_category_df.show()

# Task 4: Identify the Top-Selling Product
# Aggregate TotalSales per product. This per-product frame is reused by the
# sorting step further down, so it keeps its original name and columns.
top_selling_product_df = (
    total_sales_df
    .groupBy("ProductName")
    .sum("TotalSales")
    .withColumnRenamed("sum(TotalSales)", "TotalSalesProduct")
)

# Keep only the row with the largest total so the top seller is actually
# identified and displayed; a bare expression in the middle of a cell shows nothing.
top_product_df = (
    top_selling_product_df
    .orderBy(col("TotalSalesProduct").desc())
    .limit(1)
)
print("Top-Selling Product:")
top_product_df.show()

# Task 5: Sort the Products by Total Sales Value
# Order the per-product totals from highest to lowest.
sorted_products_df = top_selling_product_df.orderBy(
    top_selling_product_df["TotalSalesProduct"].desc()
)
print("Sorted Products by Total Sales Value:")
sorted_products_df.show()

# Task 6: Count the Number of Sales for Each Product
# One output row per product, with the number of sales rows it appears in.
sales_count_df = (
    total_sales_df
    .groupBy("ProductName")
    .count()
    .select("ProductName", col("count").alias("SalesCount"))
)
print("Number of Sales for Each Product:")
sales_count_df.show()

# Task 7: Filter the Products with Total Sales Value Greater Than ₹50,000
# The threshold applies to each product's total across all of its sales, so
# filter the per-product totals rather than individual transaction rows.
# Example: Table sells 45,000 + 15,000 = 60,000 in total and must be kept,
# although neither of its transactions exceeds the threshold on its own.
HIGH_VALUE_THRESHOLD = 50000
high_value_products_df = top_selling_product_df.filter(
    col("TotalSalesProduct") > HIGH_VALUE_THRESHOLD
)
print("Products with Total Sales Value Greater Than ₹50,000:")
high_value_products_df.show()

Products DataFrame:
+---------+-----------+-----------+-----+
|ProductID|ProductName|   Category|Price|
+---------+-----------+-----------+-----+
|        1|     Laptop|Electronics|50000|
|        2| Smartphone|Electronics|30000|
|        3|      Table|  Furniture|15000|
|        4|      Chair|  Furniture| 5000|
|        5| Headphones|Electronics| 2000|
+---------+-----------+-----------+-----+

Sales DataFrame:
+------+---------+--------+
|SaleID|ProductID|Quantity|
+------+---------+--------+
|     1|        1|       2|
|     2|        2|       1|
|     3|        3|       3|
|     4|        1|       1|
|     5|        4|       5|
|     6|        2|       2|
|     7|        5|      10|
|     8|        3|       1|
+------+---------+--------+

Product and Sales DataFrame:
+---------+-----------+-----------+-----+------+--------+
|ProductID|ProductName|   Category|Price|SaleID|Quantity|
+---------+-----------+-----------+-----+------+--------+
|        1|     Laptop|Electronics|50000|   

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, avg, count

# Obtain the Spark session used for the sales dataset analysis.
spark = (
    SparkSession.builder
    .appName("Sales Dataset Analysis")
    .getOrCreate()
)

# Read sales_data.csv with the header and schema-inference options enabled.
df = spark.read.csv(
    "sales_data.csv",
    inferSchema=True,
    header=True,
)

# Step 3: Explore the Data

# Column names and types of the loaded DataFrame.
print("DataFrame Schema:")
df.printSchema()

# A short preview limited to five rows.
print("\nFirst 5 rows of the DataFrame:")
df.show(n=5)

# Summary statistics restricted to the Quantity and Price columns.
print("\nSummary Statistics for Numeric Columns:")
df.select(["Quantity", "Price"]).summary().show()

# Step 4: Perform Data Transformations and Analysis

# 1. Total sales value of each transaction: Quantity multiplied by Price.
df_with_total = df.withColumn(
    "TotalSales",
    df["Quantity"] * df["Price"],
)
print("\nDataFrame with Total Sales:")
df_with_total.show()

# 2. Total sales per product: sum TotalSales within each ProductID.
sales_by_product = (
    df_with_total
    .groupBy("ProductID")
    .sum("TotalSales")
    .withColumnRenamed("sum(TotalSales)", "TotalProductSales")
)
print("\nTotal Sales by Product:")
sales_by_product.show()

# 3. Identify the Top-Selling Product
# Sort by total sales, highest first; ProductID breaks ties so the selected
# row does not depend on how Spark happens to order equal totals.
top_selling_product = (
    sales_by_product
    .orderBy(col("TotalProductSales").desc(), col("ProductID"))
    .first()
)

# first() returns None when the DataFrame has no rows, so guard before
# indexing into the row to avoid a TypeError on an empty dataset.
if top_selling_product is None:
    print("\nTop-Selling Product: no sales data available")
else:
    print(f"\nTop-Selling Product: ProductID {top_selling_product['ProductID']} with total sales of {top_selling_product['TotalProductSales']}")

# 4. Total sales per day: sum TotalSales within each Date value.
sales_by_date = (
    df_with_total
    .groupBy("Date")
    .sum("TotalSales")
    .withColumnRenamed("sum(TotalSales)", "DailySales")
)
print("\nTotal Sales by Date:")
sales_by_date.show()

# 5. High-value transactions: keep rows whose TotalSales exceeds 500.
high_value_transactions = df_with_total.filter(
    df_with_total["TotalSales"] > 500
)
print("\nHigh-Value Transactions (Total Sales > 500):")
high_value_transactions.show()

# Additional Challenges

# 1. Repeat customers: count transactions per CustomerID and keep only
#    customers whose count is greater than one.
repeat_customers = (
    df
    .groupBy("CustomerID")
    .agg(count(col("TransactionID")).alias("PurchaseCount"))
    .filter("PurchaseCount > 1")
)
print("\nRepeat Customers (More than one purchase):")
repeat_customers.show()

# 2. Average sale price per product: mean of Price within each ProductID.
avg_price_per_product = (
    df
    .groupBy("ProductID")
    .agg(avg(col("Price")).alias("AveragePrice"))
)
print("\nAverage Sale Price Per Product:")
avg_price_per_product.show()

DataFrame Schema:
root
 |-- TransactionID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- ProductID: integer (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- Date: date (nullable = true)


First 5 rows of the DataFrame:
+-------------+----------+---------+--------+-----+----------+
|TransactionID|CustomerID|ProductID|Quantity|Price|      Date|
+-------------+----------+---------+--------+-----+----------+
|            1|       101|      501|       2|150.0|2024-09-01|
|            2|       102|      502|       1|250.0|2024-09-01|
|            3|       103|      501|       4|150.0|2024-09-02|
|            4|       101|      503|       3|300.0|2024-09-02|
|            5|       104|      504|       1|450.0|2024-09-03|
+-------------+----------+---------+--------+-----+----------+
only showing top 5 rows


Summary Statistics for Numeric Columns:
+-------+-----------------+-----------------+
|summary|         Quan

In [None]:
from pyspark.sql import SparkSession
from operator import add

# Obtain the Spark session and, from it, the SparkContext used for RDD work.
spark = (
    SparkSession.builder
    .appName("Key-Value Pair RDD Exercise")
    .getOrCreate()
)
sc = spark.sparkContext

# Dataset: (product name, sales amount) pairs; a product may appear more than once.
sales_data = [
    ("ProductA", 100),
    ("ProductB", 150),
    ("ProductA", 200),
    ("ProductC", 300),
    ("ProductB", 250),
    ("ProductC", 100),
]

# Extra regional (product name, sales amount) pairs, combined in Task 6.
regional_sales_data = [
    ("ProductA", 50),
    ("ProductC", 150),
]

# Step 2: Create and Explore the RDD
# Task 1: distribute the sales pairs as an RDD and peek at the first three.
sales_rdd = sc.parallelize(sales_data)
print("First few elements of the RDD:")
print(sales_rdd.take(num=3))

# Step 3: Grouping and Aggregating Data
# Task 2: gather all sales amounts under their product name.
grouped_sales = sales_rdd.groupByKey()
print("\nGrouped data structure:")
# Convert each group's iterable of amounts to a list so it prints readably.
print(grouped_sales.map(lambda pair: (pair[0], list(pair[1]))).collect())

# Task 3: add up the sales amounts of each product.
total_sales = sales_rdd.reduceByKey(lambda left, right: left + right)
print("\nTotal sales for each product:")
print(total_sales.collect())

# Task 4: order the per-product totals from largest to smallest amount.
sorted_sales = total_sales.sortBy(
    keyfunc=lambda pair: pair[1],
    ascending=False,
)
print("\nSorted list of products by sales:")
print(sorted_sales.collect())

# Step 4: Additional Transformations
# Task 5: keep only the products whose total sales exceed 200.
high_sales = total_sales.filter(lambda pair: pair[1] > 200)
print("\nProducts with sales greater than 200:")
print(high_sales.collect())

# Task 6: append the regional pairs to the original sales pairs, then
# recompute the per-product totals over the combined data.
regional_rdd = sc.parallelize(regional_sales_data)
combined_sales = sales_rdd + regional_rdd  # RDD addition is a union
new_total_sales = combined_sales.reduceByKey(add)
print("\nCombined sales data:")
print(new_total_sales.collect())

# Step 5: Perform Actions on the RDD
# Task 7: count how many different product names occur in the sales data.
distinct_products = sales_rdd.map(lambda pair: pair[0]).distinct().count()
print(f"\nNumber of distinct products: {distinct_products}")

# Task 8: pick the (product, total) pair with the largest total from the
# combined totals; when two totals are equal the later pair is kept.
max_sales_product = new_total_sales.reduce(
    lambda x, y: y if x[1] <= y[1] else x
)
print(f"\nProduct with maximum sales: {max_sales_product[0]}, Amount: {max_sales_product[1]}")

# Challenge Task: average sale amount per product.
# Pair every amount with a count of 1, then add amounts and counts per product
# so each product ends up with (sum of amounts, number of sales).
sales_count = (
    sales_rdd
    .map(lambda pair: (pair[0], (pair[1], 1)))
    .reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
)
# Divide each product's summed amount by its number of sales.
avg_sales = sales_count.map(lambda pair: (pair[0], pair[1][0] / pair[1][1]))
print("\nAverage sales for each product:")
print(avg_sales.collect())

First few elements of the RDD:
[('ProductA', 100), ('ProductB', 150), ('ProductA', 200)]

Grouped data structure:
[('ProductA', [100, 200]), ('ProductB', [150, 250]), ('ProductC', [300, 100])]

Total sales for each product:
[('ProductA', 300), ('ProductB', 400), ('ProductC', 400)]

Sorted list of products by sales:
[('ProductB', 400), ('ProductC', 400), ('ProductA', 300)]

Products with sales greater than 200:
[('ProductA', 300), ('ProductB', 400), ('ProductC', 400)]

Combined sales data:
[('ProductA', 350), ('ProductC', 550), ('ProductB', 400)]

Number of distinct products: 3

Product with maximum sales: ProductC, Amount: 550

Average sales for each product:
[('ProductA', 150.0), ('ProductB', 200.0), ('ProductC', 200.0)]
