In [3]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.3/317.3 MB 4.0 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=3b16cf462c296044f687abf3df1d7a46e3f21872a98d8ed83f3c881330cdc0ec
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [None]:
from pyspark.sql import Row, SparkSession

# Obtain the session here rather than relying on a `spark` variable created by a
# later cell: on a fresh kernel (Restart & Run All) this cell runs first and would
# otherwise fail with NameError. getOrCreate() reuses a session if one already exists.
spark = SparkSession.builder.getOrCreate()

# Example data: one Row per transaction.
data = [
    Row(TransactionID=1, CustomerID=101, ProductID=501, Quantity=2, Price=150.0),
    Row(TransactionID=2, CustomerID=102, ProductID=502, Quantity=1, Price=250.0),
    Row(TransactionID=3, CustomerID=103, ProductID=503, Quantity=4, Price=150.0),
    Row(TransactionID=4, CustomerID=104, ProductID=504, Quantity=3, Price=50.0),
    Row(TransactionID=5, CustomerID=105, ProductID=505, Quantity=1, Price=200.0),
]

# Create DataFrame; column names and types are inferred from the Row fields.
df = spark.createDataFrame(data)
print("DF created")

DF created


In [None]:
# Preview only the first three rows of the transactions DataFrame.
df.show(n=3)

# Column names with their inferred types and nullability.
print("Schema")
df.printSchema()

# Per-column summary statistics, computed once and then displayed.
print("Summary statistics")
summary_stats = df.describe()
summary_stats.show()



+-------------+----------+---------+--------+-----+
|TransactionID|CustomerID|ProductID|Quantity|Price|
+-------------+----------+---------+--------+-----+
|            1|       101|      501|       2|150.0|
|            2|       102|      502|       1|250.0|
|            3|       103|      503|       4|150.0|
+-------------+----------+---------+--------+-----+
only showing top 3 rows

Schema
root
 |-- TransactionID: long (nullable = true)
 |-- CustomerID: long (nullable = true)
 |-- ProductID: long (nullable = true)
 |-- Quantity: long (nullable = true)
 |-- Price: double (nullable = true)

Summary statistics
+-------+------------------+------------------+------------------+------------------+-----------------+
|summary|     TransactionID|        CustomerID|         ProductID|          Quantity|            Price|
+-------+------------------+------------------+------------------+------------------+-----------------+
|  count|                 5|                 5|                 5|    

In [None]:
# Projection: keep only the customer and quantity columns.
df.select(df["CustomerID"], df["Quantity"]).show()

# Row filter: transactions with more than two units.
df.where(df["Quantity"] > 2).show()

# Aggregation: total quantity per customer (result column is "sum(Quantity)").
df.groupBy("CustomerID").sum("Quantity").show()

# Renaming: shows a copy with Quantity renamed; df itself keeps its columns.
df.withColumnRenamed("Quantity", "PurchaseQuantity").show()

# Chaining: project three columns first, then filter the projected DataFrame.
df_Chain = df.select("TransactionID", "CustomerID", "Quantity")
df_Chain.where(df_Chain["Quantity"] > 2).show()

+----------+--------+
|CustomerID|Quantity|
+----------+--------+
|       101|       2|
|       102|       1|
|       103|       4|
|       104|       3|
|       105|       1|
+----------+--------+

+-------------+----------+---------+--------+-----+
|TransactionID|CustomerID|ProductID|Quantity|Price|
+-------------+----------+---------+--------+-----+
|            3|       103|      503|       4|150.0|
|            4|       104|      504|       3| 50.0|
+-------------+----------+---------+--------+-----+

+----------+-------------+
|CustomerID|sum(Quantity)|
+----------+-------------+
|       101|            2|
|       102|            1|
|       103|            4|
|       104|            3|
|       105|            1|
+----------+-------------+

+-------------+----------+---------+----------------+-----+
|TransactionID|CustomerID|ProductID|PurchaseQuantity|Price|
+-------------+----------+---------+----------------+-----+
|            1|       101|      501|               2|150.0|
|   

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start the Spark session for this analysis, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Product Sales Analysis")
    .getOrCreate()
)

# Column names for the two DataFrames (types are inferred from the tuples below).
product_columns = ["ProductID", "ProductName", "Category", "Price"]
sales_columns = ["SaleID", "ProductID", "Quantity"]

# Product catalogue: (ProductID, ProductName, Category, Price)
products = [
    (1, "Laptop", "Electronics", 50000),
    (2, "Smartphone", "Electronics", 30000),
    (3, "Table", "Furniture", 15000),
    (4, "Chair", "Furniture", 5000),
    (5, "Headphones", "Electronics", 2000),
]

# Sales transactions: (SaleID, ProductID, Quantity)
sales = [
    (1, 1, 2),
    (2, 2, 1),
    (3, 3, 3),
    (4, 1, 1),
    (5, 4, 5),
    (6, 2, 2),
    (7, 5, 10),
    (8, 3, 1),
]

# Build both DataFrames from the in-memory rows.
product_df = spark.createDataFrame(products, product_columns)
sales_df = spark.createDataFrame(sales, sales_columns)

# Display the contents of each DataFrame under its heading.
print("Products DataFrame:")
product_df.show()

print("Sales DataFrame:")
sales_df.show()

Products DataFrame:
+---------+-----------+-----------+-----+
|ProductID|ProductName|   Category|Price|
+---------+-----------+-----------+-----+
|        1|     Laptop|Electronics|50000|
|        2| Smartphone|Electronics|30000|
|        3|      Table|  Furniture|15000|
|        4|      Chair|  Furniture| 5000|
|        5| Headphones|Electronics| 2000|
+---------+-----------+-----------+-----+

Sales DataFrame:
+------+---------+--------+
|SaleID|ProductID|Quantity|
+------+---------+--------+
|     1|        1|       2|
|     2|        2|       1|
|     3|        3|       3|
|     4|        1|       1|
|     5|        4|       5|
|     6|        2|       2|
|     7|        5|      10|
|     8|        3|       1|
+------+---------+--------+



In [7]:
# 1. Join product_df and sales_df on ProductID (inner join by default) so that
#    every sale row carries the details of the product that was sold.
combined_df = product_df.join(sales_df, on="ProductID")
print("Joined dataframes: ")
combined_df.show()

# 2. Value of each individual sale: unit price multiplied by the quantity sold.
sales_value_df = combined_df.withColumn("TotalSalesValue", col("Price") * col("Quantity"))
print("Total sales value: ")
sales_value_df.show()

# 3. Group by Category and total the sales value for each product category.
category_sales_value_df = sales_value_df.groupBy("Category").agg({"TotalSalesValue": "sum"})
print("Category wise sales value: ")
category_sales_value_df.show()

# A product can appear in several sale rows, so tasks 4 and 5 must work on
# per-product totals; ranking raw sale rows would compare single transactions
# and list the same product more than once.
product_sales_value_df = (
    sales_value_df
    .groupBy("ProductID", "ProductName", "Category")
    .agg({"TotalSalesValue": "sum"})
    .withColumnRenamed("sum(TotalSalesValue)", "TotalSalesValue")
)

# 4. Find the product that generated the highest total sales value.
highest_sales_product = product_sales_value_df.orderBy(col("TotalSalesValue").desc()).limit(1)
print("Product with highest sales value: ")
highest_sales_product.show()


# 5. Sort the products by total sales value in descending order (one row per product).
sorted_products_df = product_sales_value_df.orderBy(col("TotalSalesValue").desc())
print("Sorted products by total sales value: ")
sorted_products_df.show()

# 6. Count the number of sales (transactions) for each product. This counts sale
#    rows; summing Quantity would give units sold, which is a different measure.
sales_count_df = (
    sales_value_df
    .groupBy("ProductID", "ProductName")
    .count()
    .withColumnRenamed("count", "NumberOfSales")
)
print("Number of sales for each product: ")
sales_count_df.show()


Joined dataframes: 
+---------+-----------+-----------+-----+------+--------+
|ProductID|ProductName|   Category|Price|SaleID|Quantity|
+---------+-----------+-----------+-----+------+--------+
|        1|     Laptop|Electronics|50000|     1|       2|
|        1|     Laptop|Electronics|50000|     4|       1|
|        2| Smartphone|Electronics|30000|     2|       1|
|        2| Smartphone|Electronics|30000|     6|       2|
|        3|      Table|  Furniture|15000|     3|       3|
|        3|      Table|  Furniture|15000|     8|       1|
|        4|      Chair|  Furniture| 5000|     5|       5|
|        5| Headphones|Electronics| 2000|     7|      10|
+---------+-----------+-----------+-----+------+--------+

Total sales value: 
+---------+-----------+-----------+-----+------+--------+---------------+
|ProductID|ProductName|   Category|Price|SaleID|Quantity|TotalSalesValue|
+---------+-----------+-----------+-----+------+--------+---------------+
|        1|     Laptop|Electronics|50000|