In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=d9b4c2ef9ed04fccf0a14f2215453c8c9859169596b68738534e09a9ea61709f
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [2]:
import pandas as pd
from datetime import datetime

# Sample sales data
data = {
    "TransactionID": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    "CustomerID": [101, 102, 103, 101, 104, 102, 103, 104, 101, 105],
    "ProductID": [501, 502, 501, 503, 504, 502, 503, 504, 501, 505],
    "Quantity": [2, 1, 4, 3, 1, 2, 5, 1, 2, 1],
    "Price": [150.0, 250.0, 150.0, 300.0, 450.0, 250.0, 300.0, 450.0, 150.0, 550.0],
    "Date": [
        datetime(2024, 9, 1),
        datetime(2024, 9, 1),
        datetime(2024, 9, 2),
        datetime(2024, 9, 2),
        datetime(2024, 9, 3),
        datetime(2024, 9, 3),
        datetime(2024, 9, 4),
        datetime(2024, 9, 4),
        datetime(2024, 9, 5),
        datetime(2024, 9, 5)
    ]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
df.to_csv('sales_data.csv', index=False)

print("Sample sales dataset has been created and saved as 'sales_data.csv'.")


Sample sales dataset has been created and saved as 'sales_data.csv'.


In [11]:
# Part-2
# Step-2: Load the dataset into the pyspark
# 1. You need to create a Spark session to start working with PySpark
from pyspark.sql import SparkSession
spark = SparkSession.builder\
.appName("SalesAnalysis")\
.getOrCreate()

# 2. Load the sales_data.csv file
sales_df = spark.read.csv("sales_data.csv", header=True, inferSchema=True)
sales_df.show()


+-------------+----------+---------+--------+-----+----------+
|TransactionID|CustomerID|ProductID|Quantity|Price|      Date|
+-------------+----------+---------+--------+-----+----------+
|            1|       101|      501|       2|150.0|2024-09-01|
|            2|       102|      502|       1|250.0|2024-09-01|
|            3|       103|      501|       4|150.0|2024-09-02|
|            4|       101|      503|       3|300.0|2024-09-02|
|            5|       104|      504|       1|450.0|2024-09-03|
|            6|       102|      502|       2|250.0|2024-09-03|
|            7|       103|      503|       5|300.0|2024-09-04|
|            8|       104|      504|       1|450.0|2024-09-04|
|            9|       101|      501|       2|150.0|2024-09-05|
|           10|       105|      505|       1|550.0|2024-09-05|
+-------------+----------+---------+--------+-----+----------+



In [5]:
# Step-3: Explore the data
# 1. Print the schema
sales_df.printSchema()

# 2. Display the first few rows
sales_df.show(5)

# 3.Get Summary statistics
sales_df.describe(['Quantity','Price']).show()

root
 |-- TransactionID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- ProductID: integer (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- Date: date (nullable = true)

+-------------+----------+---------+--------+-----+----------+
|TransactionID|CustomerID|ProductID|Quantity|Price|      Date|
+-------------+----------+---------+--------+-----+----------+
|            1|       101|      501|       2|150.0|2024-09-01|
|            2|       102|      502|       1|250.0|2024-09-01|
|            3|       103|      501|       4|150.0|2024-09-02|
|            4|       101|      503|       3|300.0|2024-09-02|
|            5|       104|      504|       1|450.0|2024-09-03|
+-------------+----------+---------+--------+-----+----------+
only showing top 5 rows

+-------+-----------------+-----------------+
|summary|         Quantity|            Price|
+-------+-----------------+-----------------+
|  count|            

In [9]:
# Step-4: Perform data transformation and Analysis
# 1. Calculate the total sales value for each transaction
from pyspark.sql.functions import col, sum
sales_df = sales_df.withColumn("TotalSales", col("Quantity") * col("Price"))
sales_df.show(5)

# 2.Group by productID and Calculate Total sales per product
product_sales_df = sales_df.groupBy("ProductID").sum("TotalSales").alias("TotalSales")
product_sales_df.show()

# 3.Identify the top-selling product
top_product = product_sales_df.orderBy(col("sum(TotalSales)").desc()).limit(1)
top_product.show()

# 4.Group by "Date" and sum "TotalSales"
daily_sales_df = sales_df.groupBy("Date").sum("TotalSales")
daily_sales_df.show()

# 5. Filter the transactions where TotalSales > 500
high_sales_df = sales_df.filter(col("TotalSales") > 500)
high_sales_df.show()

+-------------+----------+---------+--------+-----+----------+----------+
|TransactionID|CustomerID|ProductID|Quantity|Price|      Date|TotalSales|
+-------------+----------+---------+--------+-----+----------+----------+
|            1|       101|      501|       2|150.0|2024-09-01|     300.0|
|            2|       102|      502|       1|250.0|2024-09-01|     250.0|
|            3|       103|      501|       4|150.0|2024-09-02|     600.0|
|            4|       101|      503|       3|300.0|2024-09-02|     900.0|
|            5|       104|      504|       1|450.0|2024-09-03|     450.0|
+-------------+----------+---------+--------+-----+----------+----------+
only showing top 5 rows

+---------+---------------+
|ProductID|sum(TotalSales)|
+---------+---------------+
|      501|         1200.0|
|      504|          900.0|
|      502|          750.0|
|      505|          550.0|
|      503|         2400.0|
+---------+---------------+

+---------+---------------+
|ProductID|sum(TotalSales)|


In [10]:
# Additional Exercise
# 1. Count how many times each customer has made a purchase and display the customers who have made more than one purchase
customer_purchase_count = sales_df.groupBy("CustomerID").count().filter(col("count") > 1)
customer_purchase_count.show()

# 2. Calculate the average price per unit for each product and display the results.
avg_price_per_unit = sales_df.groupBy("ProductID").avg("Price").alias("AvgPricePerUnit")
avg_price_per_unit.show()

+----------+-----+
|CustomerID|count|
+----------+-----+
|       101|    3|
|       103|    2|
|       102|    2|
|       104|    2|
+----------+-----+

+---------+----------+
|ProductID|avg(Price)|
+---------+----------+
|      501|     150.0|
|      504|     450.0|
|      502|     250.0|
|      505|     550.0|
|      503|     300.0|
+---------+----------+

