In [1]:
!pip install pyspark




In [20]:
from pyspark.sql import SparkSession


In [21]:
spark = SparkSession.builder.appName("CodtechBigDataAnalysis").getOrCreate()


In [25]:
data_path = "/content/store_sales (1).csv"  # Adjust if different path after upload
df = spark.read.csv(data_path, header=True, inferSchema=True)
df.show(5)


+----------+-----+------+-----+-------+
|      date|store| sales|promo|holiday|
+----------+-----+------+-----+-------+
|2022-01-01|    1|184.78|    0|      0|
|2022-01-02|    1|192.62|    0|      0|
|2022-01-03|    1|212.68|    0|      0|
|2022-01-04|    1|249.58|    1|      0|
|2022-01-05|    1| 223.5|    0|      0|
+----------+-----+------+-----+-------+
only showing top 5 rows



In [26]:
# Show schema
df.printSchema()

# Count total rows
total_rows = df.count()
print("✅ Total Rows:", total_rows)


root
 |-- date: date (nullable = true)
 |-- store: integer (nullable = true)
 |-- sales: double (nullable = true)
 |-- promo: integer (nullable = true)
 |-- holiday: integer (nullable = true)

✅ Total Rows: 7300


In [27]:
# Describe numerical columns
df.describe().show()

# Group by store and calculate total sales per store
df.groupBy("store").sum("sales").show()

# Calculate average sales when promo is active vs not active
df.groupBy("promo").avg("sales").show()

# Calculate total sales on holiday vs non-holiday
df.groupBy("holiday").sum("sales").show()


+-------+------------------+-----------------+-------------------+-------------------+
|summary|             store|            sales|              promo|            holiday|
+-------+------------------+-----------------+-------------------+-------------------+
|  count|              7300|             7300|               7300|               7300|
|   mean|               5.5|228.4345712328766| 0.2021917808219178|0.10410958904109589|
| stddev|2.8724780750809566|26.62563608103706|0.40166200332361596|0.30542357649795304|
|    min|                 1|           160.71|                  0|                  0|
|    max|                10|           340.73|                  1|                  1|
+-------+------------------+-----------------+-------------------+-------------------+

+-----+------------------+
|store|        sum(sales)|
+-----+------------------+
|    1|169912.35999999993|
|    6|172590.87999999992|
|    3|159553.74999999997|
|    5|         161479.82|
|    9|178600.68999999983|


In [28]:
spark.stop()


### 🔶 INSIGHTS

✅ Total records analysed: [insert total rows output]
➔ Indicates data coverage for this sales analysis task.

✅ Total sales per store:
➔ Helps identify top-performing stores for prioritising inventory and marketing spend.

✅ Average sales: Promo vs Non-Promo days:
➔ Reveals promotion effectiveness. Higher average sales during promo indicates successful promotional campaigns.

✅ Total sales on holidays vs non-holidays:
➔ Shows consumer buying behaviour on holidays. If sales increase on holidays, strategic holiday discounts can be planned.

✅ Overall Interpretation:
➔ The dataset demonstrates how PySpark efficiently processes thousands of records, enabling data-driven decisions at scale in retail business analysis.


