<a href="https://colab.research.google.com/github/ShrutiThakar/BIG-DATA-ANALYSIS/blob/main/TASK_1(2).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install PySpark
!pip install -q pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count
import pandas as pd, numpy as np, shutil
from IPython.display import FileLink

# Spark session (getOrCreate reuses an already-running session if one exists)
spark = SparkSession.builder.appName("BigDataProcessing").getOrCreate()

# Settings shared by the steps below, named once instead of repeated inline.
NUM_ROWS = 1_000_000
CSV_PATH = "synthetic_data.csv"
OUTPUT_DIR = "avg_salary_output"

# Generate and save synthetic CSV.
# No seed is set in this cell, so the generated values differ between runs.
# np.random.randint excludes its upper bound: ages fall in 18-69 and
# salaries in 30000-149999.
pd.DataFrame({
    "id": np.arange(1, NUM_ROWS + 1),
    "age": np.random.randint(18, 70, NUM_ROWS),
    "salary": np.random.randint(30000, 150000, NUM_ROWS),
    "department": np.random.choice(
        ["HR", "Engineering", "Sales", "Marketing"], NUM_ROWS
    ),
}).to_csv(CSV_PATH, index=False)

# Load data (inferSchema makes Spark scan the file to pick column types)
data = spark.read.csv(CSV_PATH, header=True, inferSchema=True)

# Print schema
print("Schema:")
data.printSchema()

# Processing
high_earners = data.filter(col("salary") > 100000)
avg_salary = data.groupBy("department").agg(avg("salary").alias("avg_salary"))
# Bucket ages into decades: 18 -> 10, 25 -> 20, 69 -> 60.
age_group_count = (
    data.withColumn("age_group", (col("age") / 10).cast("int") * 10)
    .groupBy("age_group")
    .agg(count("*").alias("count"))
)

# Show insights
print("Top 5 High Earners:")
# Sort by salary so the rows shown really are the five highest earners;
# an unsorted show(5) returns whichever matching rows come first.
high_earners.orderBy(col("salary").desc()).show(5)

print("Avg Salary by Dept:")
avg_salary.show()

print("Employee Count by Age Group:")
age_group_count.orderBy("age_group").show()

# Save and zip output.
# coalesce(1) makes Spark write a single part file inside OUTPUT_DIR.
avg_salary.coalesce(1).write.option("header", "true").csv(OUTPUT_DIR, mode="overwrite")
shutil.make_archive(OUTPUT_DIR, "zip", OUTPUT_DIR)

# Download link (must stay the last expression so the notebook renders it)
FileLink(f"{OUTPUT_DIR}.zip")


Schema:
root
 |-- id: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)

Top 5 High Earners:
+---+---+------+-----------+
| id|age|salary| department|
+---+---+------+-----------+
|  1| 65|116867|      Sales|
|  2| 55|138445|      Sales|
|  4| 53|107430|Engineering|
|  7| 26|110474|      Sales|
|  8| 23|139694|      Sales|
+---+---+------+-----------+
only showing top 5 rows

Avg Salary by Dept:
+-----------+-----------------+
| department|       avg_salary|
+-----------+-----------------+
|      Sales|90120.84908089264|
|Engineering|89954.20254448235|
|         HR|90069.48091322597|
|  Marketing|90120.86017057637|
+-----------+-----------------+

Employee Count by Age Group:
+---------+------+
|age_group| count|
+---------+------+
|       10| 38364|
|       20|192810|
|       30|191908|
|       40|192229|
|       50|192641|
|       60|192048|
+---------+------+

