**DISTRIBUTED DATA PROCESSING**

**USE APACHE SPARK TO ANALYZE A LARGE DATASET, IMPLEMENTING OPERATIONS LIKE FILTERING, GROUPING, AND AGGREGATIONS.**

In [3]:
!pip install pyspark



In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, avg, count

spark = SparkSession.builder.appName("LargeDatasetAnalysis").getOrCreate()

In [5]:
data_file = "/content/GOOG.csv" # Update with your file path
try:
  df = spark.read.csv(data_file, header=True, inferSchema=True)
except Exception as e:
  print(f"Error loading data: {e}")
  print(f"Ensure file path is correct and dataset exists at: {data_file}")
  spark.stop()
  exit()

In [6]:
df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Adj Close: double (nullable = true)
 |-- Volume: integer (nullable = true)



In [10]:
# 1. Filtering
# Filter rows where a specific column value is greater than a certain threshold
filtered_df = df.filter(col("Adj Close") > 1400)
print("Filtered Data:")
filtered_df.show()

Filtered Data:
+----------+-----------+-----------+-----------+-----------+-----------+-------+
|      Date|       Open|       High|        Low|      Close|  Adj Close| Volume|
+----------+-----------+-----------+-----------+-----------+-----------+-------+
|20-05-2020|1389.579956|1410.420044|    1387.25|1406.719971|1406.719971|1655400|
|21-05-2020|     1408.0| 1415.48999|1393.449951|1402.800049|1402.800049|1385000|
|22-05-2020|1396.709961| 1412.76001|1391.829956|1410.420044|1410.420044|1309400|
|26-05-2020| 1437.27002|     1441.0|1412.130005| 1417.02002| 1417.02002|2060600|
|27-05-2020|    1417.25| 1421.73999|1391.290039|1417.839966|1417.839966|1685800|
|20-05-2020|1396.859985|1440.839966|     1396.0| 1416.72998| 1416.72998|1692200|
|21-05-2020|1416.939941|1432.569946|1413.349976|1428.920044|1428.920044|1838100|
|22-05-2020|1418.390015|1437.959961|     1418.0|1431.819946|1431.819946|1217100|
|02-06-2020|1430.550049|1439.609985|1418.829956|1439.219971|1439.219971|1278100|
|03-06-2020|1

In [11]:
# 2. Grouping
# Group the data by a specific column and calculate the sum of another column
grouped_df = df.groupBy("High").agg(sum("Open")) # Replace with your columns
# Alternative aggregation functions
grouped_df = df.groupBy("High").agg(avg("Open"), count("Open"))
print("Grouped Data:")
grouped_df.show()

Grouped Data:
+-----------+-----------+-----------+
|       High|  avg(Open)|count(Open)|
+-----------+-----------+-----------+
|    1537.25|1510.339966|          1|
|1562.469971|1526.180054|          1|
|1482.410034|1469.300049|          1|
|1410.420044|1389.579956|          1|
|     1510.0|1492.439941|          1|
|1474.259033|1459.540039|          1|
| 1421.73999|    1417.25|          1|
|1440.839966|1396.859985|          1|
| 1447.98999|1422.339966|          1|
|1447.800049|     1444.0|          1|
|1522.719971|1506.449951|          1|
|1438.959961|1430.400024|          1|
|1505.880005|1494.319946|          1|
| 1526.47998|1525.180054|          1|
|     1437.0| 1428.48999|          1|
|1433.449951|1431.390015|          1|
|     1443.0|1411.099976|          1|
|     1570.0|     1560.5|          1|
|1543.829956|1506.150024|          1|
|1570.290039| 1515.26001|          1|
+-----------+-----------+-----------+
only showing top 20 rows



In [12]:
# 3. Aggregations
# Calculate the total sum, average, and count of a specific column
total_sum = df.select(sum("Close")).collect()[0][0] # Replace with your column
average = df.select(avg("Close")).collect()[0][0]
total_count = df.count()
print(f"Total Sum: {total_sum}")
print(f"Average: {average}")
print(f"Total Count: {total_count}")

Total Sum: 94223.05945000003
Average: 1472.2353039062505
Total Count: 64


In [13]:
spark.stop()