In [6]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
# Obtain the notebook's Spark session (getOrCreate hands back an existing
# session when there is one); the application is registered as "Local_app".
spark = (
    SparkSession.builder
    .appName("Local_app")
    .getOrCreate()
)

In [7]:
# Explicit layout of the employee table. Every column is declared nullable,
# and "Dept" is stored as an integer code rather than a department name.
# NOTE(review): Spark normally reads "." in a column reference as nested-field
# access, so "S.no" probably needs backtick quoting (`S.no`) if it is ever
# selected or filtered on -- confirm before using it in expressions.
schema = StructType(
    [
        StructField("S.no", IntegerType(), nullable=True),
        StructField("Name", StringType(), nullable=True),
        StructField("Salary", IntegerType(), nullable=True),
        StructField("Dept", IntegerType(), nullable=True),
    ]
)

# Sample employee records: one dict per row, keyed by the column names used
# in the schema ("S.no", "Name", "Salary", "Dept").
data = [
    {"S.no": serial_no, "Name": name, "Salary": salary, "Dept": dept}
    for serial_no, name, salary, dept in (
        (1, "Tharun", 50000, 5),
        (2, "Arun", 30000, 5),
        (3, "Varun", 40000, 4),
        (4, "Kiran", 20000, 4),
    )
]

# Build the employee DataFrame from the row dicts, applying the explicit
# schema instead of letting Spark infer the column types.
df = spark.createDataFrame(data=data, schema=schema)

# Print a heading, then render the DataFrame as a text table.
print("DataFrame:")
df.show()

DataFrame:
+----+------+------+----+
|S.no|  Name|Salary|Dept|
+----+------+------+----+
|   1|Tharun| 50000|   5|
|   2|  Arun| 30000|   5|
|   3| Varun| 40000|   4|
|   4| Kiran| 20000|   4|
+----+------+------+----+



In [8]:
# Average salary per department; the dict form of agg() maps a column name
# to the aggregate applied to it, yielding an "avg(Salary)" column.
print("Group by Dept and compute average salary:")
(
    df.groupBy("Dept")
    .agg({"Salary": "avg"})
    .show()
)

Group by Dept and compute average salary:
+----+-----------+
|Dept|avg(Salary)|
+----+-----------+
|   5|    40000.0|
|   4|    30000.0|
+----+-----------+



In [9]:
# Keep only employees earning strictly more than 30000 (a salary of exactly
# 30000 is excluded). where() is an alias of filter(), and df.Salary refers
# to the same column as df["Salary"].
print("Filtering rows where salary is greater than 30000:")
(
    df.where(df.Salary > 30000)
    .show()
)

Filtering rows where salary is greater than 30000:
+----+------+------+----+
|S.no|  Name|Salary|Dept|
+----+------+------+----+
|   1|Tharun| 50000|   5|
|   3| Varun| 40000|   4|
+----+------+------+----+



In [13]:
# Second sample set: plain positional tuples (three values per row) with no
# column names attached.
data_2 = [
    (1, "Tharun", 500000),
    (2, "Teja", 600000),
    (3, "Anil", 700000),
    (4, "Akhil", 400000),
]
# No schema is supplied here, so Spark infers the column types and assigns
# positional column names (_1, _2, _3), as the printed schema shows.
df_2 = spark.createDataFrame(data=data_2)

# Display the rows first, then the inferred schema.
df_2.show()
df_2.printSchema()

+---+------+------+
| _1|    _2|    _3|
+---+------+------+
|  1|Tharun|500000|
|  2|  Teja|600000|
|  3|  Anil|700000|
|  4| Akhil|400000|
+---+------+------+

root
 |-- _1: long (nullable = true)
 |-- _2: string (nullable = true)
 |-- _3: long (nullable = true)

