In [20]:
from pyspark.sql import SparkSession

# Obtain the Spark session used by every cell below (getOrCreate reuses an
# active session if there is one, otherwise it starts a new one).
session_builder = SparkSession.builder.appName('15-12-2025')
spark = session_builder.getOrCreate()

In [21]:
# Sample employee records as CSV text: a header row followed by five data rows.
csv_lines = [
    "id,name,city,age,salary",
    "1,Arjun,Hyderabad,25,45000",
    "2,Meera,Chennai,32,52000",
    "3,Rajesh,Bangalore,29,61000",
    "4,Priya,Delhi,22,38000",
    "5,Sanjay,Mumbai,35,72000",
]
# Every line, including the last one, is terminated by a newline.
data = "\n".join(csv_lines) + "\n"

# Write the text to disk so it can be loaded as a file.
with open("employees.csv", "w") as csv_file:
    csv_file.write(data)

In [22]:
# Reading the CSV file.
# "header" takes column names from the first line; "inferSchema" asks Spark to
# derive each column's type from the values instead of reading all as strings.
csv_reader = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
)
df = csv_reader.csv("employees.csv")

# Preview the rows, then the inferred schema.
df.show()
df.printSchema()

+---+------+---------+---+------+
| id|  name|     city|age|salary|
+---+------+---------+---+------+
|  1| Arjun|Hyderabad| 25| 45000|
|  2| Meera|  Chennai| 32| 52000|
|  3|Rajesh|Bangalore| 29| 61000|
|  4| Priya|    Delhi| 22| 38000|
|  5|Sanjay|   Mumbai| 35| 72000|
+---+------+---------+---+------+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [23]:
# In-memory employee records, one dict per employee (same five people as the CSV sample).
json_data = [
    {"id": 1, "name": "Arjun", "city": "Hyderabad", "age": 25, "salary": 45000},
    {"id": 2, "name": "Meera", "city": "Chennai", "age": 32, "salary": 52000},
    {"id": 3, "name": "Rajesh", "city": "Bangalore", "age": 29, "salary": 61000},
    {"id": 4, "name": "Priya", "city": "Delhi", "age": 22, "salary": 38000},
    {"id": 5, "name": "Sanjay", "city": "Mumbai", "age": 35, "salary": 72000},
]


In [24]:
# Writing and reading a JSON file.
# Turn the in-memory records into a DataFrame and write that frame out as JSON.
# Spark writes a directory of part files at this path, not a single file.
json_path = "employees.json"
spark.createDataFrame(json_data).write.mode("overwrite").json(json_path)

# Read the JSON back so the frame shown below reflects what is on disk.
# Row order is not guaranteed when several part files are read back, so sort
# by id to keep the displayed output stable.
df_json = spark.read.json(json_path).orderBy("id")

df_json.show()
df_json.printSchema()

+---+---------+---+------+------+
|age|     city| id|  name|salary|
+---+---------+---+------+------+
| 25|Hyderabad|  1| Arjun| 45000|
| 32|  Chennai|  2| Meera| 52000|
| 29|Bangalore|  3|Rajesh| 61000|
| 22|    Delhi|  4| Priya| 38000|
| 35|   Mumbai|  5|Sanjay| 72000|
+---+---------+---+------+------+

root
 |-- age: long (nullable = true)
 |-- city: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)



In [25]:
# Parquet round trip: persist `df` in Parquet format, then load it back.
parquet_path = "employees.parquet"
df.write.mode("overwrite").parquet(parquet_path)

df_parquet = spark.read.parquet(parquet_path)

# Show the reloaded rows and the schema read back from the Parquet files.
df_parquet.show()
df_parquet.printSchema()

+---+------+---------+---+------+
| id|  name|     city|age|salary|
+---+------+---------+---+------+
|  1| Arjun|Hyderabad| 25| 45000|
|  2| Meera|  Chennai| 32| 52000|
|  3|Rajesh|Bangalore| 29| 61000|
|  4| Priya|    Delhi| 22| 38000|
|  5|Sanjay|   Mumbai| 35| 72000|
+---+------+---------+---+------+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)

