In [1]:
from pyspark.sql import SparkSession
# Create the SparkSession for this notebook, or reuse one that already exists
# (getOrCreate returns the active session when there is one).
spark = (
    SparkSession.builder
    .appName("Basics")
    .getOrCreate()
)

In [3]:
# Sample employee records in CSV form; the first line is the column header.
data = """id,name,city,age,salary
1,Arjun,Hyderabad,25,45000
2,Meera,Chennai,32,52000
3,Rajesh,Bangalore,29,61000
4,Priya,Delhi,22,38000
5,Sanjay,Mumbai,35,72000
6,Kavya,Hyderabad,28,48000
7,Imran,Delhi,31,53000
8,Divya,Chennai,27,45000
9,Anil,Bangalore,40,85000
10,Ritu,Mumbai,23,39000
"""

# Write the sample to disk so it can be loaded by path. The encoding is pinned
# to UTF-8 instead of the platform's locale default: the file is read back with
# Spark's textFile, which decodes lines as UTF-8, so writer and reader agree
# even if non-ASCII names are added to the sample later.
with open("employees.csv", "w", encoding="utf-8") as csv_file:
    csv_file.write(data)

In [4]:
# Load the CSV as an RDD of raw text lines (one string per line of the file).
rdd = spark.sparkContext.textFile("employees.csv")

# Peek at the first five lines; the header row is still included here.
rdd.take(5)

['id,name,city,age,salary',
 '1,Arjun,Hyderabad,25,45000',
 '2,Meera,Chennai,32,52000',
 '3,Rajesh,Bangalore,29,61000',
 '4,Priya,Delhi,22,38000']

In [5]:
# The first line of the file is the column header.
header = rdd.first()

# Keep only the lines that differ from the header. Note this drops *every*
# line equal to the header text, not just the first one.
data_rdd = rdd.filter(lambda line: line != header)

# The sample is tiny, so collecting all rows to the driver is fine here.
data_rdd.collect()

['1,Arjun,Hyderabad,25,45000',
 '2,Meera,Chennai,32,52000',
 '3,Rajesh,Bangalore,29,61000',
 '4,Priya,Delhi,22,38000',
 '5,Sanjay,Mumbai,35,72000',
 '6,Kavya,Hyderabad,28,48000',
 '7,Imran,Delhi,31,53000',
 '8,Divya,Chennai,27,45000',
 '9,Anil,Bangalore,40,85000',
 '10,Ritu,Mumbai,23,39000']

In [6]:
# Turn each CSV line into a list of string fields. This is a plain comma split
# (no quoting/escaping support), which is sufficient for the sample data.
split_rdd = data_rdd.map(lambda line: line.split(","))

# Show the first three parsed rows.
split_rdd.take(3)

[['1', 'Arjun', 'Hyderabad', '25', '45000'],
 ['2', 'Meera', 'Chennai', '32', '52000'],
 ['3', 'Rajesh', 'Bangalore', '29', '61000']]

In [7]:
# Project each row to a (city, salary) pair. Per the header
# "id,name,city,age,salary", index 2 is the city and index 4 is the salary;
# the salary is converted from text to int so it can be summed.
city_salary_rdd = split_rdd.map(
    lambda fields: (fields[2], int(fields[4]))
)

city_salary_rdd.collect()

[('Hyderabad', 45000),
 ('Chennai', 52000),
 ('Bangalore', 61000),
 ('Delhi', 38000),
 ('Mumbai', 72000),
 ('Hyderabad', 48000),
 ('Delhi', 53000),
 ('Chennai', 45000),
 ('Bangalore', 85000),
 ('Mumbai', 39000)]

In [8]:
# Sum the salaries for each city: reduceByKey merges the values that share
# a key (the city) using the given two-argument function.
total_salary_per_city = city_salary_rdd.reduceByKey(
    lambda running_total, salary: running_total + salary
)

total_salary_per_city.collect()

[('Hyderabad', 93000),
 ('Delhi', 91000),
 ('Mumbai', 111000),
 ('Chennai', 97000),
 ('Bangalore', 146000)]

In [9]:
# Find the (city, total) pair with the largest total by pairwise reduction.
# The right-hand pair wins unless the left-hand total is strictly greater,
# so when two totals are equal the right-hand operand is kept.
highest_city = total_salary_per_city.reduce(
    lambda left, right: right if right[1] >= left[1] else left
)

highest_city

('Bangalore', 146000)