In [1]:
from pyspark.sql import SparkSession
spark=SparkSession.builder\
.appName("Read CSV Example")\
.getOrCreate()

In [2]:
data="""call_id,caller,receiver,city,call_type,duration_seconds,cost
C001,Amit,Rahul,Hyderabad,Local,180,2.5
C002,Neha,Arjun,Bangalore,STD,320,6.0
C003,Rahul,Pooja,Delhi,Local,60,1.0
C004,Pooja,Neha,Mumbai,ISD,900,25.0
C005,Arjun,Amit,Chennai,STD,400,7.5
C006,Sneha,Karan,Hyderabad,Local,240,3.0
C007,Karan,Sneha,Delhi,Local,120,2.0
C008,Riya,Vikas,Bangalore,STD,360,6.5
C009,Vikas,Riya,Mumbai,ISD,1100,30.0
C010,Anjali,Sanjay,Chennai,Local,90,1.5
C011,Farhan,Ayesha,Delhi,STD,420,7.0
C012,Ayesha,Farhan,Hyderabad,ISD,950,28.0
C013,Suresh,Divya,Bangalore,Local,150,2.0
C014,Divya,Suresh,Mumbai,STD,380,6.8
C015,Nikhil,Priya,Delhi,Local,200,2.8
C016,Priya,Nikhil,Chennai,STD,410,7.2
C017,Rohit,Kavya,Hyderabad,Local,170,2.3
C018,Kavya,Rohit,Bangalore,Local,140,2.1
C019,Manish,Tina,Mumbai,ISD,1000,27.0
C020,Tina,Manish,Delhi,STD,350,6.2
"""
with open("call_records.csv","w") as f:
  f.write(data)

# Tasks / Exercises

# Task 1

Read the CSV file using sparkContext.textFile and display the first 5 records.

In [3]:
rdd=spark.sparkContext.textFile("call_records.csv")
rdd.take(5)

['call_id,caller,receiver,city,call_type,duration_seconds,cost',
 'C001,Amit,Rahul,Hyderabad,Local,180,2.5',
 'C002,Neha,Arjun,Bangalore,STD,320,6.0',
 'C003,Rahul,Pooja,Delhi,Local,60,1.0',
 'C004,Pooja,Neha,Mumbai,ISD,900,25.0']

In [6]:
header=rdd.first()
data_rdd=rdd.filter(lambda row:row!=header)
data_rdd.take(5)

['C001,Amit,Rahul,Hyderabad,Local,180,2.5',
 'C002,Neha,Arjun,Bangalore,STD,320,6.0',
 'C003,Rahul,Pooja,Delhi,Local,60,1.0',
 'C004,Pooja,Neha,Mumbai,ISD,900,25.0',
 'C005,Arjun,Amit,Chennai,STD,400,7.5']

# Task 2

Remove the header row and create a clean RDD containing only data rows.

In [7]:
data_rdd.collect()

['C001,Amit,Rahul,Hyderabad,Local,180,2.5',
 'C002,Neha,Arjun,Bangalore,STD,320,6.0',
 'C003,Rahul,Pooja,Delhi,Local,60,1.0',
 'C004,Pooja,Neha,Mumbai,ISD,900,25.0',
 'C005,Arjun,Amit,Chennai,STD,400,7.5',
 'C006,Sneha,Karan,Hyderabad,Local,240,3.0',
 'C007,Karan,Sneha,Delhi,Local,120,2.0',
 'C008,Riya,Vikas,Bangalore,STD,360,6.5',
 'C009,Vikas,Riya,Mumbai,ISD,1100,30.0',
 'C010,Anjali,Sanjay,Chennai,Local,90,1.5',
 'C011,Farhan,Ayesha,Delhi,STD,420,7.0',
 'C012,Ayesha,Farhan,Hyderabad,ISD,950,28.0',
 'C013,Suresh,Divya,Bangalore,Local,150,2.0',
 'C014,Divya,Suresh,Mumbai,STD,380,6.8',
 'C015,Nikhil,Priya,Delhi,Local,200,2.8',
 'C016,Priya,Nikhil,Chennai,STD,410,7.2',
 'C017,Rohit,Kavya,Hyderabad,Local,170,2.3',
 'C018,Kavya,Rohit,Bangalore,Local,140,2.1',
 'C019,Manish,Tina,Mumbai,ISD,1000,27.0',
 'C020,Tina,Manish,Delhi,STD,350,6.2']

# Task 3

Split each row into individual fields using a delimiter.

In [8]:
split_rdd=data_rdd.map(lambda row:row.split(","))
split_rdd.collect()

[['C001', 'Amit', 'Rahul', 'Hyderabad', 'Local', '180', '2.5'],
 ['C002', 'Neha', 'Arjun', 'Bangalore', 'STD', '320', '6.0'],
 ['C003', 'Rahul', 'Pooja', 'Delhi', 'Local', '60', '1.0'],
 ['C004', 'Pooja', 'Neha', 'Mumbai', 'ISD', '900', '25.0'],
 ['C005', 'Arjun', 'Amit', 'Chennai', 'STD', '400', '7.5'],
 ['C006', 'Sneha', 'Karan', 'Hyderabad', 'Local', '240', '3.0'],
 ['C007', 'Karan', 'Sneha', 'Delhi', 'Local', '120', '2.0'],
 ['C008', 'Riya', 'Vikas', 'Bangalore', 'STD', '360', '6.5'],
 ['C009', 'Vikas', 'Riya', 'Mumbai', 'ISD', '1100', '30.0'],
 ['C010', 'Anjali', 'Sanjay', 'Chennai', 'Local', '90', '1.5'],
 ['C011', 'Farhan', 'Ayesha', 'Delhi', 'STD', '420', '7.0'],
 ['C012', 'Ayesha', 'Farhan', 'Hyderabad', 'ISD', '950', '28.0'],
 ['C013', 'Suresh', 'Divya', 'Bangalore', 'Local', '150', '2.0'],
 ['C014', 'Divya', 'Suresh', 'Mumbai', 'STD', '380', '6.8'],
 ['C015', 'Nikhil', 'Priya', 'Delhi', 'Local', '200', '2.8'],
 ['C016', 'Priya', 'Nikhil', 'Chennai', 'STD', '410', '7.2'],
 ['

# Task 4

Calculate the total call cost per city.

In [9]:
city_call=split_rdd.map(lambda row:(row[3],float(row[6])))
city_call.collect()

[('Hyderabad', 2.5),
 ('Bangalore', 6.0),
 ('Delhi', 1.0),
 ('Mumbai', 25.0),
 ('Chennai', 7.5),
 ('Hyderabad', 3.0),
 ('Delhi', 2.0),
 ('Bangalore', 6.5),
 ('Mumbai', 30.0),
 ('Chennai', 1.5),
 ('Delhi', 7.0),
 ('Hyderabad', 28.0),
 ('Bangalore', 2.0),
 ('Mumbai', 6.8),
 ('Delhi', 2.8),
 ('Chennai', 7.2),
 ('Hyderabad', 2.3),
 ('Bangalore', 2.1),
 ('Mumbai', 27.0),
 ('Delhi', 6.2)]

In [10]:
total_call=city_call.reduceByKey(lambda x,y:x+y)
total_call.collect()

[('Hyderabad', 35.8),
 ('Delhi', 19.0),
 ('Mumbai', 88.8),
 ('Bangalore', 16.6),
 ('Chennai', 16.2)]

# Task 5

Identify the city with the highest total call cost.

In [12]:
highest_total_call_cost=total_call.reduce(lambda x,y:x if x[1]>y[1] else y)
highest_total_call_cost

('Mumbai', 88.8)

# Task 6
Calculate the total call duration per call type (Local, STD, ISD).

In [14]:
total_call_duration=split_rdd.map(lambda row:(row[4],float(row[6])))
total_call_duration.collect()

[('Local', 2.5),
 ('STD', 6.0),
 ('Local', 1.0),
 ('ISD', 25.0),
 ('STD', 7.5),
 ('Local', 3.0),
 ('Local', 2.0),
 ('STD', 6.5),
 ('ISD', 30.0),
 ('Local', 1.5),
 ('STD', 7.0),
 ('ISD', 28.0),
 ('Local', 2.0),
 ('STD', 6.8),
 ('Local', 2.8),
 ('STD', 7.2),
 ('Local', 2.3),
 ('Local', 2.1),
 ('ISD', 27.0),
 ('STD', 6.2)]

In [15]:
total_call_per_duration_per_calltype=total_call_duration.reduceByKey(lambda x,y:x+y)
total_call_per_duration_per_calltype.collect()

[('Local', 19.2), ('STD', 47.2), ('ISD', 110.0)]

# Task 7

Count the number of calls per city.

In [16]:
number_of_calls_per_city=split_rdd.map(lambda row:(row[3],1))
number_of_calls_per_city.collect()

[('Hyderabad', 1),
 ('Bangalore', 1),
 ('Delhi', 1),
 ('Mumbai', 1),
 ('Chennai', 1),
 ('Hyderabad', 1),
 ('Delhi', 1),
 ('Bangalore', 1),
 ('Mumbai', 1),
 ('Chennai', 1),
 ('Delhi', 1),
 ('Hyderabad', 1),
 ('Bangalore', 1),
 ('Mumbai', 1),
 ('Delhi', 1),
 ('Chennai', 1),
 ('Hyderabad', 1),
 ('Bangalore', 1),
 ('Mumbai', 1),
 ('Delhi', 1)]

# Task 8

Calculate the average call cost per city using RDD transformations.

In [22]:
call_counts_per_city_reduced = number_of_calls_per_city.reduceByKey(lambda x, y: x + y)

average_call_cost_per_city = total_call.join(call_counts_per_city_reduced).map(lambda x: (x[0], x[1][0] / x[1][1]))
average_call_cost_per_city.collect()

[('Hyderabad', 8.95),
 ('Delhi', 3.8),
 ('Mumbai', 22.2),
 ('Bangalore', 4.15),
 ('Chennai', 5.3999999999999995)]

# Task 9

Filter and list all high-value calls where call cost is greater than 20.

In [23]:
high_value_calls=split_rdd.filter(lambda row:float(row[6])>20)
high_value_calls.collect()

[['C004', 'Pooja', 'Neha', 'Mumbai', 'ISD', '900', '25.0'],
 ['C009', 'Vikas', 'Riya', 'Mumbai', 'ISD', '1100', '30.0'],
 ['C012', 'Ayesha', 'Farhan', 'Hyderabad', 'ISD', '950', '28.0'],
 ['C019', 'Manish', 'Tina', 'Mumbai', 'ISD', '1000', '27.0']]

# Task 10

Count the number of ISD calls per city.

In [24]:
number_of_ISD_calls=split_rdd.filter(lambda row:row[4]=="ISD")
number_of_ISD_calls.collect()

[['C004', 'Pooja', 'Neha', 'Mumbai', 'ISD', '900', '25.0'],
 ['C009', 'Vikas', 'Riya', 'Mumbai', 'ISD', '1100', '30.0'],
 ['C012', 'Ayesha', 'Farhan', 'Hyderabad', 'ISD', '950', '28.0'],
 ['C019', 'Manish', 'Tina', 'Mumbai', 'ISD', '1000', '27.0']]

# Task 11

Identify the longest call based on call duration.

In [28]:
longest_call=split_rdd.map(lambda row:(row[0],float(row[5])))
longest_call_on_duration=longest_call.reduce(lambda x,y:x if x[1]>y[1] else y)
longest_call_on_duration

('C009', 1100.0)

# Task 12

Calculate the total revenue generated by each caller.

In [29]:
total_revenue=split_rdd.map(lambda row:(row[1],float(row[6])))
total_revenue.collect()

[('Amit', 2.5),
 ('Neha', 6.0),
 ('Rahul', 1.0),
 ('Pooja', 25.0),
 ('Arjun', 7.5),
 ('Sneha', 3.0),
 ('Karan', 2.0),
 ('Riya', 6.5),
 ('Vikas', 30.0),
 ('Anjali', 1.5),
 ('Farhan', 7.0),
 ('Ayesha', 28.0),
 ('Suresh', 2.0),
 ('Divya', 6.8),
 ('Nikhil', 2.8),
 ('Priya', 7.2),
 ('Rohit', 2.3),
 ('Kavya', 2.1),
 ('Manish', 27.0),
 ('Tina', 6.2)]

# Task 13

Detect suspicious calls based on the following rule:
duration greater than 900 seconds
cost greater than 25

In [32]:
suspicious_calls=split_rdd.filter(lambda row:float(row[5])>900 and float(row[6]) > 25)
suspicious_calls.collect()

[['C009', 'Vikas', 'Riya', 'Mumbai', 'ISD', '1100', '30.0'],
 ['C012', 'Ayesha', 'Farhan', 'Hyderabad', 'ISD', '950', '28.0'],
 ['C019', 'Manish', 'Tina', 'Mumbai', 'ISD', '1000', '27.0']]