In [1]:
from pyspark.sql import SparkSession
# Obtain the SparkSession used by every cell below, registered under the
# application name "Basics".
spark = (
    SparkSession.builder
    .appName("Basics")
    .getOrCreate()
)

In [2]:
# Inline sample data set in CSV form: one header line naming the seven columns
# (call_id, caller, receiver, city, call_type, duration_seconds, cost)
# followed by twenty call records, C001 through C020.
data = """call_id,caller,receiver,city,call_type,duration_seconds,cost
C001,Amit,Rahul,Hyderabad,Local,180,2.5
C002,Neha,Arjun,Bangalore,STD,320,6.0
C003,Rahul,Pooja,Delhi,Local,60,1.0
C004,Pooja,Neha,Mumbai,ISD,900,25.0
C005,Arjun,Amit,Chennai,STD,400,7.5
C006,Sneha,Karan,Hyderabad,Local,240,3.0
C007,Karan,Sneha,Delhi,Local,120,2.0
C008,Riya,Vikas,Bangalore,STD,360,6.5
C009,Vikas,Riya,Mumbai,ISD,1100,30.0
C010,Anjali,Sanjay,Chennai,Local,90,1.5
C011,Farhan,Ayesha,Delhi,STD,420,7.0
C012,Ayesha,Farhan,Hyderabad,ISD,950,28.0
C013,Suresh,Divya,Bangalore,Local,150,2.0
C014,Divya,Suresh,Mumbai,STD,380,6.8
C015,Nikhil,Priya,Delhi,Local,200,2.8
C016,Priya,Nikhil,Chennai,STD,410,7.2
C017,Rohit,Kavya,Hyderabad,Local,170,2.3
C018,Kavya,Rohit,Bangalore,Local,140,2.1
C019,Manish,Tina,Mumbai,ISD,1000,27.0
C020,Tina,Manish,Delhi,STD,350,6.2
"""
# Write the sample records to call_records.csv in the working directory so
# that Spark can load them by path.
# SparkContext.textFile expects UTF-8 input, so the encoding is stated
# explicitly instead of relying on the platform's default text encoding.
with open("call_records.csv", "w", encoding="utf-8") as f:
    f.write(data)

#Read the CSV file using sparkContext.textFile and display the first 5 records.

In [3]:
# Load the CSV as an RDD of raw text lines; the header line is still included
# at this stage.
rdd = (
    spark.sparkContext
    .textFile("call_records.csv")
)

# Preview the first five lines.
rdd.take(5)

['call_id,caller,receiver,city,call_type,duration_seconds,cost',
 'C001,Amit,Rahul,Hyderabad,Local,180,2.5',
 'C002,Neha,Arjun,Bangalore,STD,320,6.0',
 'C003,Rahul,Pooja,Delhi,Local,60,1.0',
 'C004,Pooja,Neha,Mumbai,ISD,900,25.0']

#Remove the header row and create a clean RDD containing only data rows.

In [4]:
# The first line of the file holds the column names.
header = rdd.first()


def _is_data_row(line):
    """Return True for every line whose text differs from the header line."""
    return line != header


# Keep only the data rows, then show all of them.
clean_rdd = rdd.filter(_is_data_row)
clean_rdd.collect()

['C001,Amit,Rahul,Hyderabad,Local,180,2.5',
 'C002,Neha,Arjun,Bangalore,STD,320,6.0',
 'C003,Rahul,Pooja,Delhi,Local,60,1.0',
 'C004,Pooja,Neha,Mumbai,ISD,900,25.0',
 'C005,Arjun,Amit,Chennai,STD,400,7.5',
 'C006,Sneha,Karan,Hyderabad,Local,240,3.0',
 'C007,Karan,Sneha,Delhi,Local,120,2.0',
 'C008,Riya,Vikas,Bangalore,STD,360,6.5',
 'C009,Vikas,Riya,Mumbai,ISD,1100,30.0',
 'C010,Anjali,Sanjay,Chennai,Local,90,1.5',
 'C011,Farhan,Ayesha,Delhi,STD,420,7.0',
 'C012,Ayesha,Farhan,Hyderabad,ISD,950,28.0',
 'C013,Suresh,Divya,Bangalore,Local,150,2.0',
 'C014,Divya,Suresh,Mumbai,STD,380,6.8',
 'C015,Nikhil,Priya,Delhi,Local,200,2.8',
 'C016,Priya,Nikhil,Chennai,STD,410,7.2',
 'C017,Rohit,Kavya,Hyderabad,Local,170,2.3',
 'C018,Kavya,Rohit,Bangalore,Local,140,2.1',
 'C019,Manish,Tina,Mumbai,ISD,1000,27.0',
 'C020,Tina,Manish,Delhi,STD,350,6.2']

#Split each row into individual fields using a delimiter.

In [5]:
# Turn each comma-separated line into a list of its string fields.
split_rdd = clean_rdd.map(
    lambda line: line.split(",")
)

# Preview the first three parsed records.
split_rdd.take(3)

[['C001', 'Amit', 'Rahul', 'Hyderabad', 'Local', '180', '2.5'],
 ['C002', 'Neha', 'Arjun', 'Bangalore', 'STD', '320', '6.0'],
 ['C003', 'Rahul', 'Pooja', 'Delhi', 'Local', '60', '1.0']]

#Calculate the total call cost per city.

In [8]:
# Build (city, cost) pairs: field 3 is the city and field 6 is the cost,
# which is parsed from text into a float.
city_call_cost = split_rdd.map(
    lambda fields: (fields[3], float(fields[6]))
)
city_call_cost.collect()

[('Hyderabad', 2.5),
 ('Bangalore', 6.0),
 ('Delhi', 1.0),
 ('Mumbai', 25.0),
 ('Chennai', 7.5),
 ('Hyderabad', 3.0),
 ('Delhi', 2.0),
 ('Bangalore', 6.5),
 ('Mumbai', 30.0),
 ('Chennai', 1.5),
 ('Delhi', 7.0),
 ('Hyderabad', 28.0),
 ('Bangalore', 2.0),
 ('Mumbai', 6.8),
 ('Delhi', 2.8),
 ('Chennai', 7.2),
 ('Hyderabad', 2.3),
 ('Bangalore', 2.1),
 ('Mumbai', 27.0),
 ('Delhi', 6.2)]

In [9]:

# Add up the per-call costs that share the same city key.
call_cost_per_city = city_call_cost.reduceByKey(
    lambda running_total, cost: running_total + cost
)
call_cost_per_city.collect()

[('Hyderabad', 35.8),
 ('Delhi', 19.0),
 ('Mumbai', 88.8),
 ('Bangalore', 16.6),
 ('Chennai', 16.2)]

#Identify the city with the highest total call cost.

In [10]:
def _pick_costlier(left, right):
    """Return whichever (city, total_cost) pair has the strictly larger cost.

    When the two costs are equal, `right` is returned.
    """
    if left[1] > right[1]:
        return left
    return right


# Fold all (city, total_cost) pairs down to the single most expensive one.
highest_city_call = call_cost_per_city.reduce(_pick_costlier)
highest_city_call

('Mumbai', 88.8)

#Calculate the total call duration per call type (Local, STD, ISD).

In [15]:
# Build (call_type, duration) pairs: field 4 is the call type and field 5 is
# duration_seconds, parsed here as a float.
# NOTE: the variable name says "cost", but the values are call durations.
type_call_cost = split_rdd.map(
    lambda fields: (fields[4], float(fields[5]))
)
type_call_cost.collect()

[('Local', 180.0),
 ('STD', 320.0),
 ('Local', 60.0),
 ('ISD', 900.0),
 ('STD', 400.0),
 ('Local', 240.0),
 ('Local', 120.0),
 ('STD', 360.0),
 ('ISD', 1100.0),
 ('Local', 90.0),
 ('STD', 420.0),
 ('ISD', 950.0),
 ('Local', 150.0),
 ('STD', 380.0),
 ('Local', 200.0),
 ('STD', 410.0),
 ('Local', 170.0),
 ('Local', 140.0),
 ('ISD', 1000.0),
 ('STD', 350.0)]

In [16]:

# Sum the values of type_call_cost for each call type.
# NOTE: despite "cost" in both variable names, type_call_cost is built from
# the duration_seconds field, so these totals are durations in seconds.
call_cost_per_type = type_call_cost.reduceByKey(
    lambda running_total, seconds: running_total + seconds
)
call_cost_per_type.collect()

[('Local', 1350.0), ('STD', 2640.0), ('ISD', 3950.0)]

#Count the number of calls per city.

In [18]:

# Emit a 1 for every call keyed by its city (field 3), then add the ones up
# to get the number of calls per city.
calls_per_city = split_rdd.map(
    lambda fields: (fields[3], 1)
).reduceByKey(
    lambda count_so_far, one_more: count_so_far + one_more
)
calls_per_city.collect()


[('Hyderabad', 4),
 ('Delhi', 5),
 ('Mumbai', 4),
 ('Bangalore', 4),
 ('Chennai', 3)]

#Calculate the average call cost per city using RDD transformations.

In [20]:

def _add_cost(running, cost):
    """Fold one cost into a (sum, count) accumulator."""
    total, count = running[0], running[1]
    return (total + cost, count + 1)


def _merge_accumulators(first, second):
    """Combine two (sum, count) accumulators into one."""
    return (first[0] + second[0], first[1] + second[1])


def _mean_of(sum_and_count):
    """Return sum / count, or 0.0 when the count is not positive."""
    if sum_and_count[1] > 0:
        return sum_and_count[0] / sum_and_count[1]
    return 0.0


# For every city, accumulate (sum of costs, number of calls) starting from
# (0.0, 0), then divide to obtain the average cost.
avg_cost_per_city = city_call_cost.aggregateByKey(
    (0.0, 0),
    _add_cost,
    _merge_accumulators,
).mapValues(_mean_of)

print(avg_cost_per_city.collect())




[('Hyderabad', 8.95), ('Delhi', 3.8), ('Mumbai', 22.2), ('Bangalore', 4.15), ('Chennai', 5.3999999999999995)]


#Filter and list all high-value calls where call cost is greater than 20.

In [22]:
# Keep only the records whose cost (field 6) is strictly above 20.
high_value_calls = split_rdd.filter(
    lambda fields: 20 < float(fields[6])
)
high_value_calls.collect()

[['C004', 'Pooja', 'Neha', 'Mumbai', 'ISD', '900', '25.0'],
 ['C009', 'Vikas', 'Riya', 'Mumbai', 'ISD', '1100', '30.0'],
 ['C012', 'Ayesha', 'Farhan', 'Hyderabad', 'ISD', '950', '28.0'],
 ['C019', 'Manish', 'Tina', 'Mumbai', 'ISD', '1000', '27.0']]

#Count the number of ISD calls per city.

In [24]:

def _is_isd(fields):
    """Return True when the call type (field 4) is "ISD".

    Surrounding whitespace and letter case are ignored in the comparison.
    """
    call_type = str(fields[4]).strip().upper()
    return call_type == "ISD"


# Restrict to ISD calls, emit a 1 per call keyed by city (field 3), and sum
# the ones to count ISD calls per city.
isd_calls_per_city = split_rdd.filter(_is_isd).map(
    lambda fields: (fields[3], 1)
).reduceByKey(
    lambda count_so_far, one_more: count_so_far + one_more
)
isd_calls_per_city.collect()


[('Mumbai', 3), ('Hyderabad', 1)]

#Identify the longest call based on call duration.

In [28]:
def _duration_seconds(fields):
    """Return the record's duration (field 5) as an integer."""
    return int(fields[5])


# Pick the record with the largest duration.
longest_call = split_rdd.max(key=_duration_seconds)
longest_call

['C009', 'Vikas', 'Riya', 'Mumbai', 'ISD', '1100', '30.0']

#Calculate the total revenue generated by each caller.

In [30]:

# Key every call by its caller (field 1) with the call's cost (field 6) as
# the value, then add up the costs per caller.
revenue_per_caller = split_rdd.map(
    lambda fields: (fields[1], float(fields[6]))
).reduceByKey(
    lambda running_total, cost: running_total + cost
)

revenue_per_caller.collect()


[('Amit', 2.5),
 ('Pooja', 25.0),
 ('Karan', 2.0),
 ('Riya', 6.5),
 ('Vikas', 30.0),
 ('Suresh', 2.0),
 ('Divya', 6.8),
 ('Nikhil', 2.8),
 ('Rohit', 2.3),
 ('Manish', 27.0),
 ('Tina', 6.2),
 ('Neha', 6.0),
 ('Rahul', 1.0),
 ('Arjun', 7.5),
 ('Sneha', 3.0),
 ('Anjali', 1.5),
 ('Farhan', 7.0),
 ('Ayesha', 28.0),
 ('Priya', 7.2),
 ('Kavya', 2.1)]

#Detect suspicious calls. A call is suspicious only when it satisfies both of the following rules:
duration greater than 900 seconds
cost greater than 25

In [33]:

def _is_suspicious(fields):
    """Return True when duration (field 5) > 900 and cost (field 6) > 25.

    The cost is only parsed once the duration check has passed.
    """
    if not int(fields[5]) > 900:
        return False
    return float(fields[6]) > 25


# Keep only the records that satisfy both conditions.
suspicious_calls = split_rdd.filter(_is_suspicious)

suspicious_calls.collect()


[['C009', 'Vikas', 'Riya', 'Mumbai', 'ISD', '1100', '30.0'],
 ['C012', 'Ayesha', 'Farhan', 'Hyderabad', 'ISD', '950', '28.0'],
 ['C019', 'Manish', 'Tina', 'Mumbai', 'ISD', '1000', '27.0']]