In [0]:
import pyspark 
from pyspark.sql import functions as F
from pyspark.sql.functions import *

# SparkSession is imported explicitly: the wildcard import from
# pyspark.sql.functions is not a dependable source for this name.
from pyspark.sql import SparkSession

# Reuse the active session if there is one, otherwise build one named 'Call Center'.
spark = SparkSession.builder.appName('Call Center').getOrCreate()


# File location and type
file_location = "/FileStore/tables/Call_Center.csv"
file_type = "csv"

# CSV options
infer_schema = "True"
first_row_is_header = "True"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load(file_location)
)
# `data` is a second name bound to the same DataFrame; the analysis cells query it.
data = df
display(data)

id,customer_name,sentiment,csat_score,call_timestamp,reason,city,state,channel,response_time,call duration in minutes,call_center
DKK-57076809-w-055481-fU,Analise Gairdner,Neutral,7.0,2020-10-29,Billing Question,Detroit,Michigan,Call-Center,Within SLA,17,Los Angeles/CA
QGK-72219678-w-102139-KY,Crichton Kidsley,Very Positive,,2020-10-05,Service Outage,Spartanburg,South Carolina,Chatbot,Within SLA,23,Baltimore/MD
GYJ-30025932-A-023015-LD,Averill Brundrett,Negative,,2020-10-04,Billing Question,Gainesville,Florida,Call-Center,Above SLA,45,Los Angeles/CA
ZJI-96807559-i-620008-m7,Noreen Lafflina,Very Negative,1.0,2020-10-17,Billing Question,Portland,Oregon,Chatbot,Within SLA,12,Los Angeles/CA
DDU-69451719-O-176482-Fm,Toma Van der Beken,Very Positive,,2020-10-17,Payments,Fort Wayne,Indiana,Call-Center,Within SLA,23,Los Angeles/CA
JVI-79728660-U-224285-4a,Kaylyn Emlen,Neutral,5.0,2020-10-28,Billing Question,Salt Lake City,Utah,Call-Center,Within SLA,25,Baltimore/MD
AZI-95054097-e-185542-PT,Phillipe Bowring,Neutral,8.0,2020-10-16,Billing Question,Tyler,Texas,Chatbot,Within SLA,31,Baltimore/MD
TWX-27007918-I-608789-Xw,Krysta de Tocqueville,Positive,,2020-10-21,Billing Question,New York City,New York,Chatbot,Below SLA,37,Los Angeles/CA
XNG-44599118-P-344473-ZU,Oran Lifsey,Very Negative,,2020-10-03,Billing Question,Dallas,Texas,Email,Below SLA,37,Baltimore/MD
RLC-64108207-Z-285141-VS,Port Inggall,Neutral,,2020-10-07,Billing Question,Cincinnati,Ohio,Chatbot,Within SLA,12,Baltimore/MD


In [0]:
# Register the loaded DataFrame as a temporary view under the name below.
temp_table_name = "Call_Center_csv"
df.createOrReplaceTempView(temp_table_name)

**Total Calls**

In [0]:
# Total number of calls: count the non-null `id` values and name the result
# column 'Calls'. The alias must sit on the count column itself; calling
# .alias() on the DataFrame leaves the column named count(id).
display(data.select(F.count('id').alias('Calls')))

count(id)
32941


Databricks visualization. Run in Databricks to view.

**Total Call Duration (minutes)**

In [0]:
# Sum of the 'call duration in minutes' column over the whole DataFrame.
display(
    data.agg(F.sum('call duration in minutes'))
)

sum(call duration in minutes)
824222


Databricks visualization. Run in Databricks to view.

**Average Call Duration**

In [0]:
# Mean of the 'call duration in minutes' column over the whole DataFrame.
display(
    data.agg(F.avg('call duration in minutes'))
)

avg(call duration in minutes)
25.02115904192344


Databricks visualization. Run in Databricks to view.

**Average CSAT Score**

In [0]:
# Mean of the 'csat_score' column over the whole DataFrame.
display(
    data.agg(F.avg('csat_score'))
)

avg(csat_score)
5.548447559286122


Databricks visualization. Run in Databricks to view.

**Number of Calls by State**

In [0]:
# Group by the 'state' column and count the rows in each group;
# the tally is returned in a column named 'count'.
display(
    data
    .groupBy("state")
    .count()
)

state,count
Utah,298
Hawaii,149
Minnesota,712
Ohio,1160
Oregon,261
Arkansas,204
Texas,3572
North Dakota,76
Pennsylvania,1017
Connecticut,408


Databricks visualization. Run in Databricks to view.

**Calls by Sentiment**

In [0]:
# Row count for each distinct value of the 'sentiment' column.
display(
    data
    .groupBy("sentiment")
    .count()
)

sentiment,count
Very Negative,6026
Very Positive,3170
Neutral,8754
Positive,3928
Negative,11063


Databricks visualization. Run in Databricks to view.

**Calls by Response Time**

In [0]:
# Row count for each distinct value of the 'response_time' column.
display(
    data
    .groupBy("response_time")
    .count()
)

response_time,count
Within SLA,20625
Above SLA,4168
Below SLA,8148


Databricks visualization. Run in Databricks to view.

**Calls by Channel**

In [0]:
# Row count for each distinct value of the 'channel' column.
display(
    data
    .groupBy("channel")
    .count()
)

channel,count
Email,7470
Chatbot,8256
Call-Center,10639
Web,6576


Databricks visualization. Run in Databricks to view.

**No. of Calls by City**

In [0]:
# Row count for each distinct value of the 'city' column.
display(
    data
    .groupBy('city')
    .count()
)

city,count
Tyler,61
Worcester,49
Fairbanks,38
Springfield,304
Charleston,219
Harrisburg,79
Corona,42
Tempe,19
Lawrenceville,43
North Las Vegas,29


Databricks visualization. Run in Databricks to view.

**Calls by Reason**

In [0]:
# Row count for each distinct value of the 'reason' column.
display(
    data
    .groupBy('reason')
    .count()
)

reason,count
Payments,4749
Service Outage,4730
Billing Question,23462


Databricks visualization. Run in Databricks to view.

**Calls Received by the Call Center**

In [0]:
# Group by the 'call_center' column, count the 'id's in each group as
# 'Number of Calls', then sort the call centers from most to fewest calls.
# F.count is used so the cell does not need its own mid-notebook import.
display(
    data.groupBy("call_center")
    .agg(F.count("id").alias("Number of Calls"))
    .orderBy("Number of Calls", ascending=False)
)



call_center,Number of Calls
Los Angeles/CA,13734
Baltimore/MD,11012
Chicago/IL,5419
Denver/CO,2776


Databricks visualization. Run in Databricks to view.