In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import datetime as dt
import re
import joblib

import warnings
from pandas.core.common import SettingWithCopyWarning

warnings.simplefilter(action = 'ignore', category = SettingWithCopyWarning)
warnings.filterwarnings(action = 'ignore', category = DeprecationWarning)
warnings.filterwarnings(action = 'ignore', category = FutureWarning)


import boto3

META_RAW_FILEPATH = "s3://fwd-sg-sagemaker-raw-data/voice_analytics"
CRM_RAW_FILEPATH = "s3://fwd-sg-sagemaker-raw-data/call_center"

In [2]:
### Load data extracted from Step - 0 for merging into dataset ###

# Load Call records extracted from Amazon Connect
metadata = pd.read_csv('{}/ContactSearchResults_agg.csv'.format(META_RAW_FILEPATH))


# Load extracted CRM data from Step 0  
crmdata = pd.read_csv('{}/dl_crm.Filterednew_calllog_extraction000'.format(CRM_RAW_FILEPATH))

## Fuzzy Match Preparation for AMZ Connect Dataset

In [6]:
# The Amazon Connect Dataset needs to be filtered and transformed for merging.

# Remove calls not connected to agent
metadata = metadata[metadata['Agent'].notna()]

# Remove calls that are likely to be automated messages 
# Eg: d4a6bd73-a54b-4a8f-bee4-e0bff4c96fc7 is an example with "38s duration" 
metadata = metadata[pd.to_timedelta(metadata['Contact duration']) > dt.timedelta(seconds=45)]
metadata.shape[0]

# Clean contact number
metadata['Customer phone number'] = metadata['Customer phone number'].str[2:]
metadata['Customer phone number'] = metadata['Customer phone number'].str.replace('.00','',regex=False)


# Turn timestamp to datetime, and get epoch timestamp
metadata['Initiation timestamp'] = pd.to_datetime(metadata['Initiation timestamp'])
metadata['ts'] = metadata['Initiation timestamp'].values.astype(np.int64) // 10 ** 9
metadata['ts'] = metadata['ts'].apply(lambda x: np.nan if x < 0 else x)

metadata['Disconnect timestamp'] = pd.to_datetime(metadata['Disconnect timestamp'])
metadata['ts_end'] = metadata['Disconnect timestamp'].values.astype(np.int64) // 10 ** 9
metadata['ts_end'] = metadata['ts_end'].apply(lambda x: np.nan if x < 0 else x)

## Fuzzy Match Preparation for CRM Dataset

In [7]:
# The CRM Dataset needs to be filtered and transformed for merging.

# Turn timestamp to datetime, and get epoch timestamp
crmdata['interaction_date'] = pd.to_datetime(crmdata['call_time'])
crmdata['ts'] = crmdata['interaction_date'].values.astype(np.int64) // 10 ** 9
crmdata['ts'] = crmdata['ts'].apply(lambda x: np.nan if x < 0 else x)

# Get Calls classified under "Self-Help - Assist" only.
crmdata = crmdata[crmdata['subcategory1'].isin(['Self Help - Assist'])]

# Fuzzy Matching Function 

Connect function is to be applied on AMZ Connect Metadata Table. It fuzzy matches the connect metadata table and interaction table by querying for a match in the interaction table for every connect metadata row with 3 conditions using pandas apply function and adding the interaction_id of the matched to the connect metadata table for merging (inter_id).

The 3 conditions are as follows:

1. Same Customer Number (if multiple interactions for cust no. match to the closest start time)
2. Start time of tables within a specified time (RFE) AND Same agent (multiple match possible, in that case count as no match as put string in inter_id column)
3. End time of tables within a specified time (RFE) AND Same agent (multiple match possible, in that case count as no match as put string in inter_id column)

The "rfe" variable can be optimised to find the value with the most amount of matches (too high many multiple match, too low too little matches) 

NOTE: Accuracy of matches with conditions 2 and 3 are also not guaranteed, especially with higher "rfe" values

In [8]:
### Remove interactions which are non-phone based (email based), fro the purposes of analysis' ###

print('Self_assist interactions: {}'.format(len(crmdata)))
crmdata = crmdata[crmdata['customer_phone'].apply(lambda x: re.search('^[0-9]*$', str(x))).notnull()]
print('Self_assist calls: {}'.format(len(crmdata)))
# Deduplicate the crmdata - deduplication based on phone number and call time. 
crmdata = crmdata.drop_duplicates(['customer_phone','call_time'])
print('Self_assist calls deduped: {}'.format(len(crmdata)))
# Only accept inbound calls as per Lina's suggestion
crmdata = crmdata[crmdata['call_direction'] == 'Inbound']
print('Self_assist calls inbound: {}'.format(len(crmdata)))

# Cleaning for metadata - the phone numbers have one extra number
metadata['Customer phone number2'] = metadata['Customer phone number'].apply(lambda x: x[1:])

nb_matches = len(set(metadata['Customer phone number2']).intersection(crmdata['customer_phone']))
perc_matches = nb_matches / len(crmdata)

print('Number of phone number matches: {}'.format(nb_matches))
print('Percentage of phone numbers out of crm logs matched: {}'.format(perc_matches))

Self_assist interactions: 964
Self_assist calls: 752
Self_assist calls deduped: 722
Self_assist calls inbound: 607
Number of phone number matches: 578
Percentage of phone numbers out of crm logs matched: 0.9522240527182867


In [9]:
### Create join keys and merging both dataset ###

# Map agent to email as joining key between AMZ Connect and CRM datasets
agent_email_mapping_dict = {
    'Adeline Wong Sook Kuan': 'adeline.wong@fwd.com',
    'Bobby Teng Hao Han': 'bobby.teng@fwd.com',
    'CRM Administrator': None,
    'Donna Ang Lay Kheng': 'donna.ang@fwd.com',
    'Eric Jiang Hailong': 'eric.jiang@fwd.com',
    'Ethan Lua Wei Jun': 'ethan.lua@fwd.com',
    'Eugene Jacob Chan Dong Hao': 'eugene.chan@fwd.com',
    'Evon Thoo Yee Wan': 'evon.thoo@fwd.com',
    'Gavin Poon Joun Wae': 'gavin.poon@fwd.com',
    'Go Irish Dianne Vinaviles': 'irish.go@fwd.com',
    'Grace Chong Wei Hui': 'grace.chong@fwd.com',
    'Idio Kristine Avril Bebe': 'kristine.idio@fwd.com',
    'Jayde Ng Shu Ying': 'jayde.ng@fwd.com',
    'Kasturi Konasegaran': 'Kasturi.Konasegaran@fwd.com',
    'Kokilaletchmi D/O Govindasamy': 'koki.govindasamy@fwd.com',
    'Lim Yu Yan': 'yuyan.lim@fwd.com',
    'Marissa Binte Ismail': 'marissa.ismail@fwd.com',
    'Mooralli Raj S/O Sumoo Amatharlingam': None,
    'Muhammad Ridzwan Bin Yusri' : 'ridzwan.yusri@fwd.com',
    'Ngieng Sue Min': 'suemin.ngieng@fwd.com',
    'Noraini Taraman': 'noraini.taraman@fwd.com',
    'Nur Azlina Binte Abdul Rahman' : 'nurazlina.rahman@fwd.com',
    'Nurwati Binte Jamil': 'nurwati.jamil@fwd.com',
    'Pamella Krystle Bonayon Flores': 'pamella.flores@fwd.com',
    'Resie Arjonillo': 'resie.arjonillo@fwd.com',
    'Tracy Hwang Poh Sim': 'tracy.hwang@fwd.com',
    'Villegas Karl Geoff Duques': 'karl.villegas@fwd.com',         
}

crmdata['agent_email'] = crmdata['agent_name'].map(agent_email_mapping_dict)

# Create join keys for dataset
crmdata['join_key'] = crmdata['customer_phone'].astype(str) + crmdata['agent_email']
metadata['join_key'] = metadata['Customer phone number2'].astype(str) + metadata['Agent']

# Join the data
joined_data = pd.merge(crmdata, metadata, how = 'inner', left_on = 'join_key', right_on = 'join_key')

In [10]:
### Filter and match data based on timestamp ###

# Match the call_time (crmdata) and initiation_timestamp (metadata), or else row is thrown away
joined_data['call_time'] = pd.to_datetime(joined_data['call_time'])
joined_data['call_time_lower'] = joined_data['call_time'] - pd.Timedelta(minutes = 2)
joined_data['call_time_upper'] = joined_data['call_time'] + pd.Timedelta(minutes = 2)
joined_data = joined_data[((joined_data['Initiation timestamp'] <= joined_data['call_time_upper']) & 
                           (joined_data['Initiation timestamp'] >= joined_data['call_time_lower']))]

print('Proportion of matched data : {}'.format(len(joined_data) / len(crmdata)))
print('Number of calls matched: {}'.format(len(joined_data)))

# Save merged dataset into csv
joined_data.to_csv('./Data/merged_crm_meta.csv', index = False)

Proportion of matched data : 0.7841845140032949
Number of calls matched: 476


In [11]:
df = pd.read_csv('./Data/merged_crm_meta.csv')
df

Unnamed: 0,call_date,call_month,call_time,agent_name,category_name,call_direction,policy_number,subcategory1,subcategory2,customer_phone,...,Agent,Customer phone number,Disconnect timestamp,Contact duration,month,ts_y,ts_end,Customer phone number2,call_time_lower,call_time_upper
0,2021-11-29,11,2021-11-29 11:40:12,Tracy Hwang Poh Sim,Service,Inbound,PNTR2021-00011604,Self Help - Assist,Cancellation,81271736,...,tracy.hwang@fwd.com,581271736,2021-11-29 11:43:19,00:04:24,11,1638185934,1638186199,81271736,2021-11-29 11:38:12,2021-11-29 11:42:12
1,2021-11-29,11,2021-11-29 12:10:35,Grace Chong Wei Hui,Service,Inbound,PNTR2021-00007642,Self Help - Assist,POI extension,87690767,...,grace.chong@fwd.com,587690767,2021-11-29 12:15:26,00:06:14,11,1638187751,1638188126,87690767,2021-11-29 12:08:35,2021-11-29 12:12:35
2,2021-11-30,11,2021-11-30 15:12:35,Marissa Binte Ismail,Service,Inbound,PNTR2021-00007446,Self Help - Assist,Cancellation,97530439,...,marissa.ismail@fwd.com,597530439,2021-11-30 15:14:16,00:03:09,11,1638285066,1638285256,97530439,2021-11-30 15:10:35,2021-11-30 15:14:35
3,2021-11-30,11,2021-11-30 16:39:19,Pamella Krystle Bonayon Flores,Service,Inbound,PNTR2021-00012679 | PNTC2021-00007490,Self Help - Assist,Download Documents,98472116,...,pamella.flores@fwd.com,598472116,2021-11-30 16:50:14,00:12:26,11,1638290267,1638291014,98472116,2021-11-30 16:37:19,2021-11-30 16:41:19
4,2021-11-30,11,2021-11-30 17:51:39,Pamella Krystle Bonayon Flores,Service,Inbound,PNTR2021-00009818 | PNTC2021-00005382,Self Help - Assist,Cancellation,91820838,...,pamella.flores@fwd.com,591820838,2021-11-30 18:05:47,00:15:32,11,1638294614,1638295547,91820838,2021-11-30 17:49:39,2021-11-30 17:53:39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,2021-10-14,10,2021-10-14 16:05:04,Go Irish Dianne Vinaviles,Service,Inbound,PNMC2021-00004499,Self Help - Assist,Basic Information Endorsement,81801342,...,irish.go@fwd.com,581801342,2021-10-14 16:12:58,00:09:04,10,1634227434,1634227978,81801342,2021-10-14 16:03:04,2021-10-14 16:07:04
472,2021-08-02,8,2021-08-02 11:13:40,Go Irish Dianne Vinaviles,Service,Inbound,PNPA2021-00000686,Self Help - Assist,Claim Submission,66128224,...,irish.go@fwd.com,566128224,2021-08-02 11:21:57,00:09:26,8,1627902750,1627903317,66128224,2021-08-02 11:11:40,2021-08-02 11:15:40
473,2021-08-02,8,2021-08-02 12:41:49,Go Irish Dianne Vinaviles,Service,Inbound,PNMC2019-00001487-02,Self Help - Assist,Cancellation,86115095,...,irish.go@fwd.com,586115095,2021-08-02 12:49:38,00:08:59,8,1627908039,1627908578,86115095,2021-08-02 12:39:49,2021-08-02 12:43:49
474,2021-08-18,8,2021-08-18 21:47:18,Villegas Karl Geoff Duques,Service,Inbound,,Self Help - Assist,Cancellation,92368410,...,karl.villegas@fwd.com,592368410,2021-08-18 21:54:54,00:08:54,8,1629323159,1629323694,92368410,2021-08-18 21:45:18,2021-08-18 21:49:18
