In [1]:
%run ConfigFile.ipynb  ##import ConfigFile as cfg
%run DB_Connection.ipynb ##import DB_Connection as dbc
import pyodbc
import copy
import math
from datetime import datetime
import logging
import time
import os
import sys

import pandas as pd
import numpy as np
import re



import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import KFold  
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import chi2
from sklearn.metrics import f1_score

In [2]:
#creating a unique id with the help of current date timestamp
st = datetime.fromtimestamp(time.time()).strftime('%d%m%Y%H%M%S')
clientname=client_name['clientname']
logging.basicConfig(filename=clientname+"_"+st+".log",level=logging.DEBUG,format="%(asctime)s:%(levelname)s:%(message)s")
logging.getLogger().setLevel(logging.DEBUG)
logging.getLogger(clientname)

<Logger SAAB (DEBUG)>

### Keyphrases

In [3]:
introduction_section = ["how may I help","welcome to","thank you for calling","i am calling from","i can help you","i can assist you with"]
mid_section = ["for verification purpose","I can help you","I will help you","i can assist you with","apologize for","for the inconvenience"]
closure_section = ["thank you for calling","have a pleasant day","have a nice day","i will transfer the call","is there anything else","i can assist you with"]

### Function to get the Speaker IDs which matches the agent script

In [6]:
def get_speaker_list(sentences_list,keyphrase_list,temp_data):
    indexes_list = []
    speaker_list = []
    for phrase in keyphrase_list:
        indexes_list = [sentences_list.index(x) if phrase in x else '' for x in sentences_list]
        for i in indexes_list:
            if (isinstance(i,int)):
                speaker_list.append(temp_data.loc[i,'SpeakerId'])      
    return speaker_list

### Main function

In [11]:
def main():
    logging.debug("Starting....")
    #Reading data from local system and azure data
    try:
        logging.debug("Reading data....")
        data = fetch_data("select * from [dbo].[Fact_Audio_Insights] order by [Call_ID],[StartTime];")
    except Exception as err:
        if "Could not open a connection to SQL Server" in str(err):
            logging.error("Could not connect to data warehouse : "+str(err.args)+"\nTraceback :"+str(err.with_traceback))
        else:
            logging.error("Error occured with database :"+str(err.args)+"\nTraceback :"+str(err.with_traceback))
        sys.exit(1)
    try:
        distict_call_ids = list(data['Call_ID'].unique())
        
        logging.debug("CallId identification in progress....")
        call_id_agent_mapping_dict = {}
        for call_id in distict_call_ids:

            temp_data = data[data['Call_ID']==call_id][['Call_ID','SpeakerId','StartTime','Display']]

            # Introduction section check for agent spoken phrases
            intro_speaker_list = get_speaker_list(list(temp_data['Display'][:5]),introduction_section,temp_data)   

            # Mid section check for agent spoken phrases
            mid_speaker_list = get_speaker_list(list(temp_data['Display'][5:-5]),mid_section,temp_data)

            # Closure section check for agent spoken phrases
            closure_speaker_list = get_speaker_list(list(temp_data['Display'][-5:]),closure_section,temp_data)

            # Adding all the most occcuring Speaker IDs from diffent sections into a final list
            final_spk_list = []
            if intro_speaker_list:
                final_spk_list.append(max(intro_speaker_list,key=intro_speaker_list.count))
            if mid_speaker_list:
                final_spk_list.append(max(mid_speaker_list,key=mid_speaker_list.count))
            if closure_speaker_list:
                final_spk_list.append(max(closure_speaker_list,key=closure_speaker_list.count))

            # Creating a dictionary containing the SpeakerID flagged as agent
            if final_spk_list:
                call_id_agent_mapping_dict[call_id] = max(final_spk_list,key=final_spk_list.count)
            else:
                call_id_agent_mapping_dict[call_id] = np.nan
            
            logging.debug("Assigning labels in progress....")
            copy_data = copy.deepcopy(data)
            copy_data['Agent_ID'] = copy_data['Call_ID'].map(call_id_agent_mapping_dict)
            copy_data['Labels'] = np.where(copy_data['SpeakerId'] == copy_data['Agent_ID'],'A','C')
            call_speaker_labels_df = copy_data[['Call_ID','SpeakerId','Labels','StartTime']].groupby(['Call_ID','SpeakerId','Labels']).count().reset_index().drop('StartTime',axis=1)
            print(call_speaker_labels_df)
    except Exception as err:
        logging.error("Error occured : "+str(err.args)+"\nTraceback :"+str(err.with_traceback))
        sys.exit(1)

In [12]:
if __name__ == "__main__":
    main()

DEBUG - Starting....
DEBUG - Reading data....
DEBUG - CallId identification in progress....
DEBUG - Assigning labels in progress....
ERROR - Error occured : (0,)
Traceback :<built-in method with_traceback of KeyError object at 0x0000024760504F68>


        Call_ID  SpeakerId Labels
0      56482635          1      C
1      56482635          2      A
2      61876574          1      C
3      61876574          2      C
4      85550179          1      C
..          ...        ...    ...
176  2071978881          2      C
177  2074449487          1      C
178  2074449487          2      C
179  2146526476          1      C
180  2146526476          2      C

[181 rows x 3 columns]


SystemExit: 1