In [1]:
%run ConfigFile.ipynb
%run DB_Connection.ipynb

import pyodbc
import copy
import math
from datetime import datetime
import logging
import time
import os
import sys
import ast
import json

import pandas as pd
import numpy as np
import re

# Text preprocessing imports
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()

In [2]:
# Unique run id derived from the current local time, formatted as ddmmYYYYHHMMSS
st = datetime.now().strftime('%d%m%Y%H%M%S')
clientname = client_name['clientname']

# Log records go to "<clientname>_<run id>.log" at DEBUG level
logging.basicConfig(
    format="%(asctime)s:%(levelname)s:%(message)s",
    level=logging.DEBUG,
    filename="".join((clientname, "_", st, ".log")),
)
# Set the root logger level explicitly as well
logging.getLogger().setLevel(logging.DEBUG)
# Last expression of the cell: the logger named after the client is shown as the cell output
logging.getLogger(clientname)

<Logger SAAB (DEBUG)>

In [3]:
# Timestamp captured once when this cell runs; main() stores it in the CREATED_DATE column of every result row.
today_timestamp = datetime.now()

### Preprocessing

In [4]:
def preprocess_text(input_dataframe,target_column_number):
    '''
    Clean and lemmatize one text column of a dataframe.

    Every value is lower-cased, stripped of the special characters listed in
    the pattern below, has runs of whitespace collapsed to a single space and
    is then lemmatized with the module-level spaCy pipeline ``nlp``.

    Input:
        input_dataframe : dataframe holding the text to preprocess
        target_column_number : position of the text column in the tuples
            produced by ``DataFrame.itertuples()`` (0 is the index, 1 is the
            first column)
    Output:
        list with one preprocessed string per row, in row order
    '''
    # The earlier pattern listed the characters separated by "|"; inside a
    # character class "+|-|_" is read as the range "|".."|", so hyphens were
    # never removed. Here the hyphen is placed last so it is a literal.
    # "|" stays in the class because the earlier pattern removed it as well.
    special_characters = re.compile(r'[?|$.!#%^*;,+_=&-]')
    repeated_whitespace = re.compile(r'\s\s+')

    lemmatized_text = []
    for row in input_dataframe.itertuples():
        raw_text = row[target_column_number]
        # Missing values (None / NaN) are treated as empty text instead of failing on .lower()
        if raw_text is None or (isinstance(raw_text, float) and math.isnan(raw_text)):
            raw_text = ''
        # Removing special characters and extra spaces and lower casing the text
        clean_text = repeated_whitespace.sub(' ', special_characters.sub('', str(raw_text).lower()))
        # Lemmatizing the text; a '-PRON-' placeholder lemma is replaced by the token's own text
        lemmatized_text.append(" ".join(str(token.lemma_).replace('-PRON-', str(token)) for token in nlp(clean_text)))
    return lemmatized_text

### Script adherence check

In [5]:
def adherance_check(row_data,adherance):
    '''
    Score one transcript against every script row of the adherance master data.

    For each master row the keyword column is parsed into a mapping of
    section name -> list of phrases. A section counts as covered when at least
    one of its phrases occurs in the part of the transcript searched for that
    section; the score of the row is the fraction of covered sections.

    Input:
        row_data : transcript text of one call, utterances separated by ":"
        adherance : dataframe of master script rows; in its itertuples() rows
            position 3 is used as the key of the result and position 5 must
            hold the string form of a dict {section: [phrases]}
    Output:
        dict mapping the value at position 3 of each master row to its score,
        rounded to 2 decimals
    '''
    utterances = row_data.split(":")
    adherance_check_list = {}
    for row in adherance.itertuples():
        # Parse the keyword dictionary once per master row instead of on every use
        sections = ast.literal_eval(row[5])
        if not sections:
            # Nothing to check against; also avoids dividing by zero below
            adherance_check_list[row[3]] = 0.0
            continue

        # The counter lives outside the section loop so every covered section is
        # counted (it used to be reset for each section, keeping only the last one).
        covered_sections = 0
        for section, phrases in sections.items():
            if section == 'Greetings':
                # Greetings are only looked for in the first five utterances
                search_text = ":".join(utterances[:5])
            elif section == 'Closure':
                # Closure is only looked for after the first five utterances.
                # NOTE(review): the slice [5:] is kept from the earlier code, where its
                # result was discarded; confirm whether the last five utterances
                # ([-5:]) were intended, since calls of five utterances or fewer
                # can never match here.
                search_text = ":".join(utterances[5:])
            else:
                search_text = row_data
            if any(phrase in search_text for phrase in phrases):
                covered_sections += 1

        adherance_check_list[row[3]] = round(covered_sections/len(sections),2)

    return adherance_check_list

### Writing to DW

In [6]:
def write_df_to_dw(script_adherance_df):
    '''
    Insert the script adherance scores into [dbo].[SAAB_ML_SCRIPT_ADHERANCE_FT].

    One row is inserted per dataframe row. The commit happens only after every
    insert has succeeded, and the cursor and connection are closed whether or
    not an insert fails.

    Input :
        script_adherance_df : dataframe with the columns DOMAIN_ID, SCRIPT_ID,
            CALL_ID, SCORE, CREATED_DATE and CREATED_BY
    Raises :
        whatever pyodbc raises while connecting or inserting
    '''
    # Connection settings can be overridden through environment variables; the
    # fallbacks are the values that used to be hard-coded here.
    server = os.environ.get('SAAB_DW_SERVER', 'saab-server-resource.database.windows.net')
    database = os.environ.get('SAAB_DW_DATABASE', 'SAAB_DW_Resource')
    username = os.environ.get('SAAB_DW_USERNAME', 'saabadmin')
    # FIXME(security): a password must not live in source control. Provide
    # SAAB_DW_PASSWORD in the environment, then delete this fallback and rotate the password.
    password = os.environ.get('SAAB_DW_PASSWORD', 'p@$$w0rd')

    insert_sql = "INSERT INTO [dbo].[SAAB_ML_SCRIPT_ADHERANCE_FT]([RESULT_ID],[DOMAIN_ID],[SCRIPT_ID],[CALL_ID],[SCORE],[CREATED_DATE],[CREATED_BY]) values (?, ?, ?, ?, ?, ?, ?)"

    conn = pyodbc.connect('DRIVER={ODBC Driver 13 for SQL Server};SERVER='+server+';DATABASE='+database+';UID='+username+';PWD='+ password)
    try:
        cursor = conn.cursor()
        try:
            # NOTE(review): RESULT_ID restarts at 1 on every call; verify this cannot
            # collide with rows written by an earlier run.
            for id_count, (index, row) in enumerate(script_adherance_df.iterrows(), start=1):
                cursor.execute(insert_sql, id_count, row['DOMAIN_ID'], row['SCRIPT_ID'], row['CALL_ID'], row['SCORE'], row['CREATED_DATE'], row['CREATED_BY'])
            conn.commit()
        finally:
            cursor.close()
    finally:
        # Runs on failure too, so a failed insert no longer leaks the connection
        conn.close()

### Main function

In [9]:
def main():
    '''
    Run the script adherance pipeline end to end.

    Steps:
        1. read the raw transcripts, the speaker mapping and the master script
           rows for domain 101 through fetch_data
        2. attach to every speaker-mapping row the ":"-joined Display values of
           the matching call and speaker
        3. preprocess the transcripts and score the rows whose LABEL is 'A'
        4. reshape the scores to one row per call and script and write them to the DW

    Any failure is logged together with its traceback and ends the process
    with exit status 1.
    '''
    logging.debug("Starting....")
    domain_id = 101
    created_by = "MDXC"
    try:
        logging.debug("Reading Data in progress....")
        raw_transcript_data = fetch_data("select * from [dbo].[Fact_Audio_Insights] order by [Call_ID],[StartTime];")
        agent_label_data = fetch_data("select * from [dbo].[SAAB_ML_SPEAKER_MAPPING_FT] order by Result_ID")
        adherance_data = fetch_data("select * from [dbo].[SAAB_ML_MASTER_SCRIPT_ADHERANCE_DM] where DOMAIN_ID = " + str(domain_id))
    except Exception as err:
        # exc_info=True appends the real traceback to the log record; the earlier
        # str(err.with_traceback) only logged the repr of a bound method.
        if "Could not open a connection to SQL Server" in str(err):
            logging.error("Could not connect to data warehouse : "+str(err.args), exc_info=True)
        else:
            logging.error("Error occured with database :"+str(err.args), exc_info=True)
        sys.exit(1)
    try:
        # Assign respective transcripts: positions 2 and 4 of each speaker-mapping
        # tuple are matched against Call_ID and SpeakerId of the raw transcripts.
        # The column is built in one go instead of being written cell by cell
        # while the frame is being iterated.
        agent_label_data['Transcripts'] = [
            ":".join(list(raw_transcript_data[(raw_transcript_data['Call_ID']==row[2]) & (raw_transcript_data['SpeakerId']==row[4])].Display))
            for row in agent_label_data.itertuples()
        ]

        # Preprocess transcripts
        logging.debug("Preprocessing in progress....")
        # preprocess_text expects the itertuples() position (index is position 0, hence +1);
        # looking it up replaces the hard-coded 10, which only held for one table layout.
        transcript_position = agent_label_data.columns.get_loc('Transcripts') + 1
        agent_label_data['Lemmatized_Transcript'] = preprocess_text(agent_label_data, transcript_position)

        # Scoring for script adherance; rows with another label get an empty string
        logging.debug("Script Adherance check in progress....")
        is_agent = agent_label_data['LABEL']=='A'
        agent_label_data['Adherance_Scores'] = np.where(is_agent, agent_label_data['Lemmatized_Transcript'].apply(adherance_check,adherance=adherance_data), "")

        # Preparing final script adherance dataframe: one column per script, then melted to long form
        agent_rows = agent_label_data[is_agent]
        script_ft_df = pd.concat([agent_rows[['DOMAIN_ID', 'CALL_ID']], agent_rows['Adherance_Scores'].apply(pd.Series)], axis=1)
        script_ft_df = script_ft_df.melt(id_vars=["DOMAIN_ID","CALL_ID"], var_name="SCRIPT_ID", value_name="SCORE").sort_values('CALL_ID')
        script_ft_df['CREATED_DATE'] = today_timestamp
        script_ft_df['CREATED_BY'] = created_by

        # Writing dataframe to DW
        logging.debug("Writing to DW in progress....")
        write_df_to_dw(script_ft_df)

    except Exception as err:
        logging.error("Error occured : "+str(err.args), exc_info=True)
        sys.exit(1)
    finally:
        logging.debug("End of script adherance check")

In [10]:
# Entry point: run the pipeline when this code is executed as the main program.
if __name__ == "__main__":
    main()