In [41]:
%run ConfigFile.ipynb  ##import ConfigFile as cfg
%run DB_Connection.ipynb ##import DB_Connection as dbc
import pyodbc
import copy
import math
from datetime import datetime
import logging
import time
import os
import sys

import pandas as pd
import numpy as np
import re

# Text preprocessing imports
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()

# Modelling and evaluation imports
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import KFold  
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import chi2
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [43]:
# Build a per-run identifier from the current local date and time, formatted as ddmmYYYYHHMMSS.
st = datetime.fromtimestamp(time.time()).strftime('%d%m%Y%H%M%S')
# NOTE(review): client_name is not defined in this notebook; presumably it comes from the
# `%run ConfigFile.ipynb` in the first cell - confirm.
clientname=client_name['clientname']
# Send log records to "<clientname>_<run id>.log" at DEBUG level, formatted as "time:level:message".
logging.basicConfig(filename=clientname+"_"+st+".log",level=logging.DEBUG,format="%(asctime)s:%(levelname)s:%(message)s")
# Set the root logger level explicitly to DEBUG as well.
logging.getLogger().setLevel(logging.DEBUG)
# The returned logger is not assigned to a name; as the last expression of the cell its repr is echoed.
logging.getLogger(clientname)

<Logger SAAB (DEBUG)>

### Preprocessing

In [56]:
def preprocess_text(input_dataframe,target_column_number):
    '''
    Clean and lemmatize one text column of a dataframe.

    Every cell is lower-cased, stripped of the punctuation / special characters
    ? | $ . ! # % ^ * : ; , + - _ = & and has runs of two or more whitespace
    characters collapsed to a single space. The cleaned text is then lemmatized
    with the module-level spaCy pipeline `nlp`; tokens whose lemma is the
    '-PRON-' placeholder keep their original surface form.

    Input:
        input_dataframe : the dataframe that holds the text to preprocess
        target_column_number : position of the text column inside the tuples
            produced by itertuples() (position 0 is the index, so the first
            dataframe column is 1)
    Output:
        list with one lemmatized string per row, in row order
    '''
    # The hyphen is escaped on purpose: written as `|-|` inside a character class it is
    # parsed as the range '|'..'|' and '-' itself is never removed.
    punctuation_pattern = re.compile(r'[?|$.!#%^*:;,+\-_=&]')
    extra_whitespace_pattern = re.compile(r'\s\s+')

    lemmatized_text = []
    for row in input_dataframe.itertuples():
        raw_text = row[target_column_number]
        # Missing transcripts (None / NaN) are treated as empty text instead of
        # failing on `.lower()`.
        if raw_text is None or (not isinstance(raw_text, str) and pd.isna(raw_text)):
            raw_text = ''
        # Removing special characters and punctuation marks, lower casing and collapsing extra spaces
        clean_text = extra_whitespace_pattern.sub(' ', punctuation_pattern.sub('', str(raw_text).lower()))
        # Lemmatizing the text
        lemmatized_text.append(" ".join([str(token.lemma_).replace('-PRON-',str(token)) for token in nlp(clean_text)]))
    return lemmatized_text

### Stratified sampling

In [45]:
def stratified_sampling(input_dataframe):
    '''
    Split the labelled data into train and test parts, stratified on the label.

    Input:
        input_dataframe : dataframe with a 'Lemmatized_Text' column (features)
            and a 'SpeakerId' column (labels)
    Output:
        Returns the train data, train labels, test data and test labels
        (X_train, y_train, X_test, y_test) as pandas Series
    '''
    X = input_dataframe['Lemmatized_Text']
    y = input_dataframe['SpeakerId']
    sss = StratifiedShuffleSplit(n_splits=10,test_size=0.2,random_state=32)

    # Only one 80/20 split is used. The last of the ten generated splits is kept
    # so that results stay identical to what this function has always returned.
    train_index, test_index = list(sss.split(X, y))[-1]

    # split() yields positional indices, so they must be applied with .iloc;
    # plain X[...] is label-based and picks wrong rows (or raises) when the
    # dataframe index is not the default 0..n-1 range.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    return  X_train, y_train, X_test, y_test

### Feature selection

In [46]:
def feature_selection(X_train,y_train,no_of_features,chi_square_threshold):
    '''
    Select n-gram features by their chi-square score against each class.

    The training text is vectorized into binary 2- and 3-gram counts and
    TF-IDF transformed. For every class (one-vs-rest) the chi-square score of
    each n-gram is computed; n-grams scoring above the threshold are kept, capped
    at the `no_of_features` highest-scoring ones per class. The per-class
    selections are merged, keeping the highest score seen for each n-gram.

    Input:
        X_train : train data (iterable of text documents)
        y_train : train labels
        no_of_features : maximum number of features kept per class
        chi_square_threshold : a feature is kept only if its chi-square score
            is strictly greater than this value
    Output:
        Dataframe with columns 'Features' and 'Chisquare' (best score per feature)
    '''
    # Create dummies for classes
    y_train_dummies = pd.get_dummies(y_train)

    # Convert to token counts, then to TF-IDF weights
    count_vec = CountVectorizer(ngram_range=(2,3),binary=True)
    X_train_count = count_vec.fit_transform(X_train)
    X_train_transformed = TfidfTransformer().fit_transform(X_train_count)

    # The vocabulary does not depend on the class, so it is read once.
    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement when present.
    if hasattr(count_vec, 'get_feature_names_out'):
        features = list(count_vec.get_feature_names_out())
    else:
        features = count_vec.get_feature_names()

    features_final = []
    chi_scores_final = []
    for col in y_train_dummies.columns:
        # chi2 returns (scores, p-values); one call provides both
        chi_score, chi_prob = chi2(X_train_transformed, y_train_dummies[col])
        chi_table = pd.DataFrame({'Features':features, 'Chisquare':chi_score , 'Chi_Square_Prob':chi_prob})
        chi_table_cutoff = chi_table.loc[(chi_table["Chisquare"] > chi_square_threshold)]
        if len(chi_table_cutoff)>no_of_features:
            chi_table = chi_table_cutoff.sort_values(by='Chisquare', ascending=False).head(no_of_features)
        else:
            chi_table = chi_table_cutoff
        features_final.append(chi_table['Features'].tolist())
        chi_scores_final.append(chi_table['Chisquare'].tolist())

    # Flatten the per-class selections and keep the best score of each feature
    features_all = [item for sublist in features_final for item in sublist]
    chi_values_all = [item for sublist in chi_scores_final for item in sublist]
    features_all_table = pd.DataFrame({'Features':features_all , 'Chisquare':chi_values_all})
    features_best_chi = pd.DataFrame(features_all_table.groupby(['Features'], as_index=False, sort=False)['Chisquare'].max())

    return  features_best_chi

### Find best k features

In [47]:
def find_best_k_features(features_best_chi,X_train,y_train,X_test,y_test):
    '''
    Find how many of the top chi-square features give the best weighted F1 score.

    For k = 1 .. number of available features, a pipeline restricted to the k
    highest-scoring features is trained on the train data and scored on the
    test data. The k with the highest weighted F1 score is returned; on ties
    the smallest such k wins.

    Input:
        features_best_chi : dataframe with 'Features' and 'Chisquare' columns
        X_train : train data
        y_train : train labels
        X_test : test data
        y_test : test labels

    Output:
        Return the optimal number of features to consider (int)

    Raises:
        ValueError : if features_best_chi contains no features
    '''
    # NOTE(review): k is tuned on the same test split that is later used to report
    # accuracy, so the reported scores are optimistic - consider a validation split.
    k_val = int(features_best_chi.shape[0])
    if k_val < 1:
        raise ValueError("features_best_chi is empty; there are no features to evaluate")

    # The ranking is the same for every k, so sort once outside the loop
    ranked_features = features_best_chi.sort_values('Chisquare', ascending=False)

    # k_val itself is included so that "use every selected feature" is also
    # evaluated, and so that a single available feature does not leave the
    # score list empty.
    kvals_list = list(range(1, k_val + 1))
    scores_f1 = []
    for k in kvals_list:
        features_table = ranked_features.head(k)
        # Bi gram / tri gram vocabulary restricted to the top-k features
        model = Pipeline([('vect', CountVectorizer(vocabulary=features_table.Features,ngram_range=(2,3))),
                         ('tfidf', TfidfTransformer()),
                         ('clf',  CalibratedClassifierCV(LinearSVC(penalty="l2", dual=False, tol=1e-3),cv=KFold(n_splits=3))),])
        # Training the Model
        model.fit(X_train, y_train)
        # Scoring the Model
        predicted = model.predict(X_test)
        scores_f1.append(f1_score(y_test, predicted, average='weighted'))

    # np.argmax returns the first maximum, i.e. the smallest k among equal best scores
    kval_optimum = int(kvals_list[int(np.argmax(scores_f1))])

    return kval_optimum

### Training

In [48]:
def train_model(features_table_chi_top,X_train,y_train):
    '''
    Fit the final classification pipeline on the training data.

    The pipeline counts 2- and 3-grams restricted to the selected vocabulary,
    applies TF-IDF weighting and classifies with a linear SVM wrapped in a
    calibrated classifier (3-fold).

    Input:
        features_table_chi_top : top best features selected (its 'Features'
            column is used as the vectorizer vocabulary)
        X_train : train data
        y_train : train labels

    Output:
        Returns the trained model (the fitted pipeline)
    '''
    vectorizer = CountVectorizer(vocabulary=features_table_chi_top.Features,ngram_range=(2,3))
    linear_svm = LinearSVC(penalty="l2", dual=False,tol=1e-3)
    # Using calibrated classifier on top of the linear SVM
    calibrated_svm = CalibratedClassifierCV(linear_svm,cv=KFold(n_splits=3))

    pipeline = Pipeline([
        ('vect', vectorizer),
        ('tfidf', TfidfTransformer()),
        ('clf', calibrated_svm),
    ])

    # Pipeline.fit returns the fitted pipeline itself
    return pipeline.fit(X_train, y_train)

In [49]:
def get_speaker_A_count(section_spk_list,pred_labels_list):
    '''
    Count, per speaker id, how many turns were predicted as agent ('A').

    Input :
        section_spk_list : speaker id of each turn
        pred_labels_list : predicted label of each turn, aligned with section_spk_list
    Output :
        Dictionary with keys 'Spk_2_A_Count' and 'Spk_1_A_Count'
    '''
    turns = list(zip(section_spk_list,pred_labels_list))
    return {
        'Spk_2_A_Count': sum(1 for speaker, label in turns if speaker == 2 and label == 'A'),
        'Spk_1_A_Count': sum(1 for speaker, label in turns if speaker == 1 and label == 'A'),
    }

def get_speaker_C_count(section_spk_list,pred_labels_list):
    '''
    Count, per speaker id, how many turns were predicted as customer ('C').

    Input :
        section_spk_list : speaker id of each turn
        pred_labels_list : predicted label of each turn, aligned with section_spk_list
    Output :
        Dictionary with keys 'Spk_2_C_Count' and 'Spk_1_C_Count'
    '''
    turns = list(zip(section_spk_list,pred_labels_list))
    return {
        'Spk_2_C_Count': sum(1 for speaker, label in turns if speaker == 2 and label == 'C'),
        'Spk_1_C_Count': sum(1 for speaker, label in turns if speaker == 1 and label == 'C'),
    }

In [50]:
def assign_speaker_by_section(agent_count_dict,cust_count_dict):
    '''
    Decide which speaker id (1 or 2) is the agent and which is the customer
    from the per-speaker counts of agent and customer labels in one section.

    Input :
        agent_count_dict : Dictionary with counts of the speakers classified as Agent
            (keys 'Spk_1_A_Count', 'Spk_2_A_Count')
        cust_count_dict : Dictionary with counts of the speakers classified as Customer
            (keys 'Spk_1_C_Count', 'Spk_2_C_Count')
    Output :
        Tuple (agent speaker id, customer speaker id); both are np.nan when the
        counts do not favour either assignment
    '''
    spk1_agent = agent_count_dict['Spk_1_A_Count']
    spk2_agent = agent_count_dict['Spk_2_A_Count']
    spk1_cust = cust_count_dict['Spk_1_C_Count']
    spk2_cust = cust_count_dict['Spk_2_C_Count']

    speaker_1_is_agent = (1, 2)
    speaker_2_is_agent = (2, 1)
    undecided = (np.nan, np.nan)

    if spk1_agent > spk1_cust:
        # Speaker 1 looks like the agent
        if spk2_cust > spk2_agent or spk2_cust == spk2_agent:
            return speaker_1_is_agent
        if spk2_cust < spk2_agent:
            # Both speakers look like the agent: the one with more agent turns wins
            if spk2_agent > spk1_agent:
                return speaker_2_is_agent
            if spk2_agent < spk1_agent:
                return speaker_1_is_agent
        return undecided

    if spk1_agent == spk1_cust:
        # Speaker 1 is ambiguous: let speaker 2 decide
        if spk2_cust > spk2_agent:
            return speaker_1_is_agent
        if spk2_cust < spk2_agent:
            return speaker_2_is_agent
        return undecided

    if spk1_agent < spk1_cust:
        # Speaker 1 looks like the customer
        if spk2_cust > spk2_agent:
            # Both speakers look like the customer: the one with more customer turns wins
            if spk2_cust > spk1_cust:
                return speaker_1_is_agent
            if spk2_cust < spk1_cust:
                return speaker_2_is_agent
            return undecided
        if spk2_cust == spk2_agent or spk2_cust < spk2_agent:
            return speaker_2_is_agent

    return undecided

In [59]:
def _agent_id_for_section(section_speakers, section_labels):
    '''Return the agent speaker id (or np.nan) decided from one section's speakers and predicted labels.'''
    agent_id, _cust_id = assign_speaker_by_section(
        get_speaker_A_count(section_speakers, section_labels),
        get_speaker_C_count(section_speakers, section_labels))
    return agent_id


def get_call_agent_speaker_id(number_of_turns_considered,azure_data,azure_distict_call_ids,trained_model):
    '''
    Predict a label for every turn of each call and decide which speaker id is the agent.

    Each call is cut into an intro section (first N turns), a closure section
    (last N turns) and a mid section (everything in between). The intro and
    closure decisions are compared; the mid section is used as a tie breaker
    or as a fallback when neither is conclusive.

    Input :
        number_of_turns_considered : N, the number of turns in the intro and closure sections
        azure_data : unseen data with columns 'Call_ID', 'SpeakerId', 'StartTime',
            'Display' and 'Lemmatized_Text'
        azure_distict_call_ids : call ids to process
        trained_model : trained model exposing predict()
    Output :
        Returns dictionary of call ids and the respective agent speaker id;
        the value is np.nan for calls with fewer than 9 turns and for calls
        where no section is conclusive
    '''
    call_agent_ids = {}
    for call_id in azure_distict_call_ids:
        # Reset for every call. Without this a call with fewer than 9 turns would
        # either hit an unassigned variable (first call) or silently inherit the
        # agent id decided for the previous call.
        agent_spk_id = np.nan

        temp_data = azure_data[azure_data['Call_ID']==call_id][['Call_ID','SpeakerId','StartTime','Display','Lemmatized_Text']]

        # Only calls with 9 or more turns are evaluated
        if len(temp_data) >= 9:
            # Prediction is only needed for calls that are actually evaluated
            labels = list(trained_model.predict(temp_data['Lemmatized_Text']))
            speakers = list(temp_data.SpeakerId)

            intro_agent_id = _agent_id_for_section(
                speakers[:number_of_turns_considered], labels[:number_of_turns_considered])
            closure_agent_id = _agent_id_for_section(
                speakers[-(number_of_turns_considered):], labels[-(number_of_turns_considered):])

            intro_known = not math.isnan(intro_agent_id)
            closure_known = not math.isnan(closure_agent_id)

            # First check intro and closure section for majority count of the speaker ids and then tie breaker using the mid section
            if intro_known and closure_known and intro_agent_id == closure_agent_id:
                agent_spk_id = intro_agent_id
            elif intro_known and not closure_known:
                agent_spk_id = intro_agent_id
            elif closure_known and not intro_known:
                agent_spk_id = closure_agent_id
            else:
                # Intro and closure disagree, or neither is conclusive: consult the mid section
                mid_agent_id = _agent_id_for_section(
                    speakers[number_of_turns_considered:-(number_of_turns_considered)],
                    labels[number_of_turns_considered:-(number_of_turns_considered)])
                if not math.isnan(mid_agent_id):
                    agent_spk_id = mid_agent_id
                elif intro_known:
                    # Disagreement that the mid section cannot settle: trust the intro
                    agent_spk_id = intro_agent_id

        call_agent_ids[call_id] = agent_spk_id
    return call_agent_ids

### Write to DW

In [52]:
def write_df_to_dw(call_speaker_labels_df):
    '''
    Function to write the dataframe to DW

    Inserts one row per dataframe row into [dbo].[SAAB_ML_SPEAKER_MAPPING_FT]
    and commits once after all rows were inserted. RESULT_ID is a running
    number starting at 1 for every invocation.

    Connection settings can be overridden with the environment variables
    SAAB_DW_SERVER, SAAB_DW_DATABASE, SAAB_DW_USERNAME and SAAB_DW_PASSWORD.

    Input :
        call_speaker_labels_df : dataframe that consists of speaker labels for each call
            (columns 'Call_ID', 'SpeakerId', 'Labels')
    '''
    today_timestamp = datetime.now()

    # SECURITY / FIXME: credentials must not live in source control. The literals are
    # kept only as fallbacks so existing runs keep working; set the environment
    # variables and remove the defaults (and rotate the password).
    server = os.environ.get('SAAB_DW_SERVER', 'saab-server-resource.database.windows.net')
    database = os.environ.get('SAAB_DW_DATABASE', 'SAAB_DW_Resource')
    username = os.environ.get('SAAB_DW_USERNAME', 'saabadmin')
    password = os.environ.get('SAAB_DW_PASSWORD', 'p@$$w0rd')

    insert_sql = "INSERT INTO [dbo].[SAAB_ML_SPEAKER_MAPPING_FT]([RESULT_ID],[CALL_ID],[DOMAIN_ID],[SPEAKER_ID],[LABEL],[CREATED_DATE],[CREATED_BY]) values (?, ?, ?, ?, ?, ?, ?)"

    conn = pyodbc.connect('DRIVER={ODBC Driver 13 for SQL Server};SERVER='+server+';DATABASE='+database+';UID='+username+';PWD='+ password)
    # try/finally so the cursor and connection are released even when an insert fails;
    # nothing is committed in that case because commit() is only reached after the loop.
    try:
        cursor = conn.cursor()
        try:
            for id_count, (index, row) in enumerate(call_speaker_labels_df.iterrows(), start=1):
                cursor.execute(insert_sql, id_count, row['Call_ID'],'101', row['SpeakerId'], row['Labels'], today_timestamp,'Mrinalini')
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()

### Main function

In [60]:
def main():
    '''
    Run the whole speaker identification pipeline.

    Steps: read the manually labelled transcripts and the unseen transcripts,
    preprocess the text, split train/test, select features, train the model,
    score it on the test split, predict the agent speaker of every unseen call
    and write the resulting speaker labels to the data warehouse.

    Any failure is logged together with its traceback and the process exits
    with status 1.
    '''
    logging.debug("Starting....")
    #Reading data from local system and azure data
    try:
        manual_trasncript_data = pd.read_excel("EnglishManualTranscriptDF.xlsx",sheet_name="Sheet3")
        azure_data = fetch_data("select * from [dbo].[Fact_Audio_Insights] order by [Call_ID],[StartTime];")
    except Exception as err:
        # logging.exception appends the real traceback of the active exception
        # (str(err.with_traceback) only logged the repr of a bound method).
        if "Could not open a connection to SQL Server" in str(err):
            logging.exception("Could not connect to data warehouse : "+str(err.args))
        else:
            logging.exception("Error occured with database :"+str(err.args))
        sys.exit(1)

    try:
        # Replacing every lowercase "c" label with the uppercase label; a boolean mask
        # also works when there is no such row (indexing .index[0] raised IndexError then).
        lowercase_customer_rows = manual_trasncript_data['SpeakerId'] == "c"
        manual_trasncript_data.loc[lowercase_customer_rows, 'SpeakerId'] = 'C'

        # Preprocess text (3 = itertuples position of the transcript column)
        logging.debug("Preprocessing in progress....")
        lemmatized_text = preprocess_text(manual_trasncript_data,3)
        manual_trasncript_data['Lemmatized_Text'] = lemmatized_text

        #Stratified Sampling
        logging.debug("Stratified sampling in progress....")
        X_train, y_train, X_test, y_test = stratified_sampling(manual_trasncript_data)

        # Feature selection (at most 1000 features per class, chi-square score above 1)
        logging.debug("Finding best features in progress....")
        features_best_chi = feature_selection(X_train,y_train,1000,1)

        # Finding best K-features
        kval_optimum = find_best_k_features(features_best_chi,X_train, y_train, X_test, y_test)

        #Training data with optimal features
        logging.debug("Training in progress....")
        features_table_chi_top = features_best_chi.sort_values('Chisquare', ascending=False).head(kval_optimum)
        final_trained_model = train_model(features_table_chi_top,X_train, y_train)

        # Predict on test and report the scores (they were computed but never reported before)
        logging.debug("Prediction and scoring in progress....")
        predicted_labels = final_trained_model.predict(X_test)
        prec_recall_fscore_support = precision_recall_fscore_support(y_test, predicted_labels)
        accuracy = accuracy_score(y_test,predicted_labels)
        logging.debug("Test accuracy : %s", accuracy)
        logging.debug("Test precision/recall/fscore/support per class : %s", prec_recall_fscore_support)

        # Prediction on the Azure Transcripts - Unseen Data (7 = itertuples position of the text column)
        logging.debug("Prediction on unseen data in progress....")
        azure_lemmatized_text = preprocess_text(azure_data,7)
        azure_data['Lemmatized_Text'] = azure_lemmatized_text
        azure_distict_call_ids = list(azure_data['Call_ID'].unique())

        call_agent_ids = get_call_agent_speaker_id(5,azure_data,azure_distict_call_ids,final_trained_model)
        copy_azure_data = azure_data.copy()
        copy_azure_data['Agent_ID'] = copy_azure_data['Call_ID'].map(call_agent_ids)
        # NOTE(review): when no agent could be determined for a call (Agent_ID is NaN)
        # the comparison is False for every row, so all its speakers are labelled 'C'.
        copy_azure_data['Labels'] = np.where(copy_azure_data['SpeakerId'] == copy_azure_data['Agent_ID'],'A','C')
        # One row per (call, speaker, label)
        call_speaker_labels_df = copy_azure_data[['Call_ID','SpeakerId','Labels','StartTime']].groupby(['Call_ID','SpeakerId','Labels']).count().reset_index().drop('StartTime',axis=1)

        logging.debug("Writing to DW in progress....")
        write_df_to_dw(call_speaker_labels_df)

    except Exception as err:
        logging.exception("Error occured : "+str(err.args))
        sys.exit(1)
    finally:
        logging.debug("End of identification")

In [61]:
# Entry point: run the pipeline when this code is executed as the __main__ module.
if __name__ == "__main__":
    main()

DEBUG - Starting....
DEBUG - Preprocessing in progress....
DEBUG - Stratified sampling in progress....
DEBUG - Finding best features in progress....
DEBUG - Training in progress....
DEBUG - Prediction and scoring in progress....
DEBUG - Prediction on unseen data in progress....
DEBUG - Writing to DW in progress....
