In [1]:
#import packages
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import random, re
import operator
import pickle
import locale


locale.setlocale( locale.LC_ALL, '' )


'C/UTF-8/C/C/C/C'

In [20]:
def plot_cost_benefit(data, Current_spent, show_best=False):
    """
    Plot cost benefit analysis against proportion of customers contacted (pred=1)
    """
    utility = [0] + list(data.saving)#/ 100
    
    NUM_SAMPLES=len(data)

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.grid('on')
    x_axis = np.linspace(0, 100, NUM_SAMPLES + 1)
    ax.fill_between(x_axis, utility, facecolor='navy', alpha=.7, 
                    label='Expected utility')
    ax.plot(x_axis, utility, color='navy', alpha=.7)

    ax.set_xlim([0, 100])
    ax.set_xticks(np.arange(0, 100, 10.0))
    
    #ax.set_ylim([(Current_spent + (Current_spent*0.25)), 0])
    #ax.spines['left'].set_position(('data', 0))
    #ax.spines['bottom'].set_position(('data', Current_spent))
    #ax.xaxis.labelpad = 80
    #ax.set_ylabel('Cost  [ £k ]')  
    
    ax.set_xlabel('% of Automated Queries')
       
    ax.set_ylabel('% of cost saved')
    
    ax.legend()
    ax.grid(True)
    plt.title('Utility Cost Benefit Analysis')
    plt.tight_layout()
    plt.show()
    #plt.savefig('figures/cost_benefit_analysis.png', bbox_inches='tight')
    if show_best:
        lis = [(x_axis[i], utility[i]) for i in range(len(x_axis))]
        lis = lis[1:]
        best = max(lis, key=lambda x: x[1])
        
        utility_number = [0] + list(data.cumul_cost)
        lis2 = [(x_axis[i], utility_number[i]) for i in range(len(x_axis))]
        lis2 = lis2[1:]
        #print(lis2)
        #print(type(lis2))
        best2 = max(lis2, key=lambda x: x[1])
        max_save_rate = round(best[1],2)
        max_aut_queries = best[0]
        amount_saved = best[1]*(-Current_spent)/100
        print(f'Maximum saving rate is {round(best[1],2)}% if {best[0]:.2f}% of queries is automated')
        print(f'This accounts for {locale.currency(best[1]*(-Current_spent)/100)} saved')
        #print(f'This accounts for {locale.currency(best2[1])} saved')
        return max_save_rate, max_aut_queries, amount_saved

def cost_benefit_analysis(y_true, y_pred,
        perc_contacted, 
        total_applicable_customers,
        cost_of_reviewing_query,
        cost_of_false_negative, 
        do_eval=False,
        plot_utility=False,
        show_best=False):
    """Run the evaluate function only on the contacted subset.
    Clearly there will be no TN or FN values. Is this correct?
    
    Returns
    -------
    reward - of the cost model at a threshold for these probas
    data - dataframe with cumulative cost and values sorted by probas
    """
    
    if type(cost_of_reviewing_query) not in (int, float):
        raise TypeError(f'cost_of_reviewing_query {cost_of_reviewing_query} not valid. Please input a number.')  
    elif cost_of_reviewing_query <0:
        raise InputError(f'cost_of_reviewing_query{cost_of_reviewing_query} less than 0. Please input 0 or a positive number.') 
  
    if type(cost_of_false_negative) not in (int, float):
        raise TypeError(f'cost_of_false_negative {cost_of_false_negative} not valid. Please input a number.')
    elif cost_of_false_negative <0:
        raise InputError(f'cost_of_false_negative{cost_of_false_negative} less than 0. Please input 0 or a positive number.') 
 
    if (perc_contacted <0 or perc_contacted>100):
        raise PercentageError(f"Percentage value {perc_contacted} outside of the limits. Please input a number between 0 and 100.") 
        
    if total_applicable_customers <=0:
        raise InputError(f"Number of total customers cannot be less than 0 or 0. Current number: {total_applicable_customers}")
       
    
    #print('exceptions run')
    VALUE_TRUE_POSITIVE = cost_of_reviewing_query
    VALUE_FALSE_POSITIVE = (-cost_of_reviewing_query)
    VALUE_TRUE_NEGATIVE = (-cost_of_reviewing_query)
    VALUE_FALSE_NEGATIVE = (-cost_of_false_negative)
    
    Current_spent = VALUE_FALSE_POSITIVE*total_applicable_customers
    Current_spent *= total_applicable_customers / len(y_true)
    #print(f'Current spending in pounds: {locale.currency(abs(Current_spent))}')
    #print('Current_spent')
    
    k = int(len(y_true)*perc_contacted / 100)
    a = list(zip(y_pred,y_true))
    a.sort(key=lambda x: x[0], reverse=True)
    
    true_positives = sum(map(lambda x: x[1], a[:k]))
    false_positives = k - true_positives
    false_negatives = sum(map(lambda x: x[1], a[k:]))
    true_negatives = len(y_true) - k - false_negatives
        
    k = int(perc_contacted*len(y_pred) /100)
    a = list(zip(y_pred,y_true.values))
    a.sort(key=lambda x: x[0], reverse=True)
    data = pd.DataFrame(a, columns=['pred_proba', 'true'])
    
    TPFPTNFN = pd.DataFrame(columns=['TP','FP','TN','FN'])
    
    for i in range(len(a)):
        if (round(a[i][0])==1 and a[i][1]==1):
            TPFPTNFN = TPFPTNFN.append({'TP': 1}, ignore_index=True)
        elif (round(a[i][0])==0 and a[i][1]==0):
                TPFPTNFN = TPFPTNFN.append({'TN': 1}, ignore_index=True)
        elif (round(a[i][0])==1 and a[i][1]==0):
                TPFPTNFN = TPFPTNFN.append({'FP': 1}, ignore_index=True)
        elif (round(a[i][0])==0 and a[i][1]==1):
                TPFPTNFN = TPFPTNFN.append({'FN': 1}, ignore_index=True)
                
    TPFPTNFN = TPFPTNFN.fillna(0, inplace=False)
    
    data['cost'] = TPFPTNFN.apply(lambda x: VALUE_TRUE_POSITIVE if x['TP'].astype(int) \
                                     else (VALUE_FALSE_POSITIVE if x['FP'].astype(int) \
                                      else (VALUE_TRUE_NEGATIVE if x['TN'].astype(int) \
                                      else VALUE_FALSE_NEGATIVE)), axis=1)
    
    data['cost'] *= total_applicable_customers / len(y_true)
    data['cumul_cost'] = data.cost.cumsum()
   
  # adj = list()
  # for i in range(len(y_true)):
  #     adj.append(data['cumul_cost'][i] - abs(Current_spent * (1-i/len(y_true))))
  # data['adj'] = adj 
   
   #data['saving'] = (1-data['adj']/Current_spent) * 100
    data['saving'] = data['cumul_cost']/abs(Current_spent) * 100
#    
#    if perc_contacted == 0:
#        print("Gain by automating %d%% of the queries in Pounds: %s" % (perc_contacted, locale.currency(Current_spent)))
#    else:
#        perc_idx = int(perc_contacted*len(data)/100)
#        if perc_idx == 0:
#            perc_idx = 1
#        #print(f'perc_idx={perc_idx}')
#        print("Savings by automating %d%% of the queries in Pounds: %s" 
#              % (perc_contacted,locale.currency(data['cumul_cost'][perc_idx-1])))  
#         
#    if do_eval:
#        evaluate( None, data['true'], None, 
#             preds=[1] * k + [0] * (len(data) - k), 
#             probas=list(data['pred_proba'].values))
    
    return data, a, TPFPTNFN

    
#plot_cost_benefit(data, Current_spent, show_best)

In [9]:
#import train and test data sets:
X_train_T = pd.read_csv('X_test.csv')
X_train_T = X_train_T.drop(["Unnamed: 0"],axis = 1)

X_test_T = pd.read_csv('X_test.csv')
X_test_T = X_test_T.drop(["Unnamed: 0"],axis = 1)
y_test_T = pd.read_csv('y_test.csv', header=None)[1]

Textmodel = pickle.load(open('textBased_model.sav', 'rb'))

In [15]:
probas = Textmodel.predict_proba(X_test_T)[:,1]

#Choose percentage to automate:
PERCENT_TO_AUTOMATE = 10

#State cost of reviewing queries and cost of missing an escalation:
cost_of_reviewing_query = 5 #This part can be modified
cost_of_false_negative = 10 # This part can be modified

#Total number of queries reviewed:
total_no_queries = 45000

In [21]:
data, a, TPFPTNFN = cost_benefit_analysis(y_test_T,
                                             probas,
                                             perc_contacted=PERCENT_TO_AUTOMATE,
                                             total_applicable_customers=total_no_queries,
                                             cost_of_reviewing_query = cost_of_reviewing_query,
                                             cost_of_false_negative = cost_of_false_negative,
                                             plot_utility=True, 
                                             show_best=True
                                              )

In [22]:
data

Unnamed: 0,pred_proba,true,cost,cumul_cost,saving
0,0.990250,1,34.997667,34.997667,0.002222
1,0.988346,1,34.997667,69.995334,0.004444
2,0.983483,1,34.997667,104.993000,0.006667
3,0.983230,1,34.997667,139.990667,0.008889
4,0.982104,1,34.997667,174.988334,0.011111
5,0.980804,1,34.997667,209.986001,0.013333
6,0.977981,1,34.997667,244.983668,0.015556
7,0.977201,1,34.997667,279.981335,0.017778
8,0.976648,1,34.997667,314.979001,0.020000
9,0.976244,1,34.997667,349.976668,0.022222


In [23]:
utility = [0] + list(data.saving)#/ 100

NUM_SAMPLES=len(data)
x_axis = np.linspace(0, 100, NUM_SAMPLES + 1)

In [27]:
df = pd.DataFrame(
{'x_axis': x_axis,
 'utility': utility
})
df

Unnamed: 0,x_axis,utility
0,0.000000,0.000000
1,0.015555,0.002222
2,0.031109,0.004444
3,0.046664,0.006667
4,0.062218,0.008889
5,0.077773,0.011111
6,0.093327,0.013333
7,0.108882,0.015556
8,0.124436,0.017778
9,0.139991,0.020000


In [41]:
df = df[df.x_axis.between(0,1)]

df.loc[:,'x_axis']

0     0.000000
1     0.015555
2     0.031109
3     0.046664
4     0.062218
5     0.077773
6     0.093327
7     0.108882
8     0.124436
9     0.139991
10    0.155545
11    0.171100
12    0.186654
13    0.202209
14    0.217763
15    0.233318
16    0.248872
17    0.264427
18    0.279981
19    0.295536
20    0.311090
21    0.326645
22    0.342199
23    0.357754
24    0.373308
25    0.388863
26    0.404417
27    0.419972
28    0.435527
29    0.451081
        ...   
35    0.544408
36    0.559963
37    0.575517
38    0.591072
39    0.606626
40    0.622181
41    0.637735
42    0.653290
43    0.668844
44    0.684399
45    0.699953
46    0.715508
47    0.731062
48    0.746617
49    0.762171
50    0.777726
51    0.793280
52    0.808835
53    0.824389
54    0.839944
55    0.855499
56    0.871053
57    0.886608
58    0.902162
59    0.917717
60    0.933271
61    0.948826
62    0.964380
63    0.979935
64    0.995489
Name: x_axis, Length: 65, dtype: float64

In [None]:

lis = [(x_axis[i], utility[i]) for i in range(len(x_axis))]
lis = lis[1:]
best = max(lis, key=lambda x: x[1])

utility_number = [0] + list(data.cumul_cost)
lis2 = [(x_axis[i], utility_number[i]) for i in range(len(x_axis))]
lis2 = lis2[1:]

best2 = max(lis2, key=lambda x: x[1])
max_save_rate = round(best[1],2)
max_aut_queries = best[0]
amount_saved = best[1]*(-Current_spent)/100
print(f'Maximum saving rate is {round(best[1],2)}% if {best[0]:.2f}% of queries is automated')
print(f'This accounts for {locale.currency(best[1]*(-Current_spent)/100)} saved')