In [3]:
#import packages
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import random, re
import operator
import pickle
import locale


locale.setlocale( locale.LC_ALL, '' )


'en_US.UTF-8'

In [14]:
def plot_cost_benefit(data, Current_spent, show_best=False):
    """
    Plot cost benefit analysis against proportion of customers contacted (pred=1)
    """
    utility = [0] + list(data.saving)#/ 100
    
    NUM_SAMPLES=len(data)

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.grid('on')
    x_axis = np.linspace(0, 100, NUM_SAMPLES + 1)
    ax.fill_between(x_axis, utility, facecolor='navy', alpha=.7, 
                    label='Expected utility')
    ax.plot(x_axis, utility, color='navy', alpha=.7)

    ax.set_xlim([0, 100])
    ax.set_xticks(np.arange(0, 100, 10.0))
    
    #ax.set_ylim([(Current_spent + (Current_spent*0.25)), 0])
    #ax.spines['left'].set_position(('data', 0))
    #ax.spines['bottom'].set_position(('data', Current_spent))
    #ax.xaxis.labelpad = 80
    #ax.set_ylabel('Cost  [ £k ]')  
    
    ax.set_xlabel('% of Automated Queries')
       
    ax.set_ylabel('% of cost saved')
    
    ax.legend()
    ax.grid(True)
    plt.title('Utility Cost Benefit Analysis')
    plt.tight_layout()
    plt.show()
    #plt.savefig('figures/cost_benefit_analysis.png', bbox_inches='tight')
    if show_best:
        lis = [(x_axis[i], utility[i]) for i in range(len(x_axis))]
        lis = lis[1:]
        best = max(lis, key=lambda x: x[1])
        
        utility_number = [0] + list(data.cumul_cost)
        lis2 = [(x_axis[i], utility_number[i]) for i in range(len(x_axis))]
        lis2 = lis2[1:]
        #print(lis2)
        #print(type(lis2))
        best2 = max(lis2, key=lambda x: x[1])
        max_save_rate = round(best[1],2)
        max_aut_queries = best[0]
        amount_saved = best[1]*(-Current_spent)/100
        print(f'Maximum saving rate is {round(best[1],2)}% if {best[0]:.2f}% of queries is automated')
        print(f'This accounts for {locale.currency(best[1]*(-Current_spent)/100)} saved')
        #print(f'This accounts for {locale.currency(best2[1])} saved')
        return max_save_rate, max_aut_queries, amount_saved

def cost_benefit_analysis(y_true, y_pred,
        perc_contacted, 
        total_applicable_customers,
        cost_of_reviewing_query,
        cost_of_false_negative, 
        do_eval=False,
        plot_utility=False,
        show_best=False):
    """Run the evaluate function only on the contacted subset.
    Clearly there will be no TN or FN values. Is this correct?
    
    Returns
    -------
    reward - of the cost model at a threshold for these probas
    data - dataframe with cumulative cost and values sorted by probas
    """
    
    if type(cost_of_reviewing_query) not in (int, float):
        raise TypeError(f'cost_of_reviewing_query {cost_of_reviewing_query} not valid. Please input a number.')  
    elif cost_of_reviewing_query <0:
        raise InputError(f'cost_of_reviewing_query{cost_of_reviewing_query} less than 0. Please input 0 or a positive number.') 
  
    if type(cost_of_false_negative) not in (int, float):
        raise TypeError(f'cost_of_false_negative {cost_of_false_negative} not valid. Please input a number.')
    elif cost_of_false_negative <0:
        raise InputError(f'cost_of_false_negative{cost_of_false_negative} less than 0. Please input 0 or a positive number.') 
 
    if (perc_contacted <0 or perc_contacted>100):
        raise PercentageError(f"Percentage value {perc_contacted} outside of the limits. Please input a number between 0 and 100.") 
        
    if total_applicable_customers <=0:
        raise InputError(f"Number of total customers cannot be less than 0 or 0. Current number: {total_applicable_customers}")
       
    
    #print('exceptions run')
    VALUE_TRUE_POSITIVE = cost_of_reviewing_query
    VALUE_FALSE_POSITIVE = (-cost_of_reviewing_query)
    VALUE_TRUE_NEGATIVE = (-cost_of_reviewing_query)
    VALUE_FALSE_NEGATIVE = (-cost_of_false_negative)
    
    Current_spent = VALUE_FALSE_POSITIVE*total_applicable_customers
    Current_spent *= total_applicable_customers / len(y_true)
    print(f'Current spending in pounds: {locale.currency(abs(Current_spent))}')
    #print('Current_spent')
    
    k = int(len(y_true)*perc_contacted / 100)
    a = list(zip(y_pred,y_true))
    a.sort(key=lambda x: x[0], reverse=True)
    
    true_positives = sum(map(lambda x: x[1], a[:k]))
    false_positives = k - true_positives
    false_negatives = sum(map(lambda x: x[1], a[k:]))
    true_negatives = len(y_true) - k - false_negatives
        
    k = int(perc_contacted*len(y_pred) /100)
    a = list(zip(y_pred,y_true.values))
    a.sort(key=lambda x: x[0], reverse=True)
    data = pd.DataFrame(a, columns=['pred_proba', 'true'])
    
    TPFPTNFN = pd.DataFrame(columns=['TP','FP','TN','FN'])
    
#    for i in range(len(a)):
#        if (round(a[i][0])==1 and a[i][1]==1):
#            TPFPTNFN = TPFPTNFN.append({'TP': 1}, ignore_index=True)
#        elif (round(a[i][0])==0 and a[i][1]==0):
#                TPFPTNFN = TPFPTNFN.append({'TN': 1}, ignore_index=True)
#        elif (round(a[i][0])==1 and a[i][1]==0):
#                TPFPTNFN = TPFPTNFN.append({'FP': 1}, ignore_index=True)
#        elif (round(a[i][0])==0 and a[i][1]==1):
#                TPFPTNFN = TPFPTNFN.append({'FN': 1}, ignore_index=True)
#                
#    TPFPTNFN = TPFPTNFN.fillna(0, inplace=False)
#    
#    data['cost'] = TPFPTNFN.apply(lambda x: VALUE_TRUE_POSITIVE if x['TP'].astype(int) \
#                                     else (VALUE_FALSE_POSITIVE if x['FP'].astype(int) \
#                                      else (VALUE_TRUE_NEGATIVE if x['TN'].astype(int) \
#                                      else VALUE_FALSE_NEGATIVE)), axis=1)
#    
#    data['cost'] *= total_applicable_customers / len(y_true)
#    data['cumul_cost'] = data.cost.cumsum()
    
   # adj = list()
   # for i in range(len(y_true)):
   #     adj.append(data['cumul_cost'][i] - abs(Current_spent * (1-i/len(y_true))))
   # data['adj'] = adj 
    
    #data['saving'] = (1-data['adj']/Current_spent) * 100
#    data['saving'] = data['cumul_cost']/abs(Current_spent) * 100
#    
#    if perc_contacted == 0:
#        print("Gain by automating %d%% of the queries in Pounds: %s" % (perc_contacted, locale.currency(Current_spent)))
#    else:
#        perc_idx = int(perc_contacted*len(data)/100)
#        if perc_idx == 0:
#            perc_idx = 1
#        #print(f'perc_idx={perc_idx}')
#        print("Savings by automating %d%% of the queries in Pounds: %s" 
#              % (perc_contacted,locale.currency(data['cumul_cost'][perc_idx-1])))  
#         
#    if do_eval:
#        evaluate( None, data['true'], None, 
#             preds=[1] * k + [0] * (len(data) - k), 
#             probas=list(data['pred_proba'].values))
    
    return a, TPFPTNFN

    
plot_cost_benefit(data, Current_spent, show_best)

NameError: name 'show_best' is not defined

In [5]:
#import train and test data sets:
X_train_T = pd.read_csv('X_test.csv')
X_train_T = X_train_T.drop(["Unnamed: 0"],axis = 1)

X_test_T = pd.read_csv('X_test.csv')
X_test_T = X_test_T.drop(["Unnamed: 0"],axis = 1)
y_test_T = pd.read_csv('y_test.csv', header=None)[1]

Textmodel = pickle.load(open('textBased_model.sav', 'rb'))

In [6]:
probas = Textmodel.predict_proba(X_test_T)[:,1]

#Choose percentage to automate:
PERCENT_TO_AUTOMATE = 10

#State cost of reviewing queries and cost of missing an escalation:
cost_of_reviewing_query = 5 #This part can be modified
cost_of_false_negative = 10 # This part can be modified

#Total number of queries reviewed:
total_no_queries = 45000

In [15]:
a, TPFPTNFN = cost_benefit_analysis(y_test_T,
                                             probas,
                                             perc_contacted=PERCENT_TO_AUTOMATE,
                                             total_applicable_customers=total_no_queries,
                                             cost_of_reviewing_query = cost_of_reviewing_query,
                                             cost_of_false_negative = cost_of_false_negative,
                                             plot_utility=True, 
                                             show_best=True
                                              )

Current spending in pounds: $1574895.01


In [16]:
a

[(0.99025035, 1),
 (0.9883464, 1),
 (0.9834827, 1),
 (0.9832303, 1),
 (0.9821045, 1),
 (0.9808044, 1),
 (0.9779809, 1),
 (0.9772006, 1),
 (0.97664785, 1),
 (0.9762442, 1),
 (0.9754817, 1),
 (0.9753421, 1),
 (0.97461206, 1),
 (0.97434026, 0),
 (0.9739726, 1),
 (0.9735337, 1),
 (0.9731486, 1),
 (0.972354, 1),
 (0.97153014, 1),
 (0.9715271, 1),
 (0.9714967, 1),
 (0.9713524, 1),
 (0.9711657, 1),
 (0.97107047, 1),
 (0.97064555, 1),
 (0.9705978, 1),
 (0.97048, 1),
 (0.97019875, 1),
 (0.96901065, 1),
 (0.96900284, 1),
 (0.9689397, 1),
 (0.9685763, 1),
 (0.9684867, 1),
 (0.96829796, 1),
 (0.9674925, 1),
 (0.96738464, 0),
 (0.96722186, 0),
 (0.9671321, 1),
 (0.9660538, 1),
 (0.96567875, 1),
 (0.96554416, 1),
 (0.96531415, 1),
 (0.9648354, 1),
 (0.9640025, 1),
 (0.96395963, 1),
 (0.96351486, 1),
 (0.9631811, 1),
 (0.96275735, 1),
 (0.9626815, 1),
 (0.9626364, 1),
 (0.96259874, 1),
 (0.9621095, 1),
 (0.9619512, 0),
 (0.9613831, 1),
 (0.96128273, 0),
 (0.9612009, 1),
 (0.9608133, 1),
 (0.9603269, 

In [26]:
for i in range(len(a)):
    if (round(a[i][0])==1 and a[i][1]==1).any():
        TPFPTNFN = TPFPTNFN.append({'TP': 1}, ignore_index=True)
    elif (round(a[i][0])==0 and a[i][1]==0).any():
            TPFPTNFN = TPFPTNFN.append({'TN': 1}, ignore_index=True)
    elif (round(a[i][0])==1 and a[i][1]==0).any():
            TPFPTNFN = TPFPTNFN.append({'FP': 1}, ignore_index=True)
    elif (round(a[i][0])==0 and a[i][1]==1).any():
            TPFPTNFN = TPFPTNFN.append({'FN': 1}, ignore_index=True)

In [39]:
y_test_T.loc[:]

0       0
1       1
2       1
3       1
4       0
5       0
6       1
7       1
8       0
9       0
10      0
11      0
12      0
13      1
14      1
15      0
16      0
17      0
18      0
19      1
20      0
21      1
22      1
23      0
24      0
25      1
26      0
27      1
28      1
29      0
       ..
6399    0
6400    0
6401    1
6402    0
6403    0
6404    0
6405    1
6406    1
6407    0
6408    0
6409    0
6410    1
6411    1
6412    0
6413    0
6414    0
6415    0
6416    0
6417    0
6418    0
6419    1
6420    0
6421    1
6422    0
6423    1
6424    0
6425    1
6426    0
6427    0
6428    1
Name: 1, Length: 6429, dtype: int64

In [23]:
(round(a[1][0])==1 and a[1][1]==1).all()

True

In [None]:

lis = [(x_axis[i], utility[i]) for i in range(len(x_axis))]
lis = lis[1:]
best = max(lis, key=lambda x: x[1])

utility_number = [0] + list(data.cumul_cost)
lis2 = [(x_axis[i], utility_number[i]) for i in range(len(x_axis))]
lis2 = lis2[1:]

best2 = max(lis2, key=lambda x: x[1])
max_save_rate = round(best[1],2)
max_aut_queries = best[0]
amount_saved = best[1]*(-Current_spent)/100
print(f'Maximum saving rate is {round(best[1],2)}% if {best[0]:.2f}% of queries is automated')
print(f'This accounts for {locale.currency(best[1]*(-Current_spent)/100)} saved')