In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
import matplotlib.ticker as mtick
from matplotlib.ticker import FormatStrFormatter
from matplotlib.lines import Line2D

from sklearn import metrics

In [4]:
# LocationNo is read as str; later cells compare it to the literal 'Chain'.
dict_dtypes={'LocationNo': str}
# header=0: first CSV row holds column names; index_col=0: first CSV column becomes the row index.
df = pd.read_csv('BuyTitleOfferEval.csv', header=0, dtype=dict_dtypes, index_col=0)

In [5]:
#df.set_index('CatalogID', drop=True, inplace=True)

In [6]:
# Derived per-row evaluation columns.
# Check the inputs up front so a schema mismatch reports every missing column
# at once instead of failing on the first one.
_required_cols = ['actual_BuyOfferAmt_r40', 'pred_SuggestedOffer_r40', 'count_ItemsPriced',
                  'pred_AAD_r40', 'avg_CatalogAccDays_TrashPenalty_r40']
_missing_cols = [c for c in _required_cols if c not in df.columns]
if _missing_cols:
    raise KeyError('df is missing expected columns: {}'.format(_missing_cols))

# Row totals: per-item amount multiplied by the number of items priced.
df['actual_TotalBuyOffers_r40'] = df['actual_BuyOfferAmt_r40'] * df['count_ItemsPriced']
df['pred_TotalSuggestedOffers_r40'] = df['pred_SuggestedOffer_r40'] * df['count_ItemsPriced']
# Signed error: predicted minus actual.
df['error_AAD_r40'] = df['pred_AAD_r40'] - df['avg_CatalogAccDays_TrashPenalty_r40']
# sqrt(error**2) is the absolute error, so derive it from the signed column.
df['RSE_AAD_r40'] = df['error_AAD_r40'].abs()

KeyError: 'actual_BuyOfferAmt_r40'

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
def create_CombinedOfferDF(df):
    """Combine location-level offers with chain-level fallbacks, one frame for all CatalogIDs.

    Every non-'Chain' row of ``df`` is kept. A row whose
    ``pred_SuggestedOffer_r40`` is missing (including the placeholder row created
    for a CatalogID that has no location rows at all) has its offer columns
    replaced by the 'Chain' row with the same CatalogID. Offer columns are all
    columns except CatalogID, LocationNo, CatalogBinding, count_ItemsPriced and
    count_ItemsSold; those five are never copied from the chain row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'CatalogID', 'LocationNo' and 'pred_SuggestedOffer_r40'
        as columns. The frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        CatalogID first, then the merged columns, then 'OfferType'
        ('Location' or 'Chain'). Rows that still have no suggested offer after
        the chain fill are removed.
    """
    key_col = 'CatalogID'
    offer_col = 'pred_SuggestedOffer_r40'
    non_offer_cols = ['CatalogID', 'LocationNo', 'CatalogBinding', 'count_ItemsPriced', 'count_ItemsSold']
    list_OfferCols = [c for c in df.columns if c not in non_offer_cols]

    filt_ChainOffer = df['LocationNo'] == 'Chain'
    # set_index returns a new frame, so the caller's df keeps its index and columns.
    df_ChainOffers = df[filt_ChainOffer].set_index(key_col)[list_OfferCols]

    # One row per location offer, plus an all-NaN row for CatalogIDs with no location rows.
    df_All = pd.DataFrame(df[key_col].unique(), columns=[key_col])
    df_All = df_All.merge(df[~filt_ChainOffer], how='left', on=key_col).set_index(key_col)

    filt_NoLocOffer = df_All[offer_col].isna()
    df_All['OfferType'] = 'Location'
    # The right-hand side is aligned to the selected rows by CatalogID.
    df_All.loc[filt_NoLocOffer, list_OfferCols] = df_ChainOffers
    df_All.loc[filt_NoLocOffer, 'OfferType'] = 'Chain'

    df_All = df_All.reset_index()
    # Drop rows for which neither a location nor a chain offer exists.
    return df_All.drop(index=df_All.index[df_All[offer_col].isna()])

In [None]:
# Build the combined frame first and compute the mask afterwards, so the mask is
# always aligned with df's index at the point where it is applied.
df_All = create_CombinedOfferDF(df)
filt_Chain = df['LocationNo'] == 'Chain'
# .copy() makes these independent frames, so later cells can modify them
# without pandas treating them as slices of df.
df_Chain = df[filt_Chain].copy()
df_Loc = df[~filt_Chain].copy()

In [None]:
# Items priced: chain rows first, then location rows without / with a suggested offer.
loc_HasNoPred = df_Loc['pred_SuggestedOffer_r40'].isna()
print(df_Chain['count_ItemsPriced'].sum())
print(df_Loc.loc[loc_HasNoPred, 'count_ItemsPriced'].sum())
print(df_Loc.loc[~loc_HasNoPred, 'count_ItemsPriced'].sum())

In [None]:
def create_PredCompDF(df, minSampleNum, binding='', **kwargs):
    """Total priced/sold quantities for each (predicted grade, actual grade) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'CatalogBinding', 'count_ItemsPriced', 'count_ItemsSold',
        'pred_BuyOfferPct_r40' and 'actual_BuyOfferPct_r40'.
    minSampleNum : number
        Rows with ``count_ItemsPriced`` below this value are excluded.
    binding : str, optional
        Keep only this CatalogBinding. A value not present in the data
        (including the default '') disables the binding filter.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    pandas.DataFrame
        Columns: pred_BuyOfferPct_r40, actual_BuyOfferPct_r40,
        count_ItemsPriced, count_ItemsSold (the last two summed per pair).
    """
    list_Bindings = df['CatalogBinding'].unique()
    selected_bindings = [binding] if binding in list_Bindings else list_Bindings

    filt_QtyNPlus = (df['count_ItemsPriced'] >= minSampleNum) & df['CatalogBinding'].isin(selected_bindings)
    grade_cols = ['pred_BuyOfferPct_r40', 'actual_BuyOfferPct_r40']
    qty_cols = ['count_ItemsPriced', 'count_ItemsSold']
    # Select the quantity columns before summing so unrelated (e.g. string)
    # columns are never aggregated.
    return df[filt_QtyNPlus].groupby(grade_cols)[qty_cols].sum().reset_index()

def calc_CatAcc(df, gradeThreshold=0.3, **kwargs):
    """Quantity-weighted share of exact grade matches at or above a threshold.

    Rows with ``pred_BuyOfferPct_r40 >= gradeThreshold`` are eligible. The
    result is the ``count_ItemsPriced`` total of eligible rows whose predicted
    and actual grades are equal, divided by the ``count_ItemsPriced`` total of
    all eligible rows. Extra keyword arguments are accepted and ignored.
    """
    predicted = df['pred_BuyOfferPct_r40']
    qty = df['count_ItemsPriced']

    is_eligible = predicted >= gradeThreshold
    is_match = predicted == df['actual_BuyOfferPct_r40']

    matched_qty = qty[is_match & is_eligible].sum()
    eligible_qty = qty[is_eligible].sum()
    return matched_qty / eligible_qty

def calc_CatAccByGrade(df):
    """Quantity-weighted exact-match accuracy for each predicted grade.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'pred_BuyOfferPct_r40', 'actual_BuyOfferPct_r40' and
        'count_ItemsPriced'.

    Returns
    -------
    pandas.DataFrame
        One row per predicted grade (ascending) with columns
        'Suggested Offer Grade', 'Pct Accuracy' and 'Total Qty'.
        'Pct Accuracy' is matched quantity / total quantity for that grade; a
        grade with no exact matches reports 0 rather than NaN.
    """
    grade_col = 'pred_BuyOfferPct_r40'
    qty_col = 'count_ItemsPriced'

    total_qty = df.groupby(grade_col)[qty_col].sum()
    filt_EqualGrades = df[grade_col] == df['actual_BuyOfferPct_r40']
    # Grades with no exact match are absent from this group-by; give them 0 matched items.
    matched_qty = (df[filt_EqualGrades].groupby(grade_col)[qty_col].sum()
                   .reindex(total_qty.index, fill_value=0))

    df_AccByGrade = pd.DataFrame({'Pct Accuracy': matched_qty / total_qty,
                                  'Total Qty': total_qty})
    return df_AccByGrade.rename_axis('Suggested Offer Grade').reset_index()

def calc_CatPredPctsByGrade(df, gradeThreshold=0, **kwargs):
    """Share of each predicted grade's quantity that falls in each actual grade.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'pred_BuyOfferPct_r40', 'actual_BuyOfferPct_r40' and
        'count_ItemsPriced'.
    gradeThreshold : number, optional
        Only rows whose predicted grade is >= this value are returned.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    pandas.DataFrame
        Columns 'Suggested Offer Grade', 'Actual Grade', 'Qty Actual Grades'
        (the row's own count_ItemsPriced) and 'Pct Actual Grades' (that count
        divided by the total count_ItemsPriced of the row's predicted grade).
    """
    grade_col = 'pred_BuyOfferPct_r40'
    # Select the quantity column before summing so no other column is aggregated.
    qty_by_grade = df.groupby(grade_col)['count_ItemsPriced'].sum()

    # Both sides carry 'count_ItemsPriced', so the merge suffixes them _x (row) and _y (grade total).
    df_PredPctsByGrade = df.merge(qty_by_grade, on=grade_col)
    df_PredPctsByGrade['pct_ActualGrades'] = (df_PredPctsByGrade['count_ItemsPriced_x'] /
                                              df_PredPctsByGrade['count_ItemsPriced_y'])
    df_PredPctsByGrade = df_PredPctsByGrade.rename(columns={
        'pred_BuyOfferPct_r40': 'Suggested Offer Grade',
        'actual_BuyOfferPct_r40': 'Actual Grade',
        'count_ItemsPriced_x': 'Qty Actual Grades',
        'pct_ActualGrades': 'Pct Actual Grades'})

    filt_GradeThreshold = df_PredPctsByGrade['Suggested Offer Grade'] >= gradeThreshold
    out_cols = ['Suggested Offer Grade', 'Actual Grade', 'Qty Actual Grades', 'Pct Actual Grades']
    return df_PredPctsByGrade.loc[filt_GradeThreshold, out_cols]

def calc_MRSE(pred, targ):
    """Root-mean-square error between two sequences, paired by position.

    Values are paired by position, not by pandas index label. The result is
    symmetric in its two arguments.

    Parameters
    ----------
    pred, targ : array-like of numbers
        Must have the same length.

    Returns
    -------
    float
        ``sqrt(mean((targ - pred) ** 2))``; NaN for empty input.

    Raises
    ------
    ValueError
        If the two inputs differ in shape.
    """
    # np.asarray discards any pandas index, so pairing is strictly positional.
    pred_arr = np.asarray(pred, dtype=float)
    targ_arr = np.asarray(targ, dtype=float)
    if pred_arr.shape != targ_arr.shape:
        raise ValueError('pred and targ must have the same shape, got {} and {}'.format(
            pred_arr.shape, targ_arr.shape))
    if pred_arr.size == 0:
        return np.nan
    return np.sqrt(np.mean((targ_arr - pred_arr) ** 2))

In [None]:
# Rebind instead of dropping in place: df_Chain may be a slice of df, and an
# in-place drop on a slice triggers pandas' SettingWithCopyWarning.
# Note: this removes every row that has a NaN in ANY column.
df_Chain = df_Chain.dropna()

In [None]:
#calc_MRSE(df_Chain['avg_CatalogAccDays_NR'], df_Chain['pred_AAD_r40'])

In [None]:
# RMSE over all chain rows. Arguments follow the calc_MRSE(pred, targ) signature.
calc_MRSE(df_Chain['pred_AAD_r40'], df_Chain['avg_CatalogAccDays_TrashPenalty_r40'])

In [None]:
# Same unfiltered chain-level calculation as the preceding cell (same two columns, no filter).
calc_MRSE(df_Chain['avg_CatalogAccDays_TrashPenalty_r40'], df_Chain['pred_AAD_r40'])

In [None]:
# Boolean masks over df_Chain, combined by later cells.
filt_Grade = df_Chain['pred_BuyOfferPct_r40'] > 0  # predicted grade above 0
filt_QtyThreshold = df_Chain['count_ItemsPriced'] >= 5  # at least 5 items priced
filt_Binding = df_Chain['CatalogBinding'] == 'General'  # 'General' binding only

In [None]:
# RMSE restricted to 'General' chain rows that meet the quantity threshold.
filt_GeneralQty = filt_Binding & filt_QtyThreshold
calc_MRSE(df_Chain.loc[filt_GeneralQty, 'avg_CatalogAccDays_TrashPenalty_r40'],
          df_Chain.loc[filt_GeneralQty, 'pred_AAD_r40'])

In [None]:
# Same RMSE, additionally restricted to rows with a predicted grade above 0.
filt_GradedGeneralQty = filt_Grade & filt_Binding & filt_QtyThreshold
calc_MRSE(df_Chain.loc[filt_GradedGeneralQty, 'avg_CatalogAccDays_TrashPenalty_r40'],
          df_Chain.loc[filt_GradedGeneralQty, 'pred_AAD_r40'])

In [None]:
# Grade-pair quantity table for the chain rows selected by filt_Binding, keeping rows with count_ItemsPriced >= 5.
df_ChainPredComp = create_PredCompDF(df_Chain[filt_Binding], 5)
# gradeThreshold=0 includes every predicted grade >= 0 in the overall accuracy figure.
print(calc_CatAcc(df_ChainPredComp, gradeThreshold=0))
# Per-grade accuracy table is the cell's displayed output.
calc_CatAccByGrade(df_ChainPredComp)

In [None]:
# For each predicted grade, the quantity and share of items per actual grade (all grades >= 0).
calc_CatPredPctsByGrade(df_ChainPredComp, gradeThreshold=0)

In [None]:
# Manual check of accuracy for the 0.4 predicted grade over all df_Chain rows
# (no binding or quantity filter is applied here).
filt_actual = df_Chain['actual_BuyOfferPct_r40'] == 0.4  # defined but not used in this cell
filt_pred = df_Chain['pred_BuyOfferPct_r40'] == 0.4
filt_accurate = df_Chain['pred_BuyOfferPct_r40'] == df_Chain['actual_BuyOfferPct_r40']

# Items priced where the 0.4 prediction matched, over all items priced with a 0.4 prediction.
df_Chain[filt_accurate & filt_pred]['count_ItemsPriced'].sum() / df_Chain[filt_pred]['count_ItemsPriced'].sum()

In [None]:
# Location-level equivalent: all bindings, minimum of 1 item priced.
df_LocPredComp = create_PredCompDF(df_Loc, 1)
print(calc_CatAcc(df_LocPredComp, gradeThreshold=0))
calc_CatAccByGrade(df_LocPredComp)

In [None]:
def _build_GradeCountTable(df):
    """Return (square predicted-by-actual table of summed count_ItemsPriced, grade order).

    Rows are predicted grades and columns are actual grades. Both axes cover
    every grade seen on either side, in descending order, so the diagonal
    always pairs equal grades. Missing pairs are 0.
    """
    pred_col = 'pred_BuyOfferPct_r40'
    actual_col = 'actual_BuyOfferPct_r40'
    counts = (df.groupby([pred_col, actual_col])['count_ItemsPriced']
                .sum()
                .unstack(actual_col, fill_value=0))
    grades = (counts.index.union(counts.columns)
                    .sort_values(ascending=False)
                    .rename(pred_col))
    return counts.reindex(index=grades, columns=grades, fill_value=0), grades


def create_ConfusionMatrix_Normalized(df):
    """Confusion matrix in which each predicted-grade row sums to 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'count_ItemsPriced', 'pred_BuyOfferPct_r40' and
        'actual_BuyOfferPct_r40'.

    Returns
    -------
    (numpy.ndarray, pandas.Index)
        The square matrix (rows = predicted grade, columns = actual grade, both
        descending) and the grade order used for both axes. Each cell is the
        pair's count_ItemsPriced divided by the total count_ItemsPriced of its
        predicted grade. A grade that was never predicted yields an all-zero row.
    """
    table, idx_order = _build_GradeCountTable(df)
    # Row totals are taken per predicted grade from df itself.
    row_totals = df.groupby('pred_BuyOfferPct_r40')['count_ItemsPriced'].sum().reindex(idx_order)
    array_cm = table.div(row_totals, axis=0).fillna(0).to_numpy()
    return array_cm, idx_order


def create_ConfusionMatrix(df):
    """Confusion matrix as shares of the total quantity; prints total accuracy.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'count_ItemsPriced', 'pred_BuyOfferPct_r40' and
        'actual_BuyOfferPct_r40'.

    Returns
    -------
    (numpy.ndarray, pandas.Index)
        The square matrix (rows = predicted grade, columns = actual grade, both
        descending) and the grade order. Each cell is the pair's
        count_ItemsPriced divided by the sum of count_ItemsPriced over all of df.
        The printed accuracy is the trace of that matrix.
    """
    table, idx_order = _build_GradeCountTable(df)
    array_cm = (table / df['count_ItemsPriced'].sum()).fillna(0).to_numpy()
    print('Total accuracy = {:0.2%}'.format(np.trace(array_cm)))
    return array_cm, idx_order

def plot_ConfusionMatrix(cm, max_C = 0, labels=None, **kwargs):
    """Draw a confusion matrix as an annotated heatmap and show it.

    Parameters
    ----------
    cm : 2-D array-like
        Matrix of fractions; rows are drawn on the 'Predicted Offer %' axis and
        columns on the 'Actual Offer %' axis.
    max_C : number, optional
        Upper end of the colour scale. 0 (default) means "use the largest
        value in cm".
    labels : sequence of str, optional
        Tick labels for both axes. Defaults to ['40%', '30%', '20%', '5%', '0%'].
        They are applied only when cm is square with one row per label;
        otherwise seaborn's default tick labels are kept.
    **kwargs
        Accepted and ignored.

    Raises
    ------
    ValueError
        If cm is not two-dimensional (for example when the whole
        ``(matrix, order)`` tuple from create_ConfusionMatrix is passed).
    """
    cm = np.asarray(cm, dtype=float)
    if cm.ndim != 2:
        raise ValueError('cm must be a 2-D matrix, got an array with shape {}'.format(cm.shape))
    if labels is None:
        labels = ['40%', '30%', '20%', '5%', '0%']

    fig, ax = plt.subplots(figsize=(8,6))
    if max_C == 0:
        max_C = cm.max()
    sns.heatmap(cm, cmap = 'bone', vmin=0, vmax=max_C, annot=True, fmt='.1%', ax=ax)
    ax.set_xlabel('Actual Offer %', fontsize=14)
    ax.set_ylabel('Predicted Offer %', fontsize=14)
    if cm.shape == (len(labels), len(labels)):
        ax.set_xticklabels(labels)
        ax.set_yticklabels(labels)

    # Six evenly spaced colour-bar ticks from 0 to max_C, shown as percentages.
    cbar_ticks = [0, 0.2 * max_C, 0.4 * max_C, 0.6 * max_C, 0.8 * max_C, max_C]
    cbar = ax.collections[0].colorbar
    cbar.set_ticks(cbar_ticks)
    cbar.set_ticklabels(['{:0.0%}'.format(p) for p in cbar_ticks])
    #plt.savefig('./r40_ConfMatrix.png')
    plt.show()
    return

In [None]:
# Re-creates two df_Chain masks with the same expressions used in an earlier cell.
filt_QtyThreshold = df_Chain['count_ItemsPriced'] >= 5
filt_Binding = df_Chain['CatalogBinding'] == 'General'

In [None]:
# Same chain-level accuracy summary as computed in an earlier cell ('General' binding, >= 5 items priced).
df_ChainPredComp = create_PredCompDF(df_Chain[filt_Binding], 5)
print(calc_CatAcc(df_ChainPredComp, gradeThreshold=0))
calc_CatAccByGrade(df_ChainPredComp)

In [None]:
# Row-normalised confusion matrix for 'General' chain rows with at least 5 items priced.
filt_ChainQtyThreshold = df_Chain['count_ItemsPriced'] >= 5
filt_ChainBinding = df_Chain['CatalogBinding'] == 'General'
cm, pcts = create_ConfusionMatrix_Normalized(df_Chain[filt_ChainBinding & filt_ChainQtyThreshold])
# max_C = 1 fixes the colour scale at 0-100%.
plot_ConfusionMatrix(cm, max_C = 1)

In [None]:
# Confusion matrix as shares of total quantity for the same chain selection; also prints total accuracy.
filt_ChainQtyThreshold = df_Chain['count_ItemsPriced'] >= 5
filt_ChainBinding = df_Chain['CatalogBinding'] == 'General'
cm, pcts = create_ConfusionMatrix(df_Chain[filt_ChainBinding & filt_ChainQtyThreshold])
# No max_C given, so the colour scale tops out at the largest cell value.
plot_ConfusionMatrix(cm)

In [None]:
# Row-normalised confusion matrix for location rows that have a suggested offer,
# 'General' binding, at least 5 items priced.
filt_LocNAPreds = df_Loc['pred_SuggestedOffer_r40'].isna()
filt_LocQtyThreshold = df_Loc['count_ItemsPriced'] >= 5
filt_LocBinding = df_Loc['CatalogBinding'] == 'General'
cm, pcts = create_ConfusionMatrix_Normalized(df_Loc[~filt_LocNAPreds &filt_LocBinding & filt_LocQtyThreshold])
plot_ConfusionMatrix(cm)

In [None]:
# Confusion matrix (shares of total quantity) for the combined location/chain offers.
filt_AllQtyThreshold = df_All['count_ItemsPriced'] >= 5
filt_AllBinding = df_All['CatalogBinding'] == 'General'
filt_AllChainOffer = df_All['OfferType'] == 'Chain'  # defined but not used in this cell
cm, pcts = create_ConfusionMatrix(df_All[filt_AllBinding & filt_AllQtyThreshold])
plot_ConfusionMatrix(cm)

In [None]:
# Row-normalised confusion matrix for the combined offers ('General', >= 5 items priced).
filt_AllQtyThreshold = df_All['count_ItemsPriced'] >= 5
filt_AllBinding = df_All['CatalogBinding'] == 'General'
# The helper returns (matrix, grade order); only the matrix is plotted.
cm, pcts = create_ConfusionMatrix_Normalized(df_All[filt_AllBinding & filt_AllQtyThreshold])
plot_ConfusionMatrix(cm)

In [None]:
# NOTE(review): ad-hoc sum of five hand-typed numbers; presumably percentages read
# off a confusion-matrix plot above — confirm the source or replace with a computed value.
9 + 15.3 + 14.5+4.1+1.5

In [None]:
# Row-normalised confusion matrix for the combined offers ('General', >= 6 items priced).
filt_AllQtyThreshold = df_All['count_ItemsPriced'] >= 6
filt_AllBinding = df_All['CatalogBinding'] == 'General'
# The helper returns (matrix, grade order); only the matrix is plotted.
cm, pcts = create_ConfusionMatrix_Normalized(df_All[filt_AllBinding & filt_AllQtyThreshold])
plot_ConfusionMatrix(cm)

In [None]:
# Offer totals over 'General' chain rows meeting the quantity threshold:
# actual total, predicted total, their difference, and predicted / actual.
df_ChainEval = df_Chain[filt_Binding & filt_QtyThreshold]
total_ActualOffers = (df_ChainEval['actual_BuyOfferAmt_r40'] * df_ChainEval['count_ItemsPriced']).sum()
total_PredOffers = (df_ChainEval['pred_SuggestedOffer_r40'] * df_ChainEval['count_ItemsPriced']).sum()

print(total_ActualOffers)

print(total_PredOffers)

print(total_ActualOffers - total_PredOffers)

print(total_PredOffers / total_ActualOffers)

In [None]:
# Actual vs. predicted offer totals per predicted grade. Columns are selected
# with a list: selecting several group-by columns with a bare tuple is no longer
# supported by pandas.
df_Chain[filt_Binding].groupby('pred_BuyOfferPct_r40')[['actual_TotalBuyOffers_r40', 'pred_TotalSuggestedOffers_r40']].sum()

In [None]:
# Signed prediction error against the predicted value, with a fitted regression line.
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(8,6))
# x and y are passed by keyword (recent seaborn releases accept only `data`
# positionally) and ax is passed explicitly.
sns.regplot(x=df_Chain[filt_Binding & filt_QtyThreshold]['pred_AAD_r40'],
            y=df_Chain[filt_Binding & filt_QtyThreshold]['error_AAD_r40'],
            scatter_kws=dict(alpha=0.1),
            line_kws=dict(color='grey'),
            ax=ax)
ax.set_xlim([0,5000])
ax.set_ylim([-2500,2500])
ax.set_xlabel('Predicted Total Day Accumulation w/ Trash Penalty')
ax.set_ylabel('Predicted minus Actual Day Accumulation')
#plt.savefig('./r40_ContinuousErrorRegPlot.png')
plt.show()

In [None]:
# Per-row RSE_AAD_r40 against the predicted value, with a fitted regression line.
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(8,6))
# x and y are passed by keyword (recent seaborn releases accept only `data`
# positionally) and ax is passed explicitly.
sns.regplot(x=df_Chain[filt_Binding & filt_QtyThreshold]['pred_AAD_r40'],
            y=df_Chain[filt_Binding & filt_QtyThreshold]['RSE_AAD_r40'],
            scatter_kws=dict(alpha=0.1),
            line_kws=dict(color='grey'),
            ax=ax)
ax.set_xlim([0,5000])
ax.set_ylim([0,5000])
ax.set_xlabel('Predicted Title Day Accumulation w/ Trash Penalty')
ax.set_ylabel('Prediction RMSE')
#plt.savefig('./r40_ContinuousRSERegPlot.png')
plt.show()

In [None]:
# Mean RSE_AAD_r40 for each predicted value rounded to the nearest whole number.
# .copy() gives an independent frame, so the rounding assignment below does not
# write into a slice of df_Chain.
df_plt = df_Chain.loc[filt_Binding, ['pred_AAD_r40', 'RSE_AAD_r40']].copy()
df_plt['pred_AAD_r40'] = df_plt['pred_AAD_r40'].round(0)
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(8,6))
# x and y are passed by keyword; recent seaborn releases accept only `data` positionally.
sns.scatterplot(x=df_plt.groupby('pred_AAD_r40')['pred_AAD_r40'].mean(),
                y=df_plt.groupby('pred_AAD_r40')['RSE_AAD_r40'].mean(),
                ax = ax)
ax.set_xlim([0, 500])
ax.set_ylim([0, 400])
ax.set_xlabel('Predicted Day Accumulation')
ax.set_ylabel('Root Mean Squared Error')
#plt.savefig('./r40_ContinuousRMSE_500Scale.png')
plt.show()

In [None]:
# Same plot as the mean-RSE-by-rounded-prediction scatter, with the y axis extended to 500.
# .copy() gives an independent frame, so the rounding assignment below does not
# write into a slice of df_Chain.
df_plt = df_Chain.loc[filt_Binding, ['pred_AAD_r40', 'RSE_AAD_r40']].copy()
df_plt['pred_AAD_r40'] = df_plt['pred_AAD_r40'].round(0)
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(8,6))
# x and y are passed by keyword; recent seaborn releases accept only `data` positionally.
sns.scatterplot(x=df_plt.groupby('pred_AAD_r40')['pred_AAD_r40'].mean(),
                y=df_plt.groupby('pred_AAD_r40')['RSE_AAD_r40'].mean(),
                ax = ax)
ax.set_xlim([0, 500])
ax.set_ylim([0, 500])
ax.set_xlabel('Predicted Day Accumulation')
ax.set_ylabel('Root Mean Squared Error')
#plt.savefig('./r40_ContinuousRMSE_200Scale.png')
plt.show()

In [None]:
# Mean RSE_AAD_r40 per predicted grade, over all df_Chain rows.
df_Chain.groupby('pred_BuyOfferPct_r40')['RSE_AAD_r40'].mean()

In [None]:
# Median RSE_AAD_r40 per predicted grade, over all df_Chain rows.
df_Chain.groupby('pred_BuyOfferPct_r40')['RSE_AAD_r40'].median()

In [None]:
# Distribution of RSE_AAD_r40 per predicted grade: box plot with a violin plot drawn on the same axes.
fig, ax = plt.subplots(figsize=(12,8))
sns.boxplot(x='pred_BuyOfferPct_r40', y='RSE_AAD_r40', order=[0.4, 0.3, 0.2, 0.05, 0.0], data=df_Chain[filt_Binding], ax=ax, boxprops=dict(alpha=.5))
sns.violinplot(x='pred_BuyOfferPct_r40', y='RSE_AAD_r40', order=[0.4, 0.3, 0.2, 0.05, 0.0], data=df_Chain[filt_Binding], ax=ax)
ax.set_ylim(0,2000)
ax.set_xlabel('Buy Offer Percentages')
ax.set_xticklabels([ '40%', '30%', '20%', '5%', '0%'])
# RSE_AAD_r40 is sqrt(error**2), i.e. the absolute error, not a variance.
ax.set_ylabel('Absolute Prediction Error')
ax.yaxis.set_major_locator(mtick.MultipleLocator(100))
# Title is set before the (commented) savefig so a saved figure would include it.
ax.set_title('Title Day Accumulation Prediction Error by Grade', fontsize=16)
#plt.savefig('./r40_ErrorsByGrade_VioBox.png')
plt.show()

In [None]:
# Column names, dtypes and non-null counts of the chain-level frame.
df_Chain.info()

In [None]:

# Share of each predicted grade's priced quantity per actual grade, largest first.
# count_ItemsPriced is selected before summing so no other column is aggregated.
(df_Chain.groupby(['pred_BuyOfferPct_r40', 'actual_BuyOfferPct_r40'])['count_ItemsPriced'].sum()/
 df_Chain.groupby(['pred_BuyOfferPct_r40'])['count_ItemsPriced'].sum()).sort_values(ascending=False)

In [None]:
# Repeats the df_Chain.info() summary shown two cells earlier.
df_Chain.info()