In [1]:
from Func import *

# Show every column when a DataFrame is displayed (None disables the column limit).
pd.set_option('display.max_columns', None)

In [2]:
# Load the product, post-back and click tables from the Datasets folder.
# All timestamp columns share one explicit format.
ts_format = "%Y-%m-%d %H:%M:%S"

df_prod = pd.read_csv(os.path.abspath('Datasets/Prod.csv'))

df_post = pd.read_csv(os.path.abspath('Datasets/Post.csv'))
for ts_col in ('click time', 'conversion time', 'sale time'):
    df_post[ts_col] = pd.to_datetime(df_post[ts_col], format=ts_format)

df_cl = pd.read_csv(os.path.abspath('Datasets/Clicks.csv'))
df_cl['click time'] = pd.to_datetime(df_cl['click time'], format=ts_format)

# Independent copies restricted to the 'loan-usual' loan type.
df_prod_usual = df_prod[df_prod['loan type'] == 'loan-usual'].copy()
df_post_usual = df_post[df_post['loan type'] == 'loan-usual'].copy()
df_post_usual['tmp_date'] = df_post_usual['click time'].dt.date

# subid5 is imported from Func; being the last expression of the cell,
# its return value is what the cell displays.
subid5(df=df_post_usual, col="MFI page rank")

0

# Basic recommender system of microfinance institutions: Charts

In [3]:
def save_or_show(p, file_name, SAVE=True, SHOW=False):
    """Write a plot to ``Charts/<file_name>.html`` and/or display it.

    Parameters
    ----------
    p : plot object accepted by the ``save`` / ``show`` helpers in scope
        (these come from the star import at the top of the notebook).
    file_name : str
        Base name of the HTML file (no extension); also used as the page title.
    SAVE : bool, default True
        When truthy, register the output file and save the plot into it.
    SHOW : bool, default False
        When truthy, display the plot.

    Returns
    -------
    None
    """
    if SAVE:
        # Create the target folder on first use so that saving does not fail
        # on a fresh checkout where "Charts" does not exist yet.
        import os
        os.makedirs("Charts", exist_ok=True)
        output_file(filename="Charts/" + file_name + ".html", title=file_name)
        save(p)
    if SHOW:
        show(p)

## Figure 1

Status and loan type distributions.

In [4]:
# Figure 1 is built from the full post-back table; the result is written to
# Charts/status_and_loan_type.html and not displayed inline (SHOW=False).
p1 = status_and_loan_type(df_post)
save_or_show(p = p1, file_name = "status_and_loan_type", SAVE=True, SHOW=False)

## Figure 2

Dependence of the number of applications on MFI page rank. Left: cumulative number of applications increases with time (for eight highest ranks); right: the number of applications at the last available timestamp.

In [5]:
# Both panels use only the applications clicked strictly before 2023-12-13.
# The mask is computed once; each helper still receives its own filtered frame.
clicked_before_cutoff = df_post['click time'] < pd.to_datetime('2023-12-13')

p2_1 = number_of_req_for_rank_over_time(df_post[clicked_before_cutoff], width=600, height=400)
p2_2 = number_of_requests(df_post[clicked_before_cutoff], width=400, height=400)

# Side-by-side layout; each panel keeps its own toolbar.
p2_1p2_2 = gridplot([[p2_1, p2_2]], merge_tools=False)
save_or_show(p=p2_1p2_2, file_name="page_rank_and_numer_of_requests", SAVE=True, SHOW=False)

## Figure 3

Percentage of applications by MFI id and MFI page rank. At different times, and on different web pages, the rank of a fixed MFI may vary, which is the reason why there is no one-to-one correspondence between MFI id and MFI page rank.

In [6]:
# Unlike the other figures, this helper takes the SAVE/SHOW flags itself
# instead of returning a plot to be passed to save_or_show.
shares_of_MFIs_for_various_ranks(df_post, SHOW=False, SAVE=True)

## Figure 4

Visualization of clients by their number of applications (left). MFI id chosen by clients who submitted more than 1 application (right). Clients are ordered by decreasing number of applications. Color corresponds to the number of different MFIs for a client. It is easy to see that clients apply to various MFIs.

In [7]:
# Two panels built from the full post-back table, placed side by side.
# The share statistics in the cell output are presumably printed by these helpers.
p4_1 = client_difference_size_app(df_post, width=400, height=400)
p4_2 = client_difference_app(df_post, width=700, height=400)
# NOTE(review): merge_tools is left at its default here, unlike the other grid figures.
p4_1p4_2 = gridplot([[p4_1,p4_2]])

save_or_show(p = p4_1p4_2, file_name = "clients_difference", SAVE=True, SHOW=False)

the share of clients with no more than 1 applications is equal to 0.8238931864866046
the share of clients with no more than 5 applications is equal to 0.9955712891336335
the share of the former among the latter is equal 0.8275582024905251


## Figure 5

Income, number of sales and EPC for 4 indicative MFIs:

In [8]:
# Figure 5 is computed from the click table (df_cl), not from the post-back table.
p5 = income_4_mfis(df_cl)
save_or_show(p = p5, file_name = "income_4_mfis", SAVE=True, SHOW=False)

## Figure 7

Box plots and whiskers for conversion period (left) and processing period (right) in hours for 6 indicative MFIs. \
Box: lower $Q_{0.03}$, mid $Q_{0.5}$, upper $Q_{0.97}$; 
whiskers: upper $Q_{0.97} + 1.5(Q_{0.97} - Q_{0.03})$, lower $Q_{0.03} - 1.5(Q_{0.97} - Q_{0.03})$.

In [9]:
# Rank MFIs by their number of applications (ascending) and pick the six
# indicative ones: the two largest plus four fixed ids.
mfi_size = (
    df_post
    .groupby('MFI id')
    .size()
    .reset_index(name='size')
    .sort_values('size')
)
ids_by_size = mfi_size['MFI id'].unique()
MFIs = list(ids_by_size[-20:])
MFIs = MFIs[-2:] + ['MFI 56', 'MFI 18', 'MFI 64', 'MFI 29']

# Timestamps of the selected MFIs, restricted to clicks from 2023-01-01 onwards.
timing_columns = ['MFI id', 'click time', 'conversion time', 'sale time']
df_post_top_mfi = df_post.loc[df_post['MFI id'].isin(MFIs), timing_columns].copy()
df_post_top_mfi = df_post_top_mfi[df_post_top_mfi['click time'] >= '2023-01-01'].copy()

# Durations in hours: click -> conversion and conversion -> sale.
click_to_conversion = df_post_top_mfi['conversion time'] - df_post_top_mfi['click time']
conversion_to_sale = df_post_top_mfi['sale time'] - df_post_top_mfi['conversion time']
df_post_top_mfi['time to conversion'] = click_to_conversion.dt.total_seconds() / 3600
df_post_top_mfi['time of processing'] = conversion_to_sale.dt.total_seconds() / 3600

duration_columns = ['MFI id', 'time to conversion', 'time of processing']
df_post_top_mfi = df_post_top_mfi[duration_columns].sort_values('MFI id').copy()

# Attach each MFI's total application count and order the rows by it.
tmp = (
    df_post
    .groupby('MFI id')
    .size()
    .reset_index(name='size')
    .sort_values('size')
)
dict_size = dict(zip(tmp['MFI id'], tmp['size']))
df_post_top_mfi['size'] = df_post_top_mfi['MFI id'].map(dict_size)
df_post_top_mfi = df_post_top_mfi.sort_values('size').copy()

# One box plot per duration, both with q=0.03.
p7_1, p7_2 = (
    box_plot(df=df_post_top_mfi, score=duration, q=0.03)
    for duration in ('time to conversion', 'time of processing')
)

p7_1p7_2 = gridplot([[p7_1, p7_2]], merge_tools=False)

save_or_show(p=p7_1p7_2, file_name="time_6_mfi", SAVE=True, SHOW=False)

## Figure 8

Number of applications (dots) and status shares (bars) for 6 indicative MFIs:

In [10]:
# Figure 8 is built from the full post-back table and saved to Charts/status_6_mfi.html.
p8 = status_6_mfis(df_post)
save_or_show(p = p8, file_name = "status_6_mfi", SAVE=True, SHOW=False)

## Figure 9

In [11]:
# Prepare per-MFI inputs for the colligation (gamma) chart.
df_status_os = df_post[['MFI id', 'status', 'device browser']].copy()

# Binary device flag: 1 for the browsers treated as "iOS" here, 0 for everything else
# (missing browsers also become 0).
ios_browsers = ['Mobile Safari', 'Chrome Mobile iOS', 'Safari', 'Firefox Mobile iOS']
df_status_os['device browser'] = df_status_os['device browser'].isin(ios_browsers).astype(int)

# Binary outcome flag: 1 for 'sale', 0 for 'lead' / 'rejected'; any other status maps to NaN.
status_dict = dict(zip(['sale', 'lead', 'rejected'],
                       [1, 0, 0]))
df_status_os['status'] = df_status_os['status'].map(status_dict)

# Attach the number of applications per MFI, order rows by it, and drop MFIs
# that have a single application.
tmp = df_status_os.groupby('MFI id').size().reset_index(name='size')
mfi_size = dict(zip(tmp['MFI id'], tmp['size']))
df_status_os['size'] = df_status_os['MFI id'].map(mfi_size)
df_status_os = df_status_os.sort_values('size')
df_status_os = df_status_os[df_status_os['size'] > 1].copy()

# Evaluate gamma ONCE per MFI and unpack its first three components, instead of
# re-running it separately for 'top', 'left' and 'right'. sort=False keeps the
# groups in order of first appearance, i.e. by increasing number of applications.
gamma_rows = []
for mfi_id, mfi_rows in df_status_os.groupby('MFI id', sort=False):
    estimate = gamma(mfi_rows)
    gamma_rows.append((mfi_id, estimate[0], estimate[1], estimate[2]))
df_gamma = pd.DataFrame(gamma_rows, columns=['MFI id', 'top', 'left', 'right'])


Colligation coefficient (bar) with 99.5% confidence interval (whiskers) for binary variables: “sale” or not, “iOS” or not for each MFI. MFIs are ordered by increasing number of applications. Sharp drops correspond to the lack of approved applications with “iOS” for the respective MFIs. Stabilization of the coefficient shows that the dependence between the binary variables is the same among MFIs. Unfortunately, the CIs have no common intersection: the maximum of the lower bounds (0.0919) is greater than the minimum of the upper bounds (0.0605).

In [12]:
# Plot the per-MFI gamma table; `ylable` is the keyword spelling used in this call.
# The left_max / right_min values in the cell output are presumably printed by corr.
p9 = corr(df=df_gamma, width=830, height=264, ylable = "Gamma")

save_or_show(p = p9, file_name = "gamma_correlation_ios", SAVE=True, SHOW=False)

left_max =  0.09194373570520825
right_min =  0.06046909552394621


## Figure 10

Reviews for 6 indicative MFIs:

In [13]:
# Figure 10 uses both the post-back table and the (unfiltered) product table.
p10 = rating_6_mfi(df_post, df_prod)
save_or_show(p = p10, file_name = "rating_6_mfi", SAVE=True, SHOW=False)

## Figure 11, 12

In [14]:
# Encode the final status as 1 ('sale') / 0 ('rejected'); any other value becomes NaN.
df_post_usual['status 2 int'] = df_post_usual['status 2'].map({'sale': 1, 'rejected': 0})

# Per-MFI loan approval rate (sales / all applications) and application count.
usual_by_mfi = df_post_usual.groupby('MFI id')
tmp = (usual_by_mfi['status 2 int'].sum() / usual_by_mfi.size()).reset_index(name='LAR')
dict_do = dict(zip(tmp['MFI id'], tmp['LAR']))

tmp2 = usual_by_mfi.size().reset_index(name='Num. of app.')
dict_do2 = dict(zip(tmp2['MFI id'], tmp2['Num. of app.']))

df_prod_usual['LAR'] = df_prod_usual['MFI id'].map(dict_do)
df_prod_usual['Num. of app.'] = df_prod_usual['MFI id'].map(dict_do2)

# Columns needed by the two parallel plots, indexed by MFI id.
plot_columns = [
    'MFI id',
    'Norm. LAR', 'LAR', 'Num. of app.',
    'Norm. average user rating', 'number of reviews', 'average user rating',
]
df = df_prod_usual[plot_columns].set_index('MFI id')

# The six indicative MFIs: the two with most applications plus four fixed ids.
mfi_size = (
    df_post
    .groupby('MFI id')
    .size()
    .reset_index(name='size')
    .sort_values('size')
)
MFIs = list(mfi_size['MFI id'].unique()[-20:])
MFIs = MFIs[-2:] + ['MFI 56', 'MFI 18', 'MFI 64', 'MFI 29']

# df1 / df2: rows ordered by volume, with a leading 'MFI id' column holding the
# row position (0..n-1) in that ordering; the real MFI id stays in the index.
df1 = df[['Num. of app.', 'LAR', 'Norm. LAR']].sort_values('Num. of app.')
df1.insert(0, 'MFI id', list(range(len(df))))

df2 = df[['number of reviews', 'average user rating', 'Norm. average user rating']].sort_values('number of reviews')
df2.insert(0, 'MFI id', list(range(len(df))))

Parallel plot for loan approval rate before and after normalization. MFIs whose rank by LAR decreased after normalization are shown in red, MFIs whose rank by LAR increased — in green. Indicative 6 MFIs are highlighted in bold. A large number of applications for MFI 20, MFI 87 and MFI 56 (12380, 10704, 8302) lowered their rank due to weak approval rates (0.038, 0.052, 0.048). On the other hand, a large number of applications for MFI 18 (5075) significantly strengthened its rather high rank by the approval rate (0.23).

In [15]:
# 1 where the rank by 'Norm. LAR' is at least the rank by 'LAR', else 0;
# wrapped in a pd.Series with a default integer index.
color1 = pd.Series((rankdata(df1['Norm. LAR']) >= rankdata(df1['LAR']))*1)
# The 0/1 flag presumably selects between the two palette entries (red / green per the caption);
# MFIs lists the six indicative MFIs passed for highlighting.
p11 = parallel_plot(df=df1, color=color1, palette=[dict_pallets[3], dict_pallets[4]], MFIs = MFIs, width=900, height=400)
save_or_show(p = p11, file_name = "normlar", SAVE=True, SHOW=False)

Parallel plot for average user rating before and after normalization. MFIs whose rank by rating decreased after normalization are shown in red and MFIs whose rank by rating increased — in green. Indicative 6 MFIs are highlighted in bold. MFI 64 with a significant number of reviews (339) and a positive rating (4) strengthened its position. MFI 20 with a poor rating (3.3) and a moderate number of reviews (143) lowered its position. We can also notice that the top 4 MFIs with a great initial rating (≈ 5) lowered their ranks because they have very few reviews, but still remained in the top positions.

In [17]:
# 1 where the rank by the normalized rating is at least the rank by the raw rating, else 0.
# NOTE(review): unlike color1 in the LAR cell, this flag is not wrapped in a pd.Series —
# confirm parallel_plot accepts both forms.
color2 = (rankdata(df2['Norm. average user rating']) >= rankdata(df2['average user rating']))*1
p12 = parallel_plot(df=df2, color=color2, palette=[dict_pallets[3], dict_pallets[4]], MFIs = MFIs, width=900, height=400)
save_or_show(p = p12, file_name = "normrate", SAVE=True, SHOW=False)

## Figure 15, 16, 19

In [18]:
# Collect the page-rank labels of the click table whose cased characters are all
# lower-case or all upper-case, e.g. '1a', '2a', '1a-amp'. Labels without any
# letters (purely numeric ranks) satisfy neither test and are left out.
# NOTE(review): mixed-case labels (e.g. 'Ab') are left out as well.
ranks_AB_test = df_cl['MFI page rank'].value_counts()
names_ranks_AB_test = [i  for i in ranks_AB_test.index if i.isupper() or i.islower() ]

# Keywords that identify the page ids taking part in the A/B test
pages_key_words_AB_test = ['pensioneram', 'bez-snils', 'kruglosutochnye', '100000-rublej',  'qiwi']

# function that checks if the page belongs to the AB test, returns true/false
def check_AB_test_page(page, key_words=None):
    """Tell whether a page id contains one of the A/B-test keywords.

    Parameters
    ----------
    page : str or missing value
        Page id to inspect.
    key_words : iterable of str, optional
        Keywords to look for; defaults to ``pages_key_words_AB_test``.

    Returns
    -------
    bool or float
        True/False for a string page id; NaN for anything that is not a
        string (NaN, None, numbers), so missing page ids stay missing.
    """
    if key_words is None:
        key_words = pages_key_words_AB_test
    # isinstance also accepts str subclasses such as numpy strings.
    if isinstance(page, str):
        return any(name in page for name in key_words)
    return np.nan
# true/false column of page belonging to the list of pages
# (computed row by row from 'page id' with check_AB_test_page)
df_cl['AB_test_page'] = df_cl['page id'].apply(check_AB_test_page)

# function that checks if the page belongs to AB-testing, returns the first
# character of the matched keyword (or NaN when there is no match)
def check_AB_test_page2(page, key_words=None):
    """Return a one-character key for the A/B-test keyword found in a page id.

    The key is the character of ``page`` at the position where a keyword
    starts, i.e. the keyword's first letter ('p', 'b', 'k', '1' or 'q' for the
    default keywords). If several keywords occur, the one starting furthest
    to the right wins.

    Parameters
    ----------
    page : str or missing value
        Page id to inspect.
    key_words : iterable of str, optional
        Keywords to look for; defaults to ``pages_key_words_AB_test``.

    Returns
    -------
    str or float
        The one-character key, or NaN when ``page`` is not a string or
        contains none of the keywords.
    """
    if key_words is None:
        key_words = pages_key_words_AB_test
    if not isinstance(page, str):
        return np.nan
    # str.find gives -1 for a miss, so the maximum is -1 only if nothing matched.
    last_hit = max((page.find(name) for name in key_words), default=-1)
    if last_hit == -1:
        return np.nan
    return page[last_hit]
# Flag A/B-test pages and derive the one-character page key for every click.
df_cl['AB_test_page'] = df_cl['page id'].apply(check_AB_test_page)
df_cl['AB_test_page_key'] = df_cl['page id'].apply(check_AB_test_page2)

# True for clicks whose page rank is one of the letter-bearing rank labels.
df_cl['AB_test_by_rank'] = df_cl['MFI page rank'].isin(names_ranks_AB_test)

# Working copy: only clicks that have both an income value and a page key.
has_income_and_key = df_cl['income'].notna() & df_cl['AB_test_page_key'].notna()
df_cl_ = df_cl[has_income_and_key].copy()

# A click counts as a sale when its income is positive.
df_cl_['Sales'] = (df_cl_['income'] > 0).astype(int)

# Rename the one-character page keys to '1'..'5' in both frames.
dict_map_rename1 = {'b': '1', '1': '2', 'p': '3', 'q': '4', 'k': '5'}
df_cl['AB_test_page_key'] = df_cl['AB_test_page_key'].map(dict_map_rename1)
df_cl_['AB_test_page_key'] = df_cl_['AB_test_page_key'].map(dict_map_rename1)

In [19]:
# Same A/B labelling as for the clicks, applied to a copy of the post-back table.
df_conv = df_post.copy()

# Letter-bearing rank labels, this time taken from the 'MFI page rank ab' column.
ranks_AB_test = df_conv['MFI page rank ab'].value_counts()
names_ranks_AB_test = [
    rank_name
    for rank_name in ranks_AB_test.index
    if rank_name.isupper() or rank_name.islower()
]

pages_key_words_AB_test = ['pensioneram', 'bez-snils', 'kruglosutochnye', '100000-rublej',  'qiwi']

df_conv['AB_test_page'] = df_conv['page id'].apply(check_AB_test_page)
df_conv['AB_test_page_key'] = df_conv['page id'].apply(check_AB_test_page2)
df_conv['AB_test_by_rank'] = df_conv['MFI page rank ab'].isin(names_ranks_AB_test)

# Working copy: only rows that have both an income value and a page key.
has_income_and_key = df_conv['income'].notna() & df_conv['AB_test_page_key'].notna()
df_conv_ = df_conv[has_income_and_key].copy()

# Here a sale is identified by the status column rather than by income.
df_conv_['Sales'] = (df_conv_['status'] == 'sale').astype(int)

# dict_map_rename1 is the key-renaming dictionary defined in the previous cell;
# only the filtered copy is renamed, df_conv keeps the one-character keys.
df_conv_['AB_test_page_key'] = df_conv_['AB_test_page_key'].map(dict_map_rename1)

Figure 15: The number of applications (left) and the number of sales (right) for 5 page ids ranked by AW’s algorithm (solid line) and by the baseline algorithm (dotted line).

In [20]:
# Left panel: all clicks that carry an A/B page key (mode 'cl').
p15 = AB_comp_page(df=df_cl[~df_cl['AB_test_page_key'].isna()], width=800, height=600, mode = 'cl')
# Right panel: filtered conversions clicked no later than the last click in df_cl (mode 'conv').
# NOTE: the name p16 is reassigned later by the weekly comparison chart.
p16 = AB_comp_page(df=df_conv_[df_conv_['click time'] <= df_cl['click time'].max()], width=800, height=600, mode = 'conv')

p15p16 = gridplot([[p15,p16]], width=830, height=350, merge_tools=False)
save_or_show(p = p15p16, file_name = "AB_comp_page_clicks_conv", SAVE=True, SHOW=False)

In [21]:
# Weekly
# Builds df6: per (AB_test_by_rank, week) totals of sales, clicks and income,
# plus sales-per-click and income-per-click ratios.
df_cl_['date_click'] = df_cl_['click time'].dt.date
df_conv_['date_click'] = df_conv_['click time'].dt.date


# Keep only the period 2023-05-15 .. 2023-08-31 (both ends inclusive).
# Note: df_cl_ and df_conv_ are overwritten with the filtered frames.
start = pd.to_datetime('2023-05-15').date()
end = pd.to_datetime('2023-08-31').date()
df_cl_ = df_cl_[(df_cl_['date_click'] >= start) & (df_cl_['date_click'] <= end)].copy()
df_conv_ = df_conv_[(df_conv_['date_click'] >= start) & (df_conv_['date_click'] <= end)].copy()


# ISO week number of the click, then turned into a timestamp counted as that
# many weeks from the 2023-01-01 origin.
df_cl_['date_week'] = df_cl_['click time'].dt.isocalendar().week.apply(lambda x: int(x))
df_conv_['date_week'] = df_conv_['click time'].dt.isocalendar().week.apply(lambda x: int(x))
df_cl_['date_week'] = pd.to_datetime(df_cl_['date_week'], unit = 'W', origin = pd.Timestamp('2023-01-01'))
df_conv_['date_week'] = pd.to_datetime(df_conv_['date_week'], unit = 'W', origin = pd.Timestamp('2023-01-01'))


# Sales per (rank flag, page key, week).
df_conv_page_sale_day = df_conv_.groupby(['AB_test_by_rank','AB_test_page_key', 'date_week'])['Sales'].sum().reset_index(name='Sum_sales')

# 'for_map' is a string join key: the three group values concatenated without a separator.
df_conv_page_sale_day['for_map'] = df_conv_page_sale_day['AB_test_by_rank'].apply(lambda x: str(x)) +  df_conv_page_sale_day['AB_test_page_key'].apply(lambda x: str(x)) +  df_conv_page_sale_day['date_week'].apply(lambda x: str(x))


# Number of clicks per (rank flag, page key, week).
df_cl_page_day = df_cl_.groupby(['AB_test_by_rank','AB_test_page_key', 'date_week']).size().reset_index(name='Size_cl')


df_cl_page_day['for_map'] = df_cl_page_day['AB_test_by_rank'].apply(lambda x: str(x)) +  df_cl_page_day['AB_test_page_key'].apply(lambda x: str(x)) +  df_cl_page_day['date_week'].apply(lambda x: str(x))


# Income per (rank flag, page key, week), taken from the conversions table.
df_conv_page_income_day = df_conv_.groupby(['AB_test_by_rank','AB_test_page_key', 'date_week'])['income'].sum().reset_index(name='Sum_income')

df_conv_page_income_day['for_map'] = df_conv_page_income_day['AB_test_by_rank'].apply(lambda x: str(x)) +  df_conv_page_income_day['AB_test_page_key'].apply(lambda x: str(x)) +  df_conv_page_income_day['date_week'].apply(lambda x: str(x))


# Rank-flag + page-key label (no date part).
df_cl_page_day['by_rank_page_key'] = df_cl_page_day['AB_test_by_rank'].apply(lambda x: str(x)) +  df_cl_page_day['AB_test_page_key'].apply(lambda x: str(x))
df_conv_page_sale_day['by_rank_page_key'] = df_conv_page_sale_day['AB_test_by_rank'].apply(lambda x: str(x)) +  df_conv_page_sale_day['AB_test_page_key'].apply(lambda x: str(x))

# Bring click counts into the sales table via the string key;
# groups without a matching click group get NaN.
dict_map = dict(zip(df_cl_page_day['for_map'], df_cl_page_day['Size_cl']))

df_conv_page_sale_day['Size_cl'] = df_conv_page_sale_day['for_map'].map(dict_map)

# Same for income.
dict_map_inc = dict(zip(df_conv_page_income_day['for_map'], df_conv_page_income_day['Sum_income']))

df_conv_page_sale_day['Sum_income'] = df_conv_page_sale_day['for_map'].map(dict_map_inc)

# Per-page ratios: sales per click and income per click.
df_conv_page_sale_day['Share by click'] = df_conv_page_sale_day['Sum_sales'] / df_conv_page_sale_day['Size_cl']
df_conv_page_sale_day['Income by click'] = df_conv_page_sale_day['Sum_income'] / df_conv_page_sale_day['Size_cl']

# Collapse the page dimension: totals per (rank flag, week), again joined through a string key.
df4 = df_conv_page_sale_day.groupby(['AB_test_by_rank','date_week'])['Sum_sales'].sum().reset_index(name='Sum_sales')
df4['key'] = df4['AB_test_by_rank'].apply(lambda x: str(x)) + df4['date_week'].apply(lambda x: str(x))
dict_map1 = dict(zip(df4['key'], df4['Sum_sales']))

# NOTE(review): click totals are summed from the sales table, so only click groups
# that also have conversions contribute — confirm this is intended.
df5 = df_conv_page_sale_day.groupby(['AB_test_by_rank','date_week'])['Size_cl'].sum().reset_index(name='Size_cl')
df5['key'] = df5['AB_test_by_rank'].apply(lambda x: str(x)) + df5['date_week'].apply(lambda x: str(x))
dict_map2 = dict(zip(df5['key'], df5['Size_cl']))

df6 = df_conv_page_sale_day.groupby(['AB_test_by_rank','date_week'])['Sum_income'].sum().reset_index(name='Sum_income')
df6['key'] = df6['AB_test_by_rank'].apply(lambda x: str(x)) + df6['date_week'].apply(lambda x: str(x))

# Final weekly table used for the chart.
df6['Sum_sales'] = df6['key'].map(dict_map1)
df6['Size_cl'] = df6['key'].map(dict_map2)
df6['Share by click'] = df6['Sum_sales'] / df6['Size_cl']
df6['Income by click'] = df6['Sum_income'] / df6['Size_cl']

Figure 16: Weekly income and share by click for the AW and baseline algorithms. It can be seen that the baseline algorithm lines are on average higher than the AW lines.

In [22]:
# Weekly comparison chart built from df6 (weekly totals and per-click ratios).
# This reassigns p16, which earlier held the right panel of the page comparison figure.
p16 = comp_graph_AB(df = df6, width=830, height=264)
save_or_show(p = p16, file_name = "weekly_results_with_comparison_method_AB_5page", SAVE=True, SHOW=False)

In [23]:
# Daily
# Same aggregation as the weekly cell, grouped by calendar day ('date_click');
# the intermediate names and df6 are overwritten with the daily versions.
df_cl_['date_click'] = df_cl_['click time'].dt.date
df_conv_['date_click'] = df_conv_['click time'].dt.date


# Keep only the period 2023-05-15 .. 2023-08-31 (both ends inclusive).
start = pd.to_datetime('2023-05-15').date()
end = pd.to_datetime('2023-08-31').date()
df_cl_ = df_cl_[(df_cl_['date_click'] >= start) & (df_cl_['date_click'] <= end)].copy()
df_conv_ = df_conv_[(df_conv_['date_click'] >= start) & (df_conv_['date_click'] <= end)].copy()


# Sales per (rank flag, page key, day).
df_conv_page_sale_day = df_conv_.groupby(['AB_test_by_rank','AB_test_page_key', 'date_click'])['Sales'].sum().reset_index(name='Sum_sales')

# 'for_map' is a string join key: the three group values concatenated without a separator.
df_conv_page_sale_day['for_map'] = df_conv_page_sale_day['AB_test_by_rank'].apply(lambda x: str(x)) +  df_conv_page_sale_day['AB_test_page_key'].apply(lambda x: str(x)) +  df_conv_page_sale_day['date_click'].apply(lambda x: str(x))


# Number of clicks per (rank flag, page key, day).
df_cl_page_day = df_cl_.groupby(['AB_test_by_rank','AB_test_page_key', 'date_click']).size().reset_index(name='Size_cl')


df_cl_page_day['for_map'] = df_cl_page_day['AB_test_by_rank'].apply(lambda x: str(x)) +  df_cl_page_day['AB_test_page_key'].apply(lambda x: str(x)) +  df_cl_page_day['date_click'].apply(lambda x: str(x))


# Income per (rank flag, page key, day), taken from the conversions table.
df_conv_page_income_day = df_conv_.groupby(['AB_test_by_rank','AB_test_page_key', 'date_click'])['income'].sum().reset_index(name='Sum_income')

df_conv_page_income_day['for_map'] = df_conv_page_income_day['AB_test_by_rank'].apply(lambda x: str(x)) +  df_conv_page_income_day['AB_test_page_key'].apply(lambda x: str(x)) +  df_conv_page_income_day['date_click'].apply(lambda x: str(x))


# Rank-flag + page-key label (no date part).
df_cl_page_day['by_rank_page_key'] = df_cl_page_day['AB_test_by_rank'].apply(lambda x: str(x)) +  df_cl_page_day['AB_test_page_key'].apply(lambda x: str(x))
df_conv_page_sale_day['by_rank_page_key'] = df_conv_page_sale_day['AB_test_by_rank'].apply(lambda x: str(x)) +  df_conv_page_sale_day['AB_test_page_key'].apply(lambda x: str(x))

# Bring click counts into the sales table via the string key;
# groups without a matching click group get NaN.
dict_map = dict(zip(df_cl_page_day['for_map'], df_cl_page_day['Size_cl']))

df_conv_page_sale_day['Size_cl'] = df_conv_page_sale_day['for_map'].map(dict_map)

# Same for income.
dict_map_inc = dict(zip(df_conv_page_income_day['for_map'], df_conv_page_income_day['Sum_income']))

df_conv_page_sale_day['Sum_income'] = df_conv_page_sale_day['for_map'].map(dict_map_inc)

# Per-page ratios: sales per click and income per click.
df_conv_page_sale_day['Share by click'] = df_conv_page_sale_day['Sum_sales'] / df_conv_page_sale_day['Size_cl']
df_conv_page_sale_day['Income by click'] = df_conv_page_sale_day['Sum_income'] / df_conv_page_sale_day['Size_cl']

# Collapse the page dimension: totals per (rank flag, day), again joined through a string key.
df4 = df_conv_page_sale_day.groupby(['AB_test_by_rank','date_click'])['Sum_sales'].sum().reset_index(name='Sum_sales')
df4['key'] = df4['AB_test_by_rank'].apply(lambda x: str(x)) + df4['date_click'].apply(lambda x: str(x))
dict_map1 = dict(zip(df4['key'], df4['Sum_sales']))

# NOTE(review): click totals are summed from the sales table, so only click groups
# that also have conversions contribute — confirm this is intended.
df5 = df_conv_page_sale_day.groupby(['AB_test_by_rank','date_click'])['Size_cl'].sum().reset_index(name='Size_cl')
df5['key'] = df5['AB_test_by_rank'].apply(lambda x: str(x)) + df5['date_click'].apply(lambda x: str(x))
dict_map2 = dict(zip(df5['key'], df5['Size_cl']))

# Final daily table used for the chart.
df6 = df_conv_page_sale_day.groupby(['AB_test_by_rank','date_click'])['Sum_income'].sum().reset_index(name='Sum_income')
df6['key'] = df6['AB_test_by_rank'].apply(lambda x: str(x)) + df6['date_click'].apply(lambda x: str(x))
df6['Sum_sales'] = df6['key'].map(dict_map1)
df6['Size_cl'] = df6['key'].map(dict_map2)
df6['Share by click'] = df6['Sum_sales'] / df6['Size_cl']
df6['Income by click'] = df6['Sum_income'] / df6['Size_cl']

Figure 19: Daily income and share by click for the AW and baseline algorithms. It can be seen that the baseline algorithm lines are on average higher than the AW lines.

In [24]:
# Daily comparison chart; df6 here is the daily table produced by the preceding cell.
p19 = comp_graph_AB_by_day(df = df6, width=830, height=264)
save_or_show(p = p19, file_name = "daily_results_with_comparison_method_AB_5page", SAVE=True, SHOW=False)

## Figure 17

In [2]:
# # Run this cell only if Datasets/Pospos.csv and Datasets/Posneg.csv do not exist yet:

# df_Train_matr = pd.read_csv(os.path.abspath('Datasets/Train_matr.csv'), low_memory=False)
# df_Train_matr['click time'] = pd.to_datetime(df_Train_matr['click time'])
# df_Train_matr['tmp_date'] = df_Train_matr['click time'].dt.date
# expir_id_ = expir_id(df = df_Train_matr)


# PosposPosneg = create_matrix(df_Train_matr)

# PosposPosneg[0].to_csv(os.path.abspath('Datasets/Pospos.csv'))
# PosposPosneg[1].to_csv(os.path.abspath('Datasets/Posneg.csv'))

100%|███████████████████████████████████| 107568/107568 [24:03<00:00, 74.50it/s]
100%|█████████████████████████████████| 107568/107568 [00:12<00:00, 8797.95it/s]
100%|███████████████████████████████████| 107568/107568 [23:36<00:00, 75.92it/s]
100%|█████████████████████████████████| 107568/107568 [00:12<00:00, 8802.61it/s]
100%|███████████████████████████████████| 107568/107568 [23:48<00:00, 75.32it/s]
100%|█████████████████████████████████| 107568/107568 [00:12<00:00, 8600.99it/s]
100%|███████████████████████████████████| 107568/107568 [24:10<00:00, 74.16it/s]
100%|█████████████████████████████████| 107568/107568 [00:12<00:00, 8657.21it/s]
100%|███████████████████████████████████| 107568/107568 [24:09<00:00, 74.20it/s]
100%|█████████████████████████████████| 107568/107568 [00:12<00:00, 8575.48it/s]
100%|███████████████████████████████████| 107568/107568 [25:08<00:00, 71.32it/s]
100%|█████████████████████████████████| 107568/107568 [00:12<00:00, 8598.10it/s]


In [40]:
# # %%capture
# # Run this cell only if Datasets/usual_drow.csv does not exist yet:

# df_cl_usual = df_cl[df_cl['loan type'] == 'loan-usual'].copy()
# df_cl_usual['tmp_date'] = df_cl_usual['click time'].dt.date

# pospos = pd.read_csv(os.path.abspath('Datasets/Pospos.csv'))
# posneg = pd.read_csv(os.path.abspath('Datasets/Posneg.csv'))
# pospos['Unnamed: 0'] = pospos['Unnamed: 0'].apply(lambda x: str(x))
# posneg['Unnamed: 0'] = posneg['Unnamed: 0'].apply(lambda x: str(x))
# pospos.set_index('Unnamed: 0', inplace=True)
# posneg.set_index('Unnamed: 0', inplace=True)
# matrices = [pospos, posneg]

# df_Train_matr = pd.read_csv(os.path.abspath('Datasets/Train_matr.csv'), low_memory=False)
# expir_id_ = expir_id(df = df_Train_matr)

# list_of_list = [
# #                 ['Norm. LAR', 'Norm. average user rating',
# #                  'service period (q90)', 'fairness',
# #                  'EPC'],
                
# #                ['Norm. LAR', 'Norm. average user rating',
# #                 'service period (q90)', 'EPC'],
                
# #                ['Norm. LAR', 'Norm. average user rating', 
# #                 'fairness', 'EPC'],
                
#                ['Norm. LAR', 'Norm. average user rating',
#                 'EPC'],
                
#                # ['Norm. LAR', 'Norm. average user rating',
#                #  'service period (q90)', 'fairness']
# ]

# list_df_com_usual = []
# for chars in tqdm(list_of_list):
#     list_df_com_usual.append(test_CoM(df_post = df_post_usual,
#                         matrices = matrices,
#                         df_prod = df_prod_usual,
#                         df_clicks = df_cl_usual,
#                         characteristics = chars,
#                         dict_expir_id = expir_id_,
#                         start_date = pd.to_datetime('2023-06-13', format='%Y-%m-%d'),
#                         len_train = pd.to_timedelta('30d'),
#                         len_test = pd.to_timedelta('1d')
#                         ))

# list_df_com_usual_drow = []
# for df in list_df_com_usual:
#     list_df = []
#     for col in df.columns[1:]:
#         list_df.append(add_col(df_=df[['date', col]].copy(),
#                                col=col,
#                               card = 'usual'))

#     stat_day = pd.concat(list_df)
#     list_df_com_usual_drow.append(stat_day)

# usual_drow = tmp_drow_com(list_df=list_df_com_usual_drow)
# usual_drow.to_csv(os.path.abspath('Datasets/usual_drow.csv'), index=False)

Daily income and share for historical data and baseline algorithm. The result of comparing the baseline algorithm with historical data, using the comparison method, shows that the loan approval rate almost doubled and the average revenue increased.

In [25]:
# Load the precomputed comparison results (the commented-out cell above writes this
# file) and parse the 'date' column before plotting.
usual_drow = pd.read_csv(os.path.abspath('Datasets/usual_drow.csv'))
usual_drow['date'] = pd.to_datetime(usual_drow['date'])


p17 = comp_graph(df = usual_drow, width=830, height=264)
save_or_show(p = p17, file_name = "results_with_comparison_method", SAVE=True, SHOW=False)

## Figure 18

The shares of applications processed within the consideration time plus payment time are shown in green, while the shares of applications whose processing time exceeds the declared time are shown in red.

In [26]:
# Figure 18 is built from the 'loan-usual' product table (df_prod_usual).
p18 = Fig18(df_prod = df_prod_usual, width=830, height=264)

save_or_show(p = p18, file_name = "process_application_on_time", SAVE=True, SHOW=False)