In [1]:
# Python libraries
import numpy as np
import pandas as pd
# import datetime

# 3rd party libraries
import ness

# Internal imports
from data_manager import to_numeric, create_agg_var, two_date_cols_diff, days_from_today, last_row, create_unpaid_cols, partitioned_row_number

In [2]:
pd.options.display.max_columns = 200
# pd.options.display.max_rows = 10000

In [3]:
# Set the ness parameters
ness_parameters = {
    'bucket':"data.postpay.io",
    'key':"api",
    'profile':"default"
}
# Instantiate the data lake
dl = ness.dl(**ness_parameters)
# Read/Sync all the tables from the data lake
dl.sync()
# Create data frames from the tables
cart_df = dl.read("cart")
orders_df = dl.read("orders")
customers_df = dl.read("customers")
addresses_df = dl.read("addresses")
instalment_plans_df = dl.read("instalment-plans")
instalments_df = dl.read("instalments")
transactions_df = dl.read("transactions")
refunds_df = dl.read("refunds")

delete: ../../../.ness/api/addresses.parquet/part-00000-ecfe270d-b265-425e-a03b-aed8bfd29d1a-c000.gz.parquet
delete: ../../../.ness/api/cart.parquet/part-00000-a61140c4-beea-4cf0-9af3-51d7a0ab01a6-c000.gz.parquet
delete: ../../../.ness/api/customers.parquet/part-00000-f503ca3b-ae66-4c92-bb76-27e49c8f4482-c000.gz.parquet
delete: ../../../.ness/api/instalment-plans.parquet/part-00000-f6b9bebc-289d-486b-a16f-54cebd09800c-c000.gz.parquet
delete: ../../../.ness/api/instalments.parquet/part-00000-15edb69a-31cc-447a-a897-5094724da9af-c000.gz.parquet
delete: ../../../.ness/api/orders.parquet/part-00000-6a8e4057-1c04-453f-b4b4-e56fce9a44e9-c000.gz.parquet
delete: ../../../.ness/api/refunds.parquet/part-00000-7274cb10-7ea0-40f7-b3be-ceb050147b7b-c000.gz.parquet
delete: ../../../.ness/api/transactions.parquet/part-00000-dee8500f-8c79-45dc-81b3-41e4ac43e063-c000.gz.parquet
download: s3://data.postpay.io/api/refunds.parquet/part-00000-dead6b1f-821f-45ef-945f-9250e3f61d3c-c000.gz.parquet to ../../..

In [524]:
instalment_plans_df[instalment_plans_df["customer_id"] == 6171.0].sort_values("created", ascending=True)

Unnamed: 0,created,payment_method_fingerprint,downpayment_amount,billing_address_id,device_fingerprint,total_downpayment,checkout_completed,payment_method_brand,plan,payment_method_expires,shipping_address_id,total_amount,merchant_id,customer_id,currency,gateway_name,id_number,checkout_verified,cancelled,payment_method_country,shipping_amount,reference,merchant_name,completed,instalment_plan_id,payment_interval,customer_date_joined,status_changed,order_id,checkout_type,customer_email,status,num_instalments,shipping_id,customer_blacklisted,payment_method_type,date_of_birth,downpayment_refunded_amount,user_agent,phone,ip_address,transaction_cost_rate,transaction_cost_amount
16388,2021-03-22 10:36:06.772810,a2635d30ccbb492d89701942cc0fcc7c,1365.0,14978.0,3d94bb11a33f9bed45f760af786aa610,1365.0,2021-03-22 10:34:02.686038,visa,funded,2025-12-31,14978.0,4095.0,214,6171.0,AED,checkout,784199743509277,2021-03-22 10:34:02.718331,NaT,AE,0.0,FLAEHDE0441946-6058728863c99,Footlocker,NaT,10067,,2019-10-25 13:59:35,2021-03-22 10:36:15.534068,19823,default,saberw764@gmail.com,captured,3,10960.0,2021-04-08 05:23:50,debit,1997-11-07,0.0,Mozilla/5.0 (iPhone; CPU iPhone OS 14_3 like M...,971562290350,2001:8f8:146d:585f:498f:95f7:1e9a:2c19,0.026,0.5
16383,2021-03-22 20:42:19.962977,a2635d30ccbb492d89701942cc0fcc7c,151.67,14978.0,3d94bb11a33f9bed45f760af786aa610,151.67,2021-03-22 20:42:14.779556,visa,funded,2025-12-31,14978.0,455.0,214,6171.0,AED,checkout,784199743509277,2021-03-22 20:42:14.811702,NaT,AE,0.0,FLAEHDE0442096-6059011550da8,Footlocker,2021-04-08 14:53:38.028005,10112,,2019-10-25 13:59:35,2021-03-22 20:42:29.675127,19901,default,saberw764@gmail.com,captured,3,10960.0,2021-04-08 05:23:50,debit,1997-11-07,0.0,Mozilla/5.0 (iPhone; CPU iPhone OS 14_3 like M...,971562290350,2001:8f8:146d:585f:5c7d:364:71bb:5a79,0.026,0.5
16391,2021-03-23 21:35:42.896228,a2635d30ccbb492d89701942cc0fcc7c,86.5,14978.0,3d94bb11a33f9bed45f760af786aa610,86.5,2021-03-23 21:35:38.802611,visa,funded,2025-12-31,14978.0,259.5,214,6171.0,AED,checkout,784199743509277,2021-03-23 21:35:38.837613,NaT,AE,0.0,FLAEHDE0443176-605a5f1a7a4e0,Footlocker,2021-04-07 14:24:11.837859,10167,,2019-10-25 13:59:35,2021-03-23 21:35:52.354349,20000,default,saberw764@gmail.com,captured,3,10960.0,2021-04-08 05:23:50,debit,1997-11-07,0.0,Mozilla/5.0 (iPhone; CPU iPhone OS 14_3 like M...,971562290350,2001:8f8:146d:585f:d461:c422:c4de:9d7e,0.026,0.5
16387,2021-04-04 08:47:51.141882,a2635d30ccbb492d89701942cc0fcc7c,630.67,14978.0,1e778238bba02ac1c3a592172f30d361,630.67,2021-04-04 08:47:39.586676,visa,funded,2025-12-31,14978.0,1892.0,214,6171.0,AED,checkout,784199743509277,2021-04-04 08:47:39.633586,NaT,AE,0.0,FLAEHDA0009757-60697d1757899,Footlocker,NaT,11093,,2019-10-25 13:59:35,2021-04-04 08:48:00.141568,21584,default,saberw764@gmail.com,captured,3,12079.0,2021-04-08 05:23:50,debit,1997-11-07,0.0,Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like...,971562290350,2001:8f8:146d:585f:94c2:9a7f:136a:3d70,0.026,0.5
16381,2021-04-06 13:58:14.349328,a2635d30ccbb492d89701942cc0fcc7c,175.0,16627.0,5812eace2f527d54b9c93b9c61201aba,175.0,2021-04-06 13:58:11.659669,visa,funded,2025-12-31,16627.0,525.0,214,6171.0,AED,checkout,784199743509277,2021-04-06 13:58:11.692182,NaT,AE,0.0,FLAEHDE0452503-606c68e08ae4a,Footlocker,2021-04-08 15:29:11.991728,11306,,2021-04-06 13:26:14,2021-04-06 13:58:23.917764,21976,default,walid_raja_1997@hotmail.fr,captured,3,12279.0,2021-04-08 05:23:50,debit,1997-11-07,0.0,Mozilla/5.0 (Linux; Android 9; JKM-LX1) AppleW...,971562290350,2001:8f8:146d:585f:bdbc:9c2b:717a:b73,0.026,0.5
16382,2021-04-06 14:35:38.083231,a2635d30ccbb492d89701942cc0fcc7c,55.0,16627.0,5812eace2f527d54b9c93b9c61201aba,55.0,2021-04-06 14:35:34.376762,visa,funded,2025-12-31,16627.0,165.0,214,6171.0,AED,checkout,784199743509277,2021-04-06 14:35:34.411370,NaT,AE,35.0,FLAEHDE0452509-606c719e3858b,Footlocker,2021-04-07 14:04:51.649501,11308,,2021-04-06 13:26:14,2021-04-06 14:35:46.871550,21988,default,walid_raja_1997@hotmail.fr,captured,3,12285.0,2021-04-08 05:23:50,debit,1997-11-07,0.0,Mozilla/5.0 (Linux; Android 9; JKM-LX1) AppleW...,971562290350,2001:8f8:146d:585f:bdbc:9c2b:717a:b73,0.026,0.5
16389,2021-04-07 14:16:54.562613,a2635d30ccbb492d89701942cc0fcc7c,899.0,14978.0,5812eace2f527d54b9c93b9c61201aba,899.0,2021-04-07 14:16:52.477572,visa,funded,2025-12-31,14978.0,2697.0,214,6171.0,AED,checkout,784199743509277,2021-04-07 14:16:52.534885,NaT,AE,0.0,FLAEHDE0453220-606dbec26b053,Footlocker,NaT,11390,,2019-10-25 13:59:35,2021-04-07 14:17:03.728823,22189,default,saberw764@gmail.com,captured,3,10960.0,2021-04-08 05:23:50,debit,1997-11-07,0.0,Mozilla/5.0 (Linux; Android 9; JKM-LX1) AppleW...,971562290350,2001:8f8:146d:585f:bdbc:9c2b:717a:b73,0.026,0.5
16390,2021-04-07 14:19:36.529124,a2635d30ccbb492d89701942cc0fcc7c,340.0,14978.0,5812eace2f527d54b9c93b9c61201aba,340.0,2021-04-07 14:19:34.249048,visa,funded,2025-12-31,14978.0,1020.0,214,6171.0,AED,checkout,784199743509277,2021-04-07 14:19:34.282146,NaT,AE,0.0,FLAEHDE0453361-606dbf6428386,Footlocker,NaT,11391,,2019-10-25 13:59:35,2021-04-07 14:19:45.244588,22190,default,saberw764@gmail.com,captured,3,10960.0,2021-04-08 05:23:50,debit,1997-11-07,0.0,Mozilla/5.0 (Linux; Android 9; JKM-LX1) AppleW...,971562290350,2001:8f8:146d:585f:bdbc:9c2b:717a:b73,0.026,0.5
16384,2021-04-07 15:03:27.197242,a2635d30ccbb492d89701942cc0fcc7c,1513.0,14978.0,5812eace2f527d54b9c93b9c61201aba,1513.0,2021-04-07 15:03:24.241251,visa,funded,2025-12-31,14978.0,4539.0,214,6171.0,AED,checkout,784199743509277,2021-04-07 15:03:24.278310,NaT,AE,0.0,FLAEHDE0453403-606dc9a02c806,Footlocker,NaT,11401,,2019-10-25 13:59:35,2021-04-07 15:03:36.986788,22206,default,saberw764@gmail.com,captured,3,10960.0,2021-04-08 05:23:50,debit,1997-11-07,0.0,Mozilla/5.0 (Linux; Android 9; JKM-LX1) AppleW...,971562290350,2001:8f8:146d:585f:bdbc:9c2b:717a:b73,0.026,0.5
16385,2021-04-07 15:16:41.071783,a2635d30ccbb492d89701942cc0fcc7c,755.47,14978.0,5812eace2f527d54b9c93b9c61201aba,755.47,2021-04-07 15:16:38.442922,visa,funded,2025-12-31,14978.0,2266.4,214,6171.0,AED,checkout,784199743509277,2021-04-07 15:16:38.486538,NaT,AE,0.0,FLAEHDE0453433-606dccc109ac0,Footlocker,NaT,11404,,2019-10-25 13:59:35,2021-04-07 15:16:49.785005,22210,default,saberw764@gmail.com,captured,3,10960.0,2021-04-08 05:23:50,debit,1997-11-07,0.0,Mozilla/5.0 (Linux; Android 9; JKM-LX1) AppleW...,971562290350,2001:8f8:146d:585f:bdbc:9c2b:717a:b73,0.026,0.5


In [525]:
instalments_df[instalments_df["instalment_plan_id"] == 10067]

Unnamed: 0,order,refunded_amount,penalty_fee,amount,instalment_plan_id,scheduled,completed,status,total
30114,0,0.0,55.0,1365.0,10067,2021-04-22,NaT,unpaid,1420.0
30115,1,0.0,55.0,1365.0,10067,2021-05-22,NaT,unpaid,1420.0


In [4]:
# Convert the anount columns to numeric
to_numeric(df=instalments_df, columns_list=['refunded_amount', 'penalty_fee', 'amount', 'total'])
to_numeric(df=instalment_plans_df, columns_list=['downpayment_amount','total_downpayment', 'total_amount', 'shipping_amount', 'downpayment_refunded_amount'])

In [5]:
# Filter customers_df to get only the relevant variables
filtered_customers_df = customers_df[['customer_id', 'created', 'date_of_birth']].rename(columns={'created':'customer_first_joined'})

In [6]:
filtered_customers_df

Unnamed: 0,customer_id,customer_first_joined,date_of_birth
0,43344,2021-11-16 15:13:50.281012,
1,20107,2021-09-02 07:01:35.973814,1982-03-25
2,9261,2021-05-01 11:10:01.384310,1988-07-29
3,25770,2021-09-29 18:23:05.691539,
4,11589,2021-05-28 14:25:58.579672,1992-12-23
...,...,...,...
49555,3508,2021-01-28 21:15:43.857063,1966-04-12
49556,18541,2021-08-21 15:37:57.684409,1992-02-09
49557,6265,2021-03-24 15:23:43.471573,1990-08-30
49558,13785,2021-06-30 06:32:02.464975,1986-09-26


In [7]:
# Initial filters
pi3_bool = instalment_plans_df['num_instalments'] == 3
ae_bool = instalment_plans_df['payment_method_country'] == 'AE'

In [8]:
# Create instalment_plans_df for ONLY "pi3" and "AE"
pi3_ae_instalment_plans_df = instalment_plans_df[pi3_bool & ae_bool]

In [9]:
# Create ID's table for ONLY "pi3" and "AE"
pi3_ae_instalment_plans_id_df = instalment_plans_df[pi3_bool & ae_bool][['customer_id', 'instalment_plan_id', 'order_id']]

# Customer df

In [406]:
# Filter customers_df to get only the relevant variables
filtered_customers_df = customers_df[['customer_id', 'created', 'date_of_birth']].rename(columns={'created':'customer_first_joined'})

In [407]:
filtered_customers_df

Unnamed: 0,customer_id,customer_first_joined,date_of_birth
0,43344,2021-11-16 15:13:50.281012,
1,20107,2021-09-02 07:01:35.973814,1982-03-25
2,9261,2021-05-01 11:10:01.384310,1988-07-29
3,25770,2021-09-29 18:23:05.691539,
4,11589,2021-05-28 14:25:58.579672,1992-12-23
...,...,...,...
49555,3508,2021-01-28 21:15:43.857063,1966-04-12
49556,18541,2021-08-21 15:37:57.684409,1992-02-09
49557,6265,2021-03-24 15:23:43.471573,1990-08-30
49558,13785,2021-06-30 06:32:02.464975,1986-09-26


# AOV

In [77]:
# Create temp AOV table
aov_tmp_df = pi3_ae_instalment_plans_df[['instalment_plan_id', 'customer_id', 'created', 'total_amount']]

In [78]:
aov_tmp_df["row_number"] = partitioned_row_number(
    df=aov_tmp_df,
    groupby_col_list=["customer_id"],
    sort_by_col_list=["created"]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aov_tmp_df["row_number"] = partitioned_row_number(


In [79]:
aov_tmp_df_sort = aov_tmp_df.sort_values(["customer_id", "row_number"])

In [80]:
aov_tmp_df_sort

Unnamed: 0,instalment_plan_id,customer_id,created,total_amount,row_number
41640,47958,64.0,2021-10-31 10:02:57.280467,91.00,1.0
17343,48388,64.0,2021-11-01 11:44:17.953913,23.00,2.0
17342,48414,64.0,2021-11-01 12:24:36.978093,46.41,3.0
13292,6419,68.0,2021-02-02 18:16:01.287941,624.75,1.0
13295,10058,68.0,2021-03-22 08:08:12.476035,1049.00,2.0
...,...,...,...,...,...
22542,61330,49895.0,2021-11-25 22:17:26.492825,542.00,1.0
49909,61348,49915.0,2021-11-25 23:01:43.810158,828.00,1.0
38247,61350,49917.0,2021-11-25 23:04:20.693576,599.00,1.0
8307,61354,49925.0,2021-11-25 23:23:49.199569,250.00,1.0


In [81]:
avg_order_value_list = []
for i in aov_tmp_df_sort['customer_id'].unique():
    for d in aov_tmp_df_sort[aov_tmp_df_sort['customer_id'] == i]['row_number']:
        # AOV
        current_aov = aov_tmp_df_sort[(aov_tmp_df_sort['customer_id'] == i) & (aov_tmp_df_sort['row_number'] < d)]['total_amount'].mean()
        avg_order_value_list.append(current_aov)

In [82]:
aov_tmp_df_sort["avg_order_value"] = avg_order_value_list

In [83]:
aov_tmp_df_sort

Unnamed: 0,instalment_plan_id,customer_id,created,total_amount,row_number,avg_order_value
41640,47958,64.0,2021-10-31 10:02:57.280467,91.00,1.0,
17343,48388,64.0,2021-11-01 11:44:17.953913,23.00,2.0,91.00
17342,48414,64.0,2021-11-01 12:24:36.978093,46.41,3.0,57.00
13292,6419,68.0,2021-02-02 18:16:01.287941,624.75,1.0,
13295,10058,68.0,2021-03-22 08:08:12.476035,1049.00,2.0,624.75
...,...,...,...,...,...,...
22542,61330,49895.0,2021-11-25 22:17:26.492825,542.00,1.0,
49909,61348,49915.0,2021-11-25 23:01:43.810158,828.00,1.0,
38247,61350,49917.0,2021-11-25 23:04:20.693576,599.00,1.0,
8307,61354,49925.0,2021-11-25 23:23:49.199569,250.00,1.0,


# Fees

In [17]:
# Create instalments_df for ONLY "pi3" and "AE"
pi3_ae_instalments_df = pi3_ae_instalment_plans_id_df.merge(instalments_df, how='left', on='instalment_plan_id')

In [19]:
# Create sum of fees per order
fees_df = create_agg_var(
    df=pi3_ae_instalments_df,
    groupby_col='instalment_plan_id',
    orig_cols=['penalty_fee'],
    new_col_names=['sum_fees_per_order'],
    agg_fnc='sum',
    )

cid_fees_df = fees_df.merge(pi3_ae_instalment_plans_df[['instalment_plan_id', 'customer_id', 'created']], how='left', on='instalment_plan_id')

In [21]:
cid_fees_df["row_number"] = partitioned_row_number(
    df=cid_fees_df,
    groupby_col_list=["customer_id"],
    sort_by_col_list=["created"]
)

In [22]:
cid_fees_df_sort = cid_fees_df.sort_values(["customer_id", "row_number"])

In [56]:
fees_list = []
# cid_fees_df_sort['customer_id'].unique()
# Loop the customer_id
for i in cid_fees_df_sort['customer_id'].unique():
    # Loop the row_number per customer_id (which is the 'instalment_plan_id', but ordered)
    for d in cid_fees_df_sort[cid_fees_df_sort['customer_id'] == i]['row_number']:
        # Define the conditions for row i nd d
        row_i = (cid_fees_df_sort['customer_id'] == i)
        row_i_d = (cid_fees_df_sort['customer_id'] == i) & (cid_fees_df_sort['row_number'] == d)
        # Get the date of the specific row
        current_day = cid_fees_df_sort[row_i_d]['created'].iloc[0]
        # Keep the previous orders
        hist_df = cid_fees_df_sort[row_i & (cid_fees_df_sort['row_number'] < d)]
        # Add current_day to the hist_df
        hist_df['current_day'] = current_day
        # Calculate the days diff between the 'current_day' and 'created' for each row in the hist_df
        hist_df['diff_days'] = (hist_df['current_day'] - hist_df['created']).dt.days
        # Calculate the min date of the hist_df
        min_day = hist_df['created'].min()
        # print(hist_df)

        nr_days_list = [30, 90, 180, 365]
        tmp_list = []
        for y in nr_days_list:
            hist_df_y = hist_df[hist_df['diff_days'] < y]
            current_aov = hist_df_y[row_i & (hist_df_y['row_number'] < d)]['sum_fees_per_order'].mean()
            tmp_list.append(current_aov)

        fees_list.append(tmp_list)

  current_aov = hist_df_y[row_i & (hist_df_y['row_number'] < d)]['sum_fees_per_order'].mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hist_df['current_day'] = current_day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hist_df['diff_days'] = (hist_df['current_day'] - hist_df['created']).dt.days


In [58]:
fees_df = pd.DataFrame(fees_list, columns =['avg_fees_per_order_30d', 'avg_fees_per_order_90d', 'avg_fees_per_order_180d', 'avg_fees_per_order_365d'], dtype = float)

Unnamed: 0,avg_fees_per_order_30d,avg_fees_per_order_90d,avg_fees_per_order_180d,avg_fees_per_order_365d
0,,,,
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,,,,
4,,0.0,0.0,0.0
...,...,...,...,...
35074,,,,
35075,,,,
35076,,,,
35077,,,,


In [75]:
cid_avg_fees_df_sort = pd.concat([cid_fees_df_sort.reset_index(drop=True), fees_df], axis=1)

In [76]:
cid_avg_fees_df_sort

Unnamed: 0,instalment_plan_id,sum_fees_per_order,customer_id,created,row_number,avg_fees_per_order_30d,avg_fees_per_order_90d,avg_fees_per_order_180d,avg_fees_per_order_365d
0,47958,0.0,64.0,2021-10-31 10:02:57.280467,1.0,,,,
1,48388,0.0,64.0,2021-11-01 11:44:17.953913,2.0,0.0,0.0,0.0,0.0
2,48414,0.0,64.0,2021-11-01 12:24:36.978093,3.0,0.0,0.0,0.0,0.0
3,6419,0.0,68.0,2021-02-02 18:16:01.287941,1.0,,,,
4,10058,0.0,68.0,2021-03-22 08:08:12.476035,2.0,,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
35074,61330,0.0,49895.0,2021-11-25 22:17:26.492825,1.0,,,,
35075,61348,0.0,49915.0,2021-11-25 23:01:43.810158,1.0,,,,
35076,61350,0.0,49917.0,2021-11-25 23:04:20.693576,1.0,,,,
35077,61354,0.0,49925.0,2021-11-25 23:23:49.199569,1.0,,,,


In [312]:
cid_avg_fees_df_sort.shape

(35079, 9)

# Nr of merchant

In [93]:
# Create temp nr_merch table
nr_merch_tmp_df = pi3_ae_instalment_plans_df[['instalment_plan_id', 'customer_id', 'created', 'merchant_name']]

In [95]:
nr_merch_tmp_df["row_number"] = partitioned_row_number(
    df=nr_merch_tmp_df,
    groupby_col_list=["customer_id"],
    sort_by_col_list=["created"]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nr_merch_tmp_df["row_number"] = partitioned_row_number(


In [97]:
nr_merch_tmp_df_sort = nr_merch_tmp_df.sort_values(["customer_id", "row_number"])

In [99]:
nr_merchants_list = []
for i in nr_merch_tmp_df_sort['customer_id'].unique():
    for d in nr_merch_tmp_df_sort[nr_merch_tmp_df_sort['customer_id'] == i]['row_number']:
        # Number of merchants
        current_aov = nr_merch_tmp_df_sort[(nr_merch_tmp_df_sort['customer_id'] == i) & (nr_merch_tmp_df_sort['row_number'] < d)]['merchant_name'].count()
        nr_merchants_list.append(current_aov)

In [101]:
nr_merch_tmp_df_sort["count_merchants_per_customer"] = nr_merchants_list

In [102]:
nr_merch_tmp_df_sort

Unnamed: 0,instalment_plan_id,customer_id,created,merchant_name,row_number,count_merchants_per_customer
41640,47958,64.0,2021-10-31 10:02:57.280467,Clarins,1.0,0
17343,48388,64.0,2021-11-01 11:44:17.953913,Toys R Us,2.0,1
17342,48414,64.0,2021-11-01 12:24:36.978093,Mumzworld,3.0,2
13292,6419,68.0,2021-02-02 18:16:01.287941,The Entertainer,1.0,0
13295,10058,68.0,2021-03-22 08:08:12.476035,Footlocker,2.0,1
...,...,...,...,...,...,...
22542,61330,49895.0,2021-11-25 22:17:26.492825,Squat Wolf,1.0,0
49909,61348,49915.0,2021-11-25 23:01:43.810158,American Eagle,1.0,0
38247,61350,49917.0,2021-11-25 23:04:20.693576,The Entertainer,1.0,0
8307,61354,49925.0,2021-11-25 23:23:49.199569,Bath and Body Works,1.0,0


# Number of open orders

In [109]:
cid_status_df = pi3_ae_instalment_plans_id_df[['instalment_plan_id', 'customer_id']].merge(instalments_df[['instalment_plan_id', 'status', 'total', 'scheduled']], how='left', on='instalment_plan_id')
cid_status_due_df = cid_status_df[cid_status_df['status'] == 'due']

In [122]:
due_max_schedule_df = create_agg_var(
    df=cid_status_due_df,
    groupby_col='instalment_plan_id',
    orig_cols=['scheduled'],
    new_col_names=['max_scheduled'],
    agg_fnc='max'
    )

In [125]:
due_max_schedule_inst_df = due_max_schedule_df.merge(cid_status_due_df[['instalment_plan_id', 'scheduled', 'customer_id']], how='left', left_on=['instalment_plan_id', 'max_scheduled'], right_on=['instalment_plan_id', 'scheduled'])

In [129]:
due_max_schedule_inst_df["row_number"] = partitioned_row_number(
    df=due_max_schedule_inst_df,
    groupby_col_list=["customer_id"],
    sort_by_col_list=["scheduled"]
)

In [133]:
due_max_schedule_inst_df_sort = due_max_schedule_inst_df.sort_values(["customer_id", "row_number"])

In [136]:
nr_open_orders_list = []
for i in due_max_schedule_inst_df_sort['customer_id'].unique():
    for d in due_max_schedule_inst_df_sort[due_max_schedule_inst_df_sort['customer_id'] == i]['row_number']:
        # Number of merchants
        current_aov = due_max_schedule_inst_df_sort[(due_max_schedule_inst_df_sort['customer_id'] == i) & (due_max_schedule_inst_df_sort['row_number'] < d)]['customer_id'].count()
        nr_open_orders_list.append(current_aov)

In [137]:
due_max_schedule_inst_df_sort["count_open_orders"] = nr_open_orders_list

In [138]:
due_max_schedule_inst_df_sort

Unnamed: 0,instalment_plan_id,max_scheduled,scheduled,customer_id,row_number,count_open_orders
6944,47958,2021-12-31,2021-12-31,64.0,1.0,0
7245,48388,2022-01-01,2022-01-01,64.0,2.0,1
3397,42507,2021-12-16,2021-12-16,68.0,1.0,0
3410,42533,2021-12-16,2021-12-16,71.0,1.0,0
16108,60971,2022-01-25,2022-01-25,71.0,2.0,1
...,...,...,...,...,...,...
16386,61330,2022-01-25,2022-01-25,49895.0,1.0,0
16396,61348,2022-01-25,2022-01-25,49915.0,1.0,0
16398,61350,2022-01-25,2022-01-25,49917.0,1.0,0
16400,61354,2022-01-25,2022-01-25,49925.0,1.0,0


# Number of paid instalments

In [577]:
inst_status_df = pi3_ae_instalment_plans_df[['instalment_plan_id', 'customer_id', 'created']].merge(instalments_df[['instalment_plan_id', 'status', 'total', 'scheduled']], how='left', on='instalment_plan_id')
inst_status_paid_df = inst_status_df[inst_status_df['status'] == 'paid']

In [580]:
paid_inst_plan_df = pi3_ae_instalment_plans_df[['instalment_plan_id', 'customer_id', 'created']].merge(inst_status_paid_df[['instalment_plan_id', 'status']].groupby(by='instalment_plan_id').count().reset_index(), how='left', on='instalment_plan_id')

In [582]:
paid_inst_plan_df.fillna(0, inplace=True)

In [584]:
paid_inst_plan_df["row_number"] = partitioned_row_number(
    df=paid_inst_plan_df,
    groupby_col_list=["customer_id"],
    sort_by_col_list=["created"]
)

In [585]:
paid_inst_plan_df_sort = paid_inst_plan_df.sort_values(["customer_id", "row_number"])

In [587]:
nr_paid_inst_list = []
for i in paid_inst_plan_df_sort['customer_id'].unique():
    for d in paid_inst_plan_df_sort[paid_inst_plan_df_sort['customer_id'] == i]['row_number']:
        # Number of merchants
        current_aov = paid_inst_plan_df_sort[(paid_inst_plan_df_sort['customer_id'] == i) & (paid_inst_plan_df_sort['row_number'] < d)]['status'].sum()
        nr_paid_inst_list.append(current_aov)

In [588]:
paid_inst_plan_df_sort["count_paid_instalments"] = nr_paid_inst_list

In [377]:
paid_inst_plan_df_sort

Unnamed: 0,instalment_plan_id,customer_id,created,status,row_number,count_paid_instalments
27113,47958,64.0,2021-10-31 10:02:57.280467,0.0,1.0,0.0
11257,48388,64.0,2021-11-01 11:44:17.953913,0.0,2.0,0.0
11256,48414,64.0,2021-11-01 12:24:36.978093,2.0,3.0,0.0
8576,6419,68.0,2021-02-02 18:16:01.287941,2.0,1.0,0.0
8578,10058,68.0,2021-03-22 08:08:12.476035,0.0,2.0,2.0
...,...,...,...,...,...,...
14709,61330,49895.0,2021-11-25 22:17:26.492825,0.0,1.0,0.0
32477,61348,49915.0,2021-11-25 23:01:43.810158,0.0,1.0,0.0
24934,61350,49917.0,2021-11-25 23:04:20.693576,0.0,1.0,0.0
5349,61354,49925.0,2021-11-25 23:23:49.199569,0.0,1.0,0.0


# Number of unpaid instalments

In [379]:
inst_status_df = pi3_ae_instalment_plans_df[['instalment_plan_id', 'customer_id', 'created']].merge(instalments_df[['instalment_plan_id', 'status', 'total', 'scheduled']], how='left', on='instalment_plan_id')
inst_status_unpaid_df = inst_status_df[inst_status_df['status'] == 'unpaid']

In [380]:
unp_inst_plan_df = pi3_ae_instalment_plans_df[['instalment_plan_id', 'customer_id', 'created']].merge(inst_status_unpaid_df[['instalment_plan_id', 'status']].groupby(by='instalment_plan_id').count().reset_index(), how='left', on='instalment_plan_id')

In [391]:
unp_inst_plan_df.fillna(0, inplace=True)

In [392]:
unp_inst_plan_df["row_number"] = partitioned_row_number(
    df=unp_inst_plan_df,
    groupby_col_list=["customer_id"],
    sort_by_col_list=["created"]
)

In [395]:
unp_inst_plan_df_sort = unp_inst_plan_df.sort_values(["customer_id", "row_number"])

In [397]:
nr_unpaid_inst_list = []
for i in unp_inst_plan_df_sort['customer_id'].unique():
    for d in unp_inst_plan_df_sort[unp_inst_plan_df_sort['customer_id'] == i]['row_number']:
        # Number of merchants
        current_aov = unp_inst_plan_df_sort[(unp_inst_plan_df_sort['customer_id'] == i) & (unp_inst_plan_df_sort['row_number'] < d)]['status'].sum()
        nr_unpaid_inst_list.append(current_aov)

In [401]:
unp_inst_plan_df_sort["count_unpaid_instalments"] = nr_unpaid_inst_list

In [402]:
unp_inst_plan_df_sort

Unnamed: 0,instalment_plan_id,customer_id,created,status,row_number,count_paid_instalments,count_unpaid_instalments
27113,47958,64.0,2021-10-31 10:02:57.280467,0.0,1.0,0.0,0.0
11257,48388,64.0,2021-11-01 11:44:17.953913,0.0,2.0,0.0,0.0
11256,48414,64.0,2021-11-01 12:24:36.978093,0.0,3.0,0.0,0.0
8576,6419,68.0,2021-02-02 18:16:01.287941,0.0,1.0,0.0,0.0
8578,10058,68.0,2021-03-22 08:08:12.476035,0.0,2.0,0.0,0.0
...,...,...,...,...,...,...,...
14709,61330,49895.0,2021-11-25 22:17:26.492825,0.0,1.0,0.0,0.0
32477,61348,49915.0,2021-11-25 23:01:43.810158,0.0,1.0,0.0,0.0
24934,61350,49917.0,2021-11-25 23:04:20.693576,0.0,1.0,0.0,0.0
5349,61354,49925.0,2021-11-25 23:23:49.199569,0.0,1.0,0.0,0.0


# Number of paid orders

In [594]:
not_paid_status_df = instalments_df.merge(instalments_df[~instalments_df['status'].isin(['paid'])][['instalment_plan_id', 'status']].groupby(by='instalment_plan_id').min().reset_index().rename(columns={'status': 'not_paid_status'}), how='left', on='instalment_plan_id')

In [599]:
not_paid_status_df

Unnamed: 0,order,refunded_amount,penalty_fee,amount,instalment_plan_id,scheduled,completed,status,total,not_paid_status
0,0,0.0,0.0,215.00,60690,2021-12-25,NaT,due,215.00,due
1,1,0.0,0.0,215.00,60690,2022-01-25,NaT,due,215.00,due
2,1,0.0,0.0,87.33,15483,2021-07-07,2021-07-07 07:00:39.738724,paid,87.33,
3,0,0.0,0.0,87.33,15483,2021-06-07,2021-06-07 07:00:34.036459,paid,87.33,
4,1,0.0,25.0,155.75,13863,2021-06-26,2021-06-30 08:30:00.712821,paid,180.75,
...,...,...,...,...,...,...,...,...,...,...
109670,1,0.0,0.0,163.00,16096,2021-07-11,2021-07-11 06:44:39.664860,paid,163.00,
109671,0,0.0,0.0,262.83,11965,2021-05-12,2021-05-12 17:10:44.014867,paid,262.83,
109672,1,0.0,0.0,262.83,11965,2021-06-12,2021-06-12 17:11:40.222883,paid,262.83,
109673,0,0.0,0.0,35.33,35464,2021-10-24,2021-10-20 19:57:55.790414,paid,35.33,


In [600]:
fully_paid_df = not_paid_status_df[(not_paid_status_df['status'] == 'paid') & (pd.isnull(not_paid_status_df['not_paid_status']))]

In [601]:
fully_paid_df

Unnamed: 0,order,refunded_amount,penalty_fee,amount,instalment_plan_id,scheduled,completed,status,total,not_paid_status
2,1,0.0,0.0,87.33,15483,2021-07-07,2021-07-07 07:00:39.738724,paid,87.33,
3,0,0.0,0.0,87.33,15483,2021-06-07,2021-06-07 07:00:34.036459,paid,87.33,
4,1,0.0,25.0,155.75,13863,2021-06-26,2021-06-30 08:30:00.712821,paid,180.75,
5,0,0.0,0.0,155.75,13863,2021-05-26,2021-05-26 09:44:42.603487,paid,155.75,
6,1,0.0,0.0,19.12,29505,2021-10-26,2021-09-26 15:51:35.875556,paid,19.12,
...,...,...,...,...,...,...,...,...,...,...
109670,1,0.0,0.0,163.00,16096,2021-07-11,2021-07-11 06:44:39.664860,paid,163.00,
109671,0,0.0,0.0,262.83,11965,2021-05-12,2021-05-12 17:10:44.014867,paid,262.83,
109672,1,0.0,0.0,262.83,11965,2021-06-12,2021-06-12 17:11:40.222883,paid,262.83,
109673,0,0.0,0.0,35.33,35464,2021-10-24,2021-10-20 19:57:55.790414,paid,35.33,


In [608]:
fully_paid_inst_df = fully_paid_df[['instalment_plan_id', 'status', 'completed']].groupby(by='instalment_plan_id').max().reset_index()

In [609]:
fully_paid_inst_df

Unnamed: 0,instalment_plan_id,status,completed
0,184,paid,2020-05-10 10:11:52.850072
1,189,paid,2020-07-31 02:00:35.167882
2,193,paid,2020-06-29 20:03:00.223891
3,194,paid,2020-08-27 08:14:56.671049
4,195,paid,2020-08-05 02:00:30.037895
...,...,...,...
22347,60615,paid,2021-11-25 23:45:23.111594
22348,60839,paid,2021-11-25 09:14:12.625073
22349,60885,paid,2021-11-25 12:36:40.965353
22350,60901,paid,2021-11-25 17:02:57.006848


In [610]:
order_status_df = pi3_ae_instalment_plans_df[['instalment_plan_id', 'customer_id', 'created']].merge(fully_paid_inst_df, how='left', on='instalment_plan_id')
order_status_paid_df = order_status_df[order_status_df['status'] == 'paid']

In [611]:
order_status_paid_df

Unnamed: 0,instalment_plan_id,customer_id,created,status,completed
3,41903,22674.0,2021-10-13 22:27:31.449194,paid,2021-11-25 06:42:29.772191
6,43819,32082.0,2021-10-20 19:47:18.700882,paid,2021-11-09 13:03:12.952617
9,35578,23656.0,2021-09-24 12:00:29.311185,paid,2021-11-24 12:11:08.152734
16,32870,21866.0,2021-09-11 16:59:59.335408,paid,2021-11-11 17:00:31.965710
22,17377,11165.0,2021-05-17 20:49:29.874242,paid,2021-06-26 05:28:15.893411
...,...,...,...,...,...
35070,26657,16941.0,2021-08-03 08:52:49.507915,paid,2021-10-02 14:34:09.959735
35071,7547,4740.0,2021-02-17 16:25:17.712914,paid,2021-04-17 02:00:45.417184
35075,22501,5501.0,2021-07-03 05:36:06.575462,paid,2021-09-03 05:44:48.635350
35076,25184,5501.0,2021-07-24 17:43:40.258921,paid,2021-09-24 17:44:47.875461


In [612]:
order_status_paid_df[order_status_paid_df["customer_id"] == 6171]

Unnamed: 0,instalment_plan_id,customer_id,created,status,completed
10593,11306,6171.0,2021-04-06 13:58:14.349328,paid,2021-04-08 15:29:11.988935
10594,11308,6171.0,2021-04-06 14:35:38.083231,paid,2021-04-07 14:04:51.646297
10595,10112,6171.0,2021-03-22 20:42:19.962977,paid,2021-04-08 14:53:38.025220
10603,10167,6171.0,2021-03-23 21:35:42.896228,paid,2021-04-07 14:24:11.834991


In [613]:
order_status_paid_df["row_number"] = partitioned_row_number(
    df=order_status_paid_df,
    groupby_col_list=["customer_id"],
    sort_by_col_list=["created"]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  order_status_paid_df["row_number"] = partitioned_row_number(


In [614]:
order_status_paid_df_sort = order_status_paid_df.sort_values(["customer_id", "row_number"])

In [616]:
order_status_paid_df_sort[order_status_paid_df_sort["customer_id"] == 6171]

Unnamed: 0,instalment_plan_id,customer_id,created,status,completed,row_number
10595,10112,6171.0,2021-03-22 20:42:19.962977,paid,2021-04-08 14:53:38.025220,1.0
10603,10167,6171.0,2021-03-23 21:35:42.896228,paid,2021-04-07 14:24:11.834991,2.0
10593,11306,6171.0,2021-04-06 13:58:14.349328,paid,2021-04-08 15:29:11.988935,3.0
10594,11308,6171.0,2021-04-06 14:35:38.083231,paid,2021-04-07 14:04:51.646297,4.0


In [297]:
nr_paid_orders_list = []
for i in order_status_paid_df_sort['customer_id'].unique():
    for d in order_status_paid_df_sort[order_status_paid_df_sort['customer_id'] == i]['row_number']:
        # Number of merchants
        current_nr_orders = order_status_paid_df_sort[(order_status_paid_df_sort['customer_id'] == i) & (order_status_paid_df_sort['row_number'] < d)]['customer_id'].count()
        nr_paid_orders_list.append(current_nr_orders)

In [684]:
import datetime

In [701]:
nr_paid_orders_list = []
for i in [6171]:
    for d in order_status_paid_df_sort[order_status_paid_df_sort['customer_id'] == i]['row_number']:
        current_created = pd.to_datetime(order_status_paid_df_sort[(order_status_paid_df_sort['customer_id'] == i) & (order_status_paid_df_sort['row_number'] == d)]["created"])
        subset_df = order_status_paid_df_sort[(order_status_paid_df_sort['customer_id'] == i) & (order_status_paid_df_sort['row_number'] < d)]
        # print(subset_df)
        if subset_df.empty:
            current_nr_orders = 0
            nr_paid_orders_list.append(current_nr_orders)
            print(current_nr_orders)
        else:
            for x in subset_df['row_number']:
                completed_date = pd.to_datetime(subset_df[subset_df['row_number'] == x]["completed"])
                time_delta = (current_created - completed_date).dt.days
                print(time_delta)


            # datetime.datetime.today()
            # g = 0
            # for x in subset_df['row_number']:
            #     print(pd.to_datetime(subset_df[subset_df['row_number'] == x]["completed"]))
    
            #     current_created_date = pd.to_datetime(current_created)
            #     completed_date = pd.to_datetime(subset_df[subset_df['row_number'] == x]["completed"])
            #     print(current_created_date)
            #     date_diff = current_created_date - completed_date
            #     print(date_diff)
            #     print("completed_date")
        #     if np.where(pd.isnull(date_diff), 0, date_diff) > 0:
        #         g += 1

        # print("after for loop")
        # print(g)
        # max_completed = subset_df["completed"].max()

        # print(max_completed)
        # print(order_status_paid_df_sort[(order_status_paid_df_sort['customer_id'] == i) & (order_status_paid_df_sort['row_number'] == d)]["created"] > max_completed)
        # # Number of merchants
        # current_aov = order_status_paid_df_sort[(order_status_paid_df_sort['customer_id'] == i) & (order_status_paid_df_sort['row_number'] < d)]['customer_id'].count()
        # nr_paid_orders_list.append(current_aov)

0


TypeError: cannot convert the series to <class 'int'>

In [667]:
nr_paid_orders_list

[0]

In [298]:
order_status_paid_df_sort["count_paid_orders"] = nr_paid_orders_list

In [590]:
order_status_paid_df_sort[order_status_paid_df_sort["customer_id"] == 6171]

Unnamed: 0,instalment_plan_id,customer_id,created,status,row_number,count_paid_orders
10595,10112,6171.0,2021-03-22 20:42:19.962977,paid,1.0,0
10603,10167,6171.0,2021-03-23 21:35:42.896228,paid,2.0,1
10593,11306,6171.0,2021-04-06 13:58:14.349328,paid,3.0,2
10594,11308,6171.0,2021-04-06 14:35:38.083231,paid,4.0,3


In [299]:
order_status_paid_df_sort

Unnamed: 0,instalment_plan_id,customer_id,created,status,row_number,count_paid_orders
11256,48414,64.0,2021-11-01 12:24:36.978093,paid,1.0,0
8576,6419,68.0,2021-02-02 18:16:01.287941,paid,1.0,0
8570,10425,68.0,2021-03-27 15:07:28.983873,paid,2.0,1
8574,11228,68.0,2021-04-05 16:56:13.197691,paid,3.0,2
8569,20326,68.0,2021-06-14 19:49:59.043486,paid,4.0,3
...,...,...,...,...,...,...
27424,60839,45357.0,2021-11-25 08:57:38.865943,paid,1.0,0
462,58366,45910.0,2021-11-20 19:47:54.221345,paid,1.0,0
20536,59670,47457.0,2021-11-23 09:57:59.444275,paid,1.0,0
7043,59883,47760.0,2021-11-23 16:05:59.683205,paid,1.0,0


# Number of unpaid orders

In [234]:
unpaid_status_df = instalments_df[instalments_df['status'] == 'unpaid'][['instalment_plan_id', 'status']].groupby(by='instalment_plan_id').min().reset_index().rename(columns={'status': 'unpaid_status'})

In [241]:
order_status_unp_df = pi3_ae_instalment_plans_df[['instalment_plan_id', 'customer_id', 'created']].merge(unpaid_status_df, how='left', on='instalment_plan_id')
order_status_unpaid_df = order_status_unp_df[order_status_unp_df['unpaid_status'] == 'unpaid']

In [243]:
order_status_unpaid_df["row_number"] = partitioned_row_number(
    df=order_status_unpaid_df,
    groupby_col_list=["customer_id"],
    sort_by_col_list=["created"]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  order_status_unpaid_df["row_number"] = partitioned_row_number(


In [245]:
order_status_unpaid_df_sort = order_status_unpaid_df.sort_values(["customer_id", "row_number"])

In [247]:
nr_unpaid_orders_list = []
for i in order_status_unpaid_df_sort['customer_id'].unique():
    for d in order_status_unpaid_df_sort[order_status_unpaid_df_sort['customer_id'] == i]['row_number']:
        # Number of merchants
        current_aov = order_status_unpaid_df_sort[(order_status_unpaid_df_sort['customer_id'] == i) & (order_status_unpaid_df_sort['row_number'] < d)]['customer_id'].count()
        nr_unpaid_orders_list.append(current_aov)

In [248]:
order_status_unpaid_df_sort["count_unpaid_orders"] = nr_unpaid_orders_list

In [249]:
order_status_unpaid_df_sort

Unnamed: 0,instalment_plan_id,customer_id,created,unpaid_status,row_number,count_unpaid_orders
21067,37647,722.0,2021-09-30 21:54:51.952733,unpaid,1.0,0
31651,42254,805.0,2021-10-15 10:44:23.467338,unpaid,1.0,0
12416,26653,824.0,2021-08-03 08:34:03.213251,unpaid,1.0,0
12419,29494,824.0,2021-08-25 23:52:10.236232,unpaid,2.0,1
23916,44273,1059.0,2021-10-22 08:23:15.184622,unpaid,1.0,0
...,...,...,...,...,...,...
32786,45392,33405.0,2021-10-25 20:20:04.362643,unpaid,1.0,0
21998,45393,33408.0,2021-10-25 20:21:13.266821,unpaid,1.0,0
12144,45444,33466.0,2021-10-25 23:20:55.139584,unpaid,1.0,0
12899,45447,33468.0,2021-10-25 23:34:28.196677,unpaid,1.0,0


# The sum of outstanding captured debt

In [253]:
cid_status_df = pi3_ae_instalment_plans_df[['instalment_plan_id', 'customer_id']].merge(instalments_df[['instalment_plan_id', 'status', 'total']], how='left', on='instalment_plan_id')
cid_status_due_df = cid_status_df[cid_status_df['status'] == 'due']

In [258]:
inst_out_df = cid_status_due_df[['instalment_plan_id', 'total']].groupby(by='instalment_plan_id').sum().reset_index().rename(columns={'total': 'total_outstanding_debt'})

In [259]:
inst_plan_out_df = pi3_ae_instalment_plans_df[['instalment_plan_id', 'customer_id', 'created']].merge(inst_out_df, how='left', on='instalment_plan_id')

In [261]:
inst_plan_out_df["row_number"] = partitioned_row_number(
    df=inst_plan_out_df,
    groupby_col_list=["customer_id"],
    sort_by_col_list=["created"]
)

In [263]:
inst_plan_out_df_sort = inst_plan_out_df.sort_values(["customer_id", "row_number"])

In [268]:
sum_outstanding_debt_list = []
for i in inst_plan_out_df_sort['customer_id'].unique():
    for d in inst_plan_out_df_sort[inst_plan_out_df_sort['customer_id'] == i]['row_number']:
        # Number of merchants
        current_aov = inst_plan_out_df_sort[(inst_plan_out_df_sort['customer_id'] == i) & (inst_plan_out_df_sort['row_number'] < d)]['total_outstanding_debt'].sum()
        sum_outstanding_debt_list.append(current_aov)

In [271]:
inst_plan_out_df_sort["current_exposure"] = sum_outstanding_debt_list

In [272]:
inst_plan_out_df_sort

Unnamed: 0,instalment_plan_id,customer_id,created,total_outstanding_debt,row_number,count_unpaid_orders,current_exposure
27113,47958,64.0,2021-10-31 10:02:57.280467,60.66,1.0,0.00,0.00
11257,48388,64.0,2021-11-01 11:44:17.953913,15.34,2.0,60.66,60.66
11256,48414,64.0,2021-11-01 12:24:36.978093,,3.0,76.00,76.00
8576,6419,68.0,2021-02-02 18:16:01.287941,,1.0,0.00,0.00
8578,10058,68.0,2021-03-22 08:08:12.476035,,2.0,0.00,0.00
...,...,...,...,...,...,...,...
14709,61330,49895.0,2021-11-25 22:17:26.492825,361.34,1.0,0.00,0.00
32477,61348,49915.0,2021-11-25 23:01:43.810158,552.00,1.0,0.00,0.00
24934,61350,49917.0,2021-11-25 23:04:20.693576,399.34,1.0,0.00,0.00
5349,61354,49925.0,2021-11-25 23:23:49.199569,166.66,1.0,0.00,0.00


# The total order sum amount

In [287]:
not_paid_status_df = instalments_df.merge(instalments_df[~instalments_df['status'].isin(['paid'])][['instalment_plan_id', 'status']].groupby(by='instalment_plan_id').min().reset_index().rename(columns={'status': 'not_paid_status'}), how='left', on='instalment_plan_id')

fully_paid_df = not_paid_status_df[(not_paid_status_df['status'] == 'paid') & (pd.isnull(not_paid_status_df['not_paid_status']))]

fully_paid_inst_df = fully_paid_df[['instalment_plan_id', 'status']].groupby(by='instalment_plan_id').min().reset_index()

order_status_df = pi3_ae_instalment_plans_df[['instalment_plan_id', 'customer_id', 'created', 'total_amount']].merge(fully_paid_inst_df, how='left', on='instalment_plan_id')
order_status_paid_df_sum = order_status_df[order_status_df['status'] == 'paid']

order_status_paid_df_sum["row_number"] = partitioned_row_number(
    df=order_status_paid_df_sum,
    groupby_col_list=["customer_id"],
    sort_by_col_list=["created"]
)

order_status_paid_df_sum_sort = order_status_paid_df_sum.sort_values(["customer_id", "row_number"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  order_status_paid_df_sum["row_number"] = partitioned_row_number(


In [288]:
sum_paid_orders_list = []
for i in order_status_paid_df_sum_sort['customer_id'].unique():
    for d in order_status_paid_df_sum_sort[order_status_paid_df_sum_sort['customer_id'] == i]['row_number']:
        # Number of merchants
        current_aov = order_status_paid_df_sum_sort[(order_status_paid_df_sum_sort['customer_id'] == i) & (order_status_paid_df_sum_sort['row_number'] < d)]['total_amount'].sum()
        sum_paid_orders_list.append(current_aov)

In [289]:
order_status_paid_df_sum_sort["sum_paid_amount"] = sum_paid_orders_list

# Days since last unpaid

In [553]:
cid_scheduled_status_df = pi3_ae_instalment_plans_df.loc[:,['instalment_plan_id', 'customer_id', 'created']].merge(instalments_df.loc[:,['instalment_plan_id', 'scheduled', 'status', 'total']], how='left', on='instalment_plan_id')
cid_scheduled_status_unpaid_df = cid_scheduled_status_df.loc[cid_status_df.loc[:,'status'] == 'unpaid']

In [555]:
last_unpaid_df = cid_scheduled_status_unpaid_df.groupby(by='instalment_plan_id')["scheduled"].max().reset_index()

In [570]:
last_inst_df = cid_scheduled_status_df.groupby(by='instalment_plan_id')["scheduled"].max().reset_index()

In [557]:
cid_scheduled_status_last_unpaid_df = last_unpaid_df.merge(cid_scheduled_status_unpaid_df, how="left", on=["instalment_plan_id", "scheduled"])

In [572]:
cid_scheduled_status_last_inst_df = last_inst_df.merge(cid_scheduled_status_df, how="left", on=["instalment_plan_id", "scheduled"])

In [559]:
cid_scheduled_status_last_unpaid_df.loc[:,"row_number"] = partitioned_row_number(
    df=cid_scheduled_status_last_unpaid_df,
    groupby_col_list=["customer_id"],
    sort_by_col_list=["scheduled"]
)

In [574]:
cid_scheduled_status_last_inst_df.loc[:,"row_number"] = partitioned_row_number(
    df=cid_scheduled_status_last_inst_df,
    groupby_col_list=["customer_id"],
    sort_by_col_list=["scheduled"]
)

In [560]:
cid_scheduled_status_last_unpaid_df_sort = cid_scheduled_status_last_unpaid_df.sort_values(["customer_id", "row_number"])

In [575]:
cid_scheduled_status_last_inst_df_sort = cid_scheduled_status_last_inst_df.sort_values(["customer_id", "row_number"])

In [562]:
cid_scheduled_status_last_unpaid_df_sort.loc[:,"scheduled_lag"] = cid_scheduled_status_last_unpaid_df_sort.groupby(by="customer_id")["scheduled"].shift(1)

In [566]:
# days_since_last_unpaid_df = days_from_today(
#     df = cid_scheduled_status_last_unpaid_df_sort,
#     new_col_name = 'days_since_last_unpaid',
#     date_col = 'scheduled_lag',
#     drop_orig_col = True
#     )

days_since_last_unpaid_df = two_date_cols_diff(
    df = cid_scheduled_status_last_unpaid_df_sort,
    new_col_name = "days_since_last_unpaid",
    first_col = "created",
    second_col= "scheduled_lag"
)

In [518]:
days_since_last_unpaid_df.loc[pd.notnull(days_since_last_unpaid_df.loc[:,"days_since_last_unpaid"])]

Unnamed: 0,instalment_plan_id,scheduled,customer_id,status,total,row_number,days_since_last_unpaid
300,29494,2021-10-25,824.0,unpaid,823.00,2.0,58.0
167,21987,2021-08-31,1213.0,unpaid,212.33,2.0,106.0
745,45002,2021-11-25,1481.0,unpaid,198.33,2.0,22.0
135,21071,2021-08-21,4227.0,unpaid,1427.35,2.0,115.0
177,22773,2021-09-04,4227.0,unpaid,1963.00,3.0,101.0
...,...,...,...,...,...,...,...
741,44940,2021-11-24,29859.0,unpaid,65.00,3.0,6.0
669,43458,2021-11-19,31159.0,unpaid,353.67,2.0,13.0
707,44306,2021-11-22,31159.0,unpaid,413.67,3.0,11.0
711,44402,2021-11-22,31159.0,unpaid,130.67,4.0,8.0


# Merge everything

In [520]:
behavioural_instalment_plan_df = pi3_ae_instalment_plans_id_df.merge(
    aov_tmp_df_sort[['instalment_plan_id', 'created', 'total_amount', 'avg_order_value']], how='left', on='instalment_plan_id'
    ).merge(cid_avg_fees_df_sort[['instalment_plan_id', 'avg_fees_per_order_30d', 'avg_fees_per_order_90d', 'avg_fees_per_order_180d', 'avg_fees_per_order_365d']], how='left', on='instalment_plan_id'
    ).merge(nr_merch_tmp_df_sort[['instalment_plan_id', 'count_merchants_per_customer']], how='left', on='instalment_plan_id'
    ).merge(due_max_schedule_inst_df_sort[['instalment_plan_id', 'count_open_orders']], how='left', on='instalment_plan_id'
    ).merge(paid_inst_plan_df_sort[['instalment_plan_id', 'count_paid_instalments']], how='left', on='instalment_plan_id'
    ).merge(unp_inst_plan_df_sort[['instalment_plan_id', 'count_unpaid_instalments']], how='left', on='instalment_plan_id'
    ).merge(order_status_paid_df_sort[['instalment_plan_id', 'count_paid_orders']], how='left', on='instalment_plan_id'
    ).merge(order_status_unpaid_df_sort[['instalment_plan_id', 'count_unpaid_orders']], how='left', on='instalment_plan_id'
    ).merge(inst_plan_out_df_sort[['instalment_plan_id', 'current_exposure']], how='left', on='instalment_plan_id'
    ).merge(order_status_paid_df_sum_sort[['instalment_plan_id', 'sum_paid_amount']], how='left', on='instalment_plan_id'
    ).merge(filtered_customers_df, how='left', on='customer_id'
    ).merge(days_since_last_unpaid_df[['instalment_plan_id', 'days_since_last_unpaid']], how='left', on='instalment_plan_id'
    )

In [521]:
behavioural_instalment_plan_df

Unnamed: 0,customer_id,instalment_plan_id,order_id,created,total_amount,avg_order_value,avg_fees_per_order_30d,avg_fees_per_order_90d,avg_fees_per_order_180d,avg_fees_per_order_365d,count_merchants_per_customer,count_open_orders,count_paid_instalments,count_unpaid_instalments,count_paid_orders,count_unpaid_orders,current_exposure,sum_paid_amount,customer_first_joined,date_of_birth,days_since_last_unpaid
0,31808.0,43420,84374,2021-10-19 16:40:32.476678,191.00,,,,,,0,0.0,0.0,0.0,,,0.00,,2021-10-19 16:39:12.250809,,
1,31808.0,46693,90779,2021-10-28 14:02:23.620032,280.00,191.000,0.0,0.0,0.0,0.0,1,1.0,1.0,0.0,,,63.67,,2021-10-19 16:39:12.250809,,
2,22674.0,45940,89259,2021-10-27 04:52:04.402399,920.00,3589.500,0.0,0.0,0.0,0.0,2,0.0,4.0,0.0,,,0.00,,2021-09-16 18:55:26.896645,1986-02-09,
3,22674.0,41903,81244,2021-10-13 22:27:31.449194,485.00,6694.000,0.0,0.0,0.0,0.0,1,,2.0,0.0,1.0,,0.00,6694.00,2021-09-16 18:55:26.896645,1986-02-09,
4,41251.0,54188,104657,2021-11-12 13:02:56.677539,108.00,,,,,,0,0.0,0.0,0.0,,,0.00,,2021-11-12 13:01:25.774520,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35074,25888.0,47337,91929,2021-10-30 00:12:20.262547,844.55,719.100,0.0,0.0,0.0,0.0,1,1.0,1.0,0.0,,,239.70,,2021-09-30 07:19:44.798826,,
35075,5501.0,22501,42368,2021-07-03 05:36:06.575462,156.60,257.250,,,0.0,0.0,1,,2.0,0.0,1.0,,0.00,257.25,2021-03-01 19:38:51.445106,1988-11-30,
35076,5501.0,25184,47129,2021-07-24 17:43:40.258921,100.00,206.925,0.0,0.0,0.0,0.0,2,,4.0,0.0,2.0,,0.00,413.85,2021-03-01 19:38:51.445106,1988-11-30,
35077,45830.0,58713,112317,2021-11-21 13:36:43.336510,114.00,,,,,,0,0.0,0.0,0.0,,,0.00,,2021-11-20 16:47:03.808564,1993-05-29,
