In [1]:
# Library
import pandas as pd
import numpy as np
import calendar
import copy
import datetime
# import multiprocessing
# import helper as h
from tqdm import tqdm
# from itertools import repeat

In [2]:
df = pd.read_csv('daily_transactions.csv', sep=',')
print(df.shape)
df.head(5)

(1210998, 3)


Unnamed: 0,_id,event_date,usertype
0,By1+rEy20nW/sgRehb+RSZe9VI8=,2021-02-10,A
1,x7lxSW+y6Kymzxf1yrriMXDNZWQ=,2021-02-10,A
2,BQ7feJAnc6ntg4PaOXShHLrd3xQ=,2021-02-24,A
3,xmc9CsW3Q7K9HK5+pczuRvKUCBk=,2020-12-08,A
4,Dt0y6YCKnC/mD9Sm1SkNkwsrgzY=,2020-06-17,A


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1210998 entries, 0 to 1210997
Data columns (total 3 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   _id         1210998 non-null  object
 1   event_date  1210998 non-null  object
 2   usertype    1210998 non-null  object
dtypes: object(3)
memory usage: 27.7+ MB


In [4]:
# Convert column from obj to datetime
df['event_date'] = pd.to_datetime(df['event_date'])

# Sort dataframe
df = df[df['event_date'] <= '2021-04-30']
df = df.sort_values(by=['_id', 'event_date'])

In [5]:
# unique_id = list(df['_id'].unique())

# with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
#     index = pool.starmap(h.check_usertype, zip(unique_id, repeat(df)))

# # Drop index
# df.drop(index)

In [6]:
# Store result
months = list(calendar.month_name[1:])
years = [2020, 2021]
newUser = {}

# Create dict to store new user
for y in years:
    for m in months:
        newUser["{}-{}".format(m, y)] = 0

# Copy
dropOff = copy.deepcopy(newUser)
returnUser = copy.deepcopy(newUser)

In [7]:
# Convert column from obj to datetime
df['event_date'] = pd.to_datetime(df['event_date'])

# Sort dataframe and drop duplicate
df_ft = df.drop_duplicates(subset=['_id'], keep='first')
df_ft = df_ft.sort_values(by='event_date')
df_ft = df_ft[df_ft['event_date'] >= '2020-04-01']

# Add columns
df_ft['first_tran'] = True

# Count new user for each months
for _, row in tqdm(df_ft.iterrows()):
    newUser["{}-{}".format(calendar.month_name[row['event_date'].month], row['event_date'].year)] += 1

# New users
newUser = {k:v for k,v in newUser.items() if v != 0}
newUser

311386it [00:14, 21044.58it/s]


{'April-2020': 15117,
 'May-2020': 20295,
 'June-2020': 22949,
 'July-2020': 30152,
 'August-2020': 28019,
 'September-2020': 21693,
 'October-2020': 22854,
 'November-2020': 18875,
 'December-2020': 19071,
 'January-2021': 23389,
 'February-2021': 22676,
 'March-2021': 31369,
 'April-2021': 34927}

In [8]:
# Sort dataframe and drop duplicate
df_lt_churn = df.drop_duplicates(subset=['_id'], keep='last')
df_lt_churn = df_lt_churn.sort_values(by='event_date')
df_lt_store = df_lt_churn[df_lt_churn['event_date'] >= '2020-04-01']

# Churn periods
churn = {'A': 360, 'B': 360, 'C': 120, 'D': 260}

# Add the churn values to all of the row
for idx, row in tqdm(df_lt_churn.iterrows()):
    df_lt_churn.at[idx, 'event_date'] = row['event_date'] + datetime.timedelta(days=churn[row['usertype']])

# Filter
df_lt_churn = df_lt_churn[(df_lt_churn['event_date'] >= '2020-04-01') & (df_lt_churn['event_date'] <= '2021-04-30')]

# Count drop off user for each months
for _, row in tqdm(df_lt_churn.iterrows()):
    dropOff["{}-{}".format(calendar.month_name[row['event_date'].month], row['event_date'].year)] += 1

# Old users
dropOff

432458it [00:38, 11317.63it/s]
131718it [00:08, 15630.48it/s]


{'January-2020': 0,
 'February-2020': 0,
 'March-2020': 0,
 'April-2020': 675,
 'May-2020': 7646,
 'June-2020': 10327,
 'July-2020': 11909,
 'August-2020': 10768,
 'September-2020': 9886,
 'October-2020': 9867,
 'November-2020': 9457,
 'December-2020': 9058,
 'January-2021': 12710,
 'February-2021': 12347,
 'March-2021': 12049,
 'April-2021': 15019,
 'May-2021': 0,
 'June-2021': 0,
 'July-2021': 0,
 'August-2021': 0,
 'September-2021': 0,
 'October-2021': 0,
 'November-2021': 0,
 'December-2021': 0}

In [9]:
# Add columns
df_lt_store['last_tran'] = True

# Filter
df = df[(df['event_date'] >= '2020-04-01') & (df['event_date'] <= '2021-04-30')]

# Merge
df_merge = pd.merge(df_ft, df_lt_store, how="outer", on=['_id', 'event_date', 'usertype'])
df_merge = pd.merge(df_merge, df, how="outer", on=['_id', 'event_date', 'usertype'])
df_merge = df_merge.replace(np.nan, False)

# Remove value with single transaction (true & true)
df_merge = df_merge[(df_merge['first_tran'] != True) | (df_merge['last_tran'] != True)]
df_merge = df_merge[(df_merge['first_tran'] != True)]
df_merge

Unnamed: 0,_id,event_date,usertype,first_tran,last_tran
311386,6Y4Jc+NdpKhtg/GB6li2zdilNro=,2020-04-01,B,False,True
311387,zQlkXb8r1yWw6PIMVe3zjF032V0=,2020-04-01,B,False,True
311388,52DpInrChwpjD+mkPiQITr+HiNk=,2020-04-01,B,False,True
311389,vliK6hTpjySTNuGaPrm/vtl7/Fk=,2020-04-01,B,False,True
311390,Q89Z07wROR9XJadMiO0D7sto/Fc=,2020-04-01,B,False,True
...,...,...,...,...,...
895423,zzyMFNFuAwqAN1FMkTmCD7gs1fU=,2021-02-26,A,False,False
895424,zzyMFNFuAwqAN1FMkTmCD7gs1fU=,2021-02-28,A,False,False
895425,zzyMFNFuAwqAN1FMkTmCD7gs1fU=,2021-03-01,A,False,False
895426,zzyMFNFuAwqAN1FMkTmCD7gs1fU=,2021-03-02,A,False,False


In [10]:
# Sort dataframe and drop duplicate
df_ft_churn = df.drop_duplicates(subset=['_id'], keep='first')
df_ft_churn = df_ft_churn.sort_values(by='event_date')

# Filter
df_ft_churn = df_ft_churn[df_ft_churn['event_date'] <= '2021-04-30']
df_ft_churn.sort_values(by='event_date')

Unnamed: 0,_id,event_date,usertype
833417,q1/0IoEgxi/BmKwRHp2xy6T1JQM=,2020-04-01,B
1053505,4h+NeisC36wf+wpTUmwH/PCvRd8=,2020-04-01,B
799307,zmGmxnhpigxNAEsqn1bKFKiy7VI=,2020-04-01,B
922534,/BFF0F9L6vZhUlsBW7mWgykToDU=,2020-04-01,B
51526,91T/AvqW+P88nPYrtpQlUwr2byg=,2020-04-01,A
...,...,...,...
581720,elddre5JSZM209JePiSTPRg5xCg=,2021-04-30,B
559437,tDPcSodx3YqWiLzT9WV5IzXRMls=,2021-04-30,B
533675,4SFC/ZCvrScWYtCzVaCAbImnTi8=,2021-04-30,B
432040,woZ+TJLXF46DFAn1AGF8OcuYxMU=,2021-04-30,B


In [11]:
# Convert to numpy array
user_ids = list(df_merge['_id'].unique())
np_merge = df_merge.to_numpy()
np_ft_churn = df_ft_churn.to_numpy()

In [12]:
for uid in tqdm(user_ids):
    i = np.where(np_merge == uid)[0]
    v = np.where(np_ft_churn == uid)[0]
    np_temp = np_merge[i]
    np_temp_ft = np_ft_churn[v]

    # Check if single
    if len(np_temp) <= 1:
        continue

    
    for r in np_temp:
        if r[1] < churn_period:
            continue
        
        else:
            dropOff["{}-{}".format(calendar.month_name[churn_period.month], churn_period.year)] += 1
            returnUser["{}-{}".format(calendar.month_name[r[1].month], r[1].year)] += 1

            if r[4] == False:
                churn_period = r[1] + datetime.timedelta(days=churn[r[2]])

returnUser = {k:v for k,v in returnUser.items() if v != 0}
returnUser

100%|██████████| 173266/173266 [4:03:46<00:00, 11.85it/s]  


{'April-2020': 8946,
 'May-2020': 10427,
 'June-2020': 10654,
 'July-2020': 14152,
 'August-2020': 14820,
 'September-2020': 11405,
 'October-2020': 12123,
 'November-2020': 10177,
 'December-2020': 10539,
 'January-2021': 12932,
 'February-2021': 13036,
 'March-2021': 18686,
 'April-2021': 26333}

In [17]:
returnUser = {k:v for k,v in returnUser.items() if v != 0}
returnUser

{'April-2020': 8946,
 'May-2020': 10427,
 'June-2020': 10654,
 'July-2020': 14152,
 'August-2020': 14820,
 'September-2020': 11405,
 'October-2020': 12123,
 'November-2020': 10177,
 'December-2020': 10539,
 'January-2021': 12932,
 'February-2021': 13036,
 'March-2021': 18686,
 'April-2021': 26333}

In [13]:
# # Run multiprocessing
# with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
#     result = pool.starmap(h.find_return_user, zip(user_ids, repeat(np_merge), repeat(np_ft_churn), repeat(dropOff), repeat(returnUser)))    

# # Print result
# returnUser = {k:v for k,v in result[1].items() if v != 0}
# returnUser

In [14]:
# Print result
dropOff = {k:v for k,v in dropOff.items() if v != 0}
dropOff

{'April-2020': 15679,
 'May-2020': 21396,
 'June-2020': 23823,
 'July-2020': 29144,
 'August-2020': 26924,
 'September-2020': 21450,
 'October-2020': 22533,
 'November-2020': 19245,
 'December-2020': 19225,
 'January-2021': 25236,
 'February-2021': 24207,
 'March-2021': 28083,
 'April-2021': 29003}

In [15]:
# test = {'_id': ['abc', 'abc', 'abc', 'abc'], 
#         'event_date': ['2020-04-01', '2020-10-20', '2020-10-21', '2021-04-29'], 
#         'usertype': ['C', 'C', 'C', 'C'], 
#         'first_tran': [True, False, False, False], 
#         'last_tran': [False, False, False, True]
#     }

# # Churn periods
# churn = {'A': 360, 'B': 360, 'C': 120, 'D': 260}

# df_test = pd.DataFrame(test)
# df_test['event_date'] = pd.to_datetime(df_test['event_date'])

# churn_period = datetime.time()
# for idx, row in df_test.iterrows():
#     if row['first_tran'] == True:
#         churn_period = row['event_date'] + datetime.timedelta(days=churn[row['usertype']])
#         continue
    
#     if row['event_date'] < churn_period:
#         continue
#     else:
#         dropOff["{}-{}".format(calendar.month_name[churn_period.month], churn_period.year)] += 1
#         returnUser["{}-{}".format(calendar.month_name[row['event_date'].month], row['event_date'].year)] += 1
#         churn_period = row['event_date'] + datetime.timedelta(days=churn[row['usertype']])

# returnUser = {k:v for k,v in returnUser.items() if v != 0}
# dropOff = {k:v for k,v in dropOff.items() if v != 0}

# print(dropOff, returnUser)

In [16]:
# # Grouping
# df_a = df_sep.loc[df['usertype'] == 'A']
# df_b = df_sep.loc[df['usertype'] == 'B']
# df_c = df_sep.loc[df['usertype'] == 'C']
# df_d = df_sep.loc[df['usertype'] == 'D']

# # Data Summary
# display = pd.DataFrame({'usertype': ['A', 'B', 'C', 'D'], 'unique': [len(df_a['_id'].unique()), len(df_b['_id'].unique()), len(df_c['_id'].unique()), len(df_d['_id'].unique())], 'count': [len(df_a), len(df_b), len(df_c), len(df_d)]})
# display