In [71]:
# Library
import pandas as pd
import numpy as np
import calendar
import copy
import datetime
from tqdm import tqdm

In [72]:
df = pd.read_csv('daily_transactions.csv', sep=',')
print(df.shape)
df.head(5)

(1210998, 3)


Unnamed: 0,_id,event_date,usertype
0,By1+rEy20nW/sgRehb+RSZe9VI8=,2021-02-10,A
1,x7lxSW+y6Kymzxf1yrriMXDNZWQ=,2021-02-10,A
2,BQ7feJAnc6ntg4PaOXShHLrd3xQ=,2021-02-24,A
3,xmc9CsW3Q7K9HK5+pczuRvKUCBk=,2020-12-08,A
4,Dt0y6YCKnC/mD9Sm1SkNkwsrgzY=,2020-06-17,A


In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1210998 entries, 0 to 1210997
Data columns (total 3 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   _id         1210998 non-null  object
 1   event_date  1210998 non-null  object
 2   usertype    1210998 non-null  object
dtypes: object(3)
memory usage: 27.7+ MB


In [74]:
# Convert column from obj to datetime
df['event_date'] = pd.to_datetime(df['event_date'])

# Sort dataframe
df = df[df['event_date'] <= '2021-04-30']
df = df.sort_values(by=['_id', 'event_date'])

In [75]:
# Store result
months = list(calendar.month_name[1:])
years = [2020, 2021]
newUser = {}

# Create dict to store new user
for y in years:
    for m in months:
        newUser["{}-{}".format(m, y)] = 0

# Copy
dropOff = copy.deepcopy(newUser)
returnUser = copy.deepcopy(newUser)

In [76]:
# Convert column from obj to datetime
df['event_date'] = pd.to_datetime(df['event_date'])

# Sort dataframe and drop duplicate
df_ft = df.drop_duplicates(subset=['_id'], keep='first')
df_ft = df_ft.sort_values(by='event_date')
df_ft = df_ft[df_ft['event_date'] >= '2020-04-01']

# Add columns
df_ft['first_tran'] = True

# Count new user for each months
for _, row in tqdm(df_ft.iterrows()):
    newUser["{}-{}".format(calendar.month_name[row['event_date'].month], row['event_date'].year)] += 1

# New users
newUser = {k:v for k,v in newUser.items() if v != 0}
newUser

311386it [00:15, 19480.19it/s]


{'April-2020': 15117,
 'May-2020': 20295,
 'June-2020': 22949,
 'July-2020': 30152,
 'August-2020': 28019,
 'September-2020': 21693,
 'October-2020': 22854,
 'November-2020': 18875,
 'December-2020': 19071,
 'January-2021': 23389,
 'February-2021': 22676,
 'March-2021': 31369,
 'April-2021': 34927}

In [77]:
# Sort dataframe and drop duplicate
df_lt_churn = df.drop_duplicates(subset=['_id'], keep='last')
df_lt_churn = df_lt_churn.sort_values(by='event_date')
df_lt_store = df_lt_churn[df_lt_churn['event_date'] >= '2020-04-01']

# Churn periods
churn = {'A': 360, 'B': 360, 'C': 120, 'D': 260}

# Add the churn values to all of the row
for idx, row in tqdm(df_lt_churn.iterrows()):
    df_lt_churn.at[idx, 'event_date'] = row['event_date'] + datetime.timedelta(days=churn[row['usertype']])

# Filter
df_lt_churn = df_lt_churn[(df_lt_churn['event_date'] >= '2020-04-01') & (df_lt_churn['event_date'] <= '2021-04-30')]

# Count drop off user for each months
for _, row in tqdm(df_lt_churn.iterrows()):
    dropOff["{}-{}".format(calendar.month_name[row['event_date'].month], row['event_date'].year)] += 1

# Old users
dropOff

432458it [00:56, 7711.75it/s]
131718it [00:13, 9793.70it/s] 


{'January-2020': 0,
 'February-2020': 0,
 'March-2020': 0,
 'April-2020': 675,
 'May-2020': 7646,
 'June-2020': 10327,
 'July-2020': 11909,
 'August-2020': 10768,
 'September-2020': 9886,
 'October-2020': 9867,
 'November-2020': 9457,
 'December-2020': 9058,
 'January-2021': 12710,
 'February-2021': 12347,
 'March-2021': 12049,
 'April-2021': 15019,
 'May-2021': 0,
 'June-2021': 0,
 'July-2021': 0,
 'August-2021': 0,
 'September-2021': 0,
 'October-2021': 0,
 'November-2021': 0,
 'December-2021': 0}

In [78]:
# Add columns
df_lt_store['last_tran'] = True

# Filter
df = df[(df['event_date'] >= '2020-04-01') & (df['event_date'] <= '2021-04-30')]

# Merge
df_merge = pd.merge(df_ft, df_lt_store, how="outer", on=['_id', 'event_date', 'usertype'])
df_merge = pd.merge(df_merge, df, how="outer", on=['_id', 'event_date', 'usertype'])
df_merge = df_merge.replace(np.nan, False)

# Remove value with single transaction (true & true)
df_merge = df_merge[(df_merge['first_tran'] != True) | (df_merge['last_tran'] != True)]
df_merge

Unnamed: 0,_id,event_date,usertype,first_tran,last_tran
0,R1bDR3tbxX/hvdNsLp4W+BHzvB0=,2020-04-01,A,True,False
1,hVisMGVBErtNtn7OcTG+LME0FZg=,2020-04-01,B,True,False
2,ZKqXtgPrXuRKTTcapdKt3KpUqvM=,2020-04-01,B,True,False
4,83hwxDOo1IgICTQtH7mSfjvEIlA=,2020-04-01,C,True,False
6,gwvESIj5VO+DCGJP2L3TapcCMDU=,2020-04-01,A,True,False
...,...,...,...,...,...
895423,zzyMFNFuAwqAN1FMkTmCD7gs1fU=,2021-02-26,A,False,False
895424,zzyMFNFuAwqAN1FMkTmCD7gs1fU=,2021-02-28,A,False,False
895425,zzyMFNFuAwqAN1FMkTmCD7gs1fU=,2021-03-01,A,False,False
895426,zzyMFNFuAwqAN1FMkTmCD7gs1fU=,2021-03-02,A,False,False


In [98]:
# Sort dataframe and drop duplicate
df_ft_churn = df.drop_duplicates(subset=['_id'], keep='first')
df_ft_churn = df_ft_churn.sort_values(by='event_date')
df_ft_churn['churn_date'] = datetime.time()

# Add the churn values to all of the row
for idx, row in tqdm(df_ft_churn.iterrows()):
    diff = row['event_date'] + datetime.timedelta(days=churn[row['usertype']])
    df_ft_churn.at[idx, 'churn_date'] = diff.strftime("%Y-%m-%d")

# Filter
df_ft_churn = df_ft_churn[df_ft_churn['churn_date'] <= '2021-04-30']
df_ft_churn.sort_values(by='event_date')


326338it [00:24, 13536.67it/s]


Unnamed: 0,_id,event_date,usertype,churn_date
833417,q1/0IoEgxi/BmKwRHp2xy6T1JQM=,2020-04-01,B,2021-03-27
1053505,4h+NeisC36wf+wpTUmwH/PCvRd8=,2020-04-01,B,2021-03-27
799307,zmGmxnhpigxNAEsqn1bKFKiy7VI=,2020-04-01,B,2021-03-27
922534,/BFF0F9L6vZhUlsBW7mWgykToDU=,2020-04-01,B,2021-03-27
51526,91T/AvqW+P88nPYrtpQlUwr2byg=,2020-04-01,A,2021-03-27
...,...,...,...,...
1090793,CapK0GrKewGx5w6qU1F81xWkEFc=,2020-12-31,C,2021-04-30
1075536,17zvnJ0seby0cc81/YzOGiEiXDM=,2020-12-31,C,2021-04-30
1130066,Hi89xWQ2avfRSlblZ5/6MVf1Il0=,2020-12-31,C,2021-04-30
1066088,3UCkOcMQct2xFkGZWzt10E3rSAQ=,2020-12-31,C,2021-04-30


In [95]:
# Merge
common_ids = pd.merge(df_ft_churn, df_merge, how='inner', on=['_id', 'usertype'])
common_ids = common_ids[['_id', 'usertype']]
common_ids = common_ids.drop_duplicates(subset=['_id'], keep='first')

store_val = []
for _, row in tqdm(common_ids.iterrows()):
    df_temp = df_merge.loc[df_merge['_id'] == row['_id']]
    
    # Check if single
    if len(df_temp) <= 1:
        continue

    store_val.append({'_id': row['_id'], 'event_date': })

    
#     churn_period = row['event_date']
#     for i, r in df_temp.iterrows():
#         if r['event_date'] < churn_period:
#             continue
        
#         else:
#             dropOff["{}-{}".format(calendar.month_name[churn_period.month], churn_period.year)] += 1
#             returnUser["{}-{}".format(calendar.month_name[r['event_date'].month], r['event_date'].year)] += 1

#             if r['last_tran'] == False:
#                 churn_period = r['event_date'] + datetime.timedelta(days=churn[r['usertype']])

# returnUser = {k:v for k,v in returnUser.items() if v != 0}
# returnUser

Unnamed: 0,_id,usertype
0,q1/0IoEgxi/BmKwRHp2xy6T1JQM=,B
1,v97fgYvhMqd4wEtaeiFJjVq0ILk=,A
12,8FWjBrNfpfO2x8lhvSmHKVoEhdk=,A
31,8FLRNrfurG4jpWlTzCXMGxmA7pk=,B
33,8FHyjjZzITzLdytXmkrrzT/rjF0=,B
...,...,...
742351,l2VzuHs1lPBkCR+QgVrezCHjQE0=,B
742352,HiLQ01kCZAIxSNfFHzvo3GsH5t8=,A
742353,dOUm2ZKRYyvv4/tNi6tJkgVknz4=,A
742354,xofvz2dFA/KyKcwewgVH2NeY4xM=,C


In [81]:
# test = {'_id': ['abc', 'abc', 'abc', 'abc'], 
#         'event_date': ['2020-04-01', '2020-10-20', '2020-10-21', '2021-04-29'], 
#         'usertype': ['C', 'C', 'C', 'C'], 
#         'first_tran': [True, False, False, False], 
#         'last_tran': [False, False, False, True]
#     }

# # Churn periods
# churn = {'A': 360, 'B': 360, 'C': 120, 'D': 260}

# df_test = pd.DataFrame(test)
# df_test['event_date'] = pd.to_datetime(df_test['event_date'])

# churn_period = datetime.time()
# for idx, row in df_test.iterrows():
#     if row['first_tran'] == True:
#         churn_period = row['event_date'] + datetime.timedelta(days=churn[row['usertype']])
#         continue
    
#     if row['event_date'] < churn_period:
#         continue
#     else:
#         dropOff["{}-{}".format(calendar.month_name[churn_period.month], churn_period.year)] += 1
#         returnUser["{}-{}".format(calendar.month_name[row['event_date'].month], row['event_date'].year)] += 1
#         churn_period = row['event_date'] + datetime.timedelta(days=churn[row['usertype']])

# returnUser = {k:v for k,v in returnUser.items() if v != 0}
# dropOff = {k:v for k,v in dropOff.items() if v != 0}

# print(dropOff, returnUser)

In [82]:
# for idx, row in tqdm(df_ft_churn.iterrows()):
#     df_temp = df_merge.loc[df_merge['_id'] == row['_id']]

#     # Check if single
#     if len(df_temp) <= 1:
#         continue
    
#     churn_period = row['event_date']
#     for i, r in df_temp.iterrows():
#         if r['event_date'] < churn_period:
#             continue
        
#         else:
#             dropOff["{}-{}".format(calendar.month_name[churn_period.month], churn_period.year)] += 1
#             returnUser["{}-{}".format(calendar.month_name[r['event_date'].month], r['event_date'].year)] += 1

#             if r['last_tran'] == False:
#                 churn_period = r['event_date'] + datetime.timedelta(days=churn[r['usertype']])

# returnUser = {k:v for k,v in returnUser.items() if v != 0}
# returnUser

In [83]:
# for idx, row in tqdm(df_ft.iterrows()):
#     df_temp = df_merge.loc[df_merge['_id'] == row['_id']]

#     # Check if single
#     if len(df_temp) == 1:
#         continue
    
#     # Calculate churn periods
#     churn_period = row['event_date'] + datetime.timedelta(days=churn[row['usertype']])

#     if churn_period.year in years:
#         for i, r in df_temp.iterrows():
#             if r['first_tran'] == True:
#                 continue

#             if r['event_date'] < churn_period:
#                 continue
            
#             else:
#                 dropOff["{}-{}".format(calendar.month_name[churn_period.month], churn_period.year)] += 1
#                 returnUser["{}-{}".format(calendar.month_name[r['event_date'].month], r['event_date'].year)] += 1

#                 if r['last_tran'] == False:
#                     churn_period = r['event_date'] + datetime.timedelta(days=churn[r['usertype']])

# returnUser = {k:v for k,v in returnUser.items() if v != 0}
# returnUser

In [84]:
# # Grouping
# df_a = df_sep.loc[df['usertype'] == 'A']
# df_b = df_sep.loc[df['usertype'] == 'B']
# df_c = df_sep.loc[df['usertype'] == 'C']
# df_d = df_sep.loc[df['usertype'] == 'D']

# # Data Summary
# display = pd.DataFrame({'usertype': ['A', 'B', 'C', 'D'], 'unique': [len(df_a['_id'].unique()), len(df_b['_id'].unique()), len(df_c['_id'].unique()), len(df_d['_id'].unique())], 'count': [len(df_a), len(df_b), len(df_c), len(df_d)]})
# display