In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [413]:
!unzip /content/drive/MyDrive/customer_act.zip

Archive:  /content/drive/MyDrive/customer_act.zip
  inflating: customer_login_history.csv  
  inflating: invoice_history.csv     
  inflating: static_data.csv         
  inflating: transfer_history.csv    


In [414]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
import seaborn as sns
from IPython.core.pylabtools import figsize
import matplotlib.pyplot as plt
import datetime
%matplotlib inline

sns.set(font_scale=1)
figsize(10, 10)

In [415]:

# EXPLORE STATIC DATA 
df_static = pd.read_csv('/content/static_data.csv')
df_static = df_static.rename(columns={'Unnamed: 0': 'index'})
df_static.set_index('index', inplace=True)
df_static.drop('PARTY_ID.1', inplace=True, axis=1)
df_static = df_static.drop_duplicates()
df_static.loc[:, 'CREATED_ON'] = df_static.CREATED_ON.astype('datetime64')
df_static.loc[:, 'LAST_LOGIN'] = df_static.LAST_LOGIN.astype('datetime64')

# Replace the None values in COUNTY with the mode
df_static['COUNTY'].fillna(df_static.COUNTY.mode()[0], inplace=True)
df_static.groupby('COUNTY').agg(lambda x: x.mode()).REGION

# Replace the None values in REGION with the mode of its COUNTY
county_mode = df_static.groupby('COUNTY').agg(lambda x: x.mode()).REGION
df_static.loc[df_static.REGION.isnull(), 'REGION'] = county_mode[df_static.loc[df_static.REGION.isnull(), 'COUNTY'].values].values
df_static.loc[:, 'REGION'] = df_static.REGION.astype('int')

party_ids = df_static.PARTY_ID

df_static

Unnamed: 0_level_0,PARTY_ID,CREATED_ON,LAST_LOGIN,LOB_CODE,SIZE,PACKAGE,REGION,COUNTY
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,233823894,2017-03-22 14:30:00,2017-03-22 14:30:54,49414.0,5-9 fő,Inbox,1,Pest
1,233823896,2017-03-23 12:48:00,2017-03-23 12:48:31,,2000-4999 fő,Inbox,1,Pest
2,233823898,2017-03-23 13:46:00,2017-03-23 15:19:06,70224.0,1 fő,Inbox,1,Pest
3,233823900,2017-03-27 06:40:00,2017-03-27 06:44:06,81104.0,3-4 fő,Inbox,2,Pest
4,233823901,2017-03-27 06:43:00,2017-03-27 06:46:13,46514.0,5-9 fő,Inbox,1,Pest
...,...,...,...,...,...,...,...,...
26052,684845716,2020-10-16 13:40:00,2020-10-19 06:44:16,45114.0,2 fő,Inbox Extra Free (= Inbox Extra),1,Pest
26053,684845718,2020-10-16 15:58:00,2020-10-16 16:40:03,68104.0,0 fő,Inbox,1,Pest
26054,685181166,2020-10-18 10:38:00,2020-10-18 10:38:37,56104.0,3-4 fő,Inbox,6,Csongrád
26055,685181167,2020-10-18 10:40:00,2020-10-18 10:46:20,47114.0,3-4 fő,Promó,6,Csongrád


In [416]:
present_date = df_static.CREATED_ON.max()
first_date = df_static.CREATED_ON.min()
print(f'present date:{present_date} first date:{first_date}')

present date:2020-10-26 18:55:00 first date:1970-01-01 00:00:00


In [417]:
df_static.PACKAGE.value_counts()

Inbox Extra Free (= Inbox Extra)    9738
Inbox                               8159
Freemium                            2285
ExleX                               2202
S                                   1635
Hibernált                            942
M                                    699
Inbox Extra Standard                 108
Promó                                 66
Inbox Extra Pro                       22
Inbox Extra                           15
Name: PACKAGE, dtype: int64

In [418]:
df_static.SIZE.value_counts()

0 fő            4779
5-9 fő          3132
1 fő            3024
3-4 fő          2612
10-19 fő        2379
2 fő            1963
20-49 fő        1681
50-99 fő         563
100-149 fő       144
Ismeretlen        93
150-199 fő        66
200-249 fő        48
300-499 fő        34
500-999 fő        34
250-299 fő        16
1000-1999 fő      11
2000-4999 fő       5
5000 főtől         2
Name: SIZE, dtype: int64

In [419]:
print(f'missing value for the SIZE :{len(df_static[df_static.SIZE.isnull()])}')
print(f'missing value for the PACKAGE :{len(df_static[df_static.PACKAGE.isnull()])}')
print(f'missing value for the LOB_CODE :{len(df_static[df_static.LOB_CODE.isnull()])}')

missing value for the SIZE :5392
missing value for the PACKAGE :107
missing value for the LOB_CODE :6231


In [420]:
df_static['COUNTY']

index
0            Pest
1            Pest
2            Pest
3            Pest
4            Pest
           ...   
26052        Pest
26053        Pest
26054    Csongrád
26055    Csongrád
26056        Pest
Name: COUNTY, Length: 25978, dtype: object

In [421]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
# creating initial dataframe
#bridge_types = ('Arch','Beam','Truss','Cantilever','Tied Arch','Suspension','Cable')
#bridge_df = pd.DataFrame(bridge_types, columns=['COUNTY'])
# creating instance of labelencoder
labelencoder = LabelEncoder()
# Assigning numerical values and storing in another column
df_static['COUNTY'] = labelencoder.fit_transform(df_static['COUNTY'])
df_static

Unnamed: 0_level_0,PARTY_ID,CREATED_ON,LAST_LOGIN,LOB_CODE,SIZE,PACKAGE,REGION,COUNTY
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,233823894,2017-03-22 14:30:00,2017-03-22 14:30:54,49414.0,5-9 fő,Inbox,1,12
1,233823896,2017-03-23 12:48:00,2017-03-23 12:48:31,,2000-4999 fő,Inbox,1,12
2,233823898,2017-03-23 13:46:00,2017-03-23 15:19:06,70224.0,1 fő,Inbox,1,12
3,233823900,2017-03-27 06:40:00,2017-03-27 06:44:06,81104.0,3-4 fő,Inbox,2,12
4,233823901,2017-03-27 06:43:00,2017-03-27 06:46:13,46514.0,5-9 fő,Inbox,1,12
...,...,...,...,...,...,...,...,...
26052,684845716,2020-10-16 13:40:00,2020-10-19 06:44:16,45114.0,2 fő,Inbox Extra Free (= Inbox Extra),1,12
26053,684845718,2020-10-16 15:58:00,2020-10-16 16:40:03,68104.0,0 fő,Inbox,1,12
26054,685181166,2020-10-18 10:38:00,2020-10-18 10:38:37,56104.0,3-4 fő,Inbox,6,4
26055,685181167,2020-10-18 10:40:00,2020-10-18 10:46:20,47114.0,3-4 fő,Promó,6,4


In [422]:
df_static['COUNTY'].head(5000)

index
0       12
1       12
2       12
3       12
4       12
        ..
5033    16
5034     5
5035     9
5036     8
5037    12
Name: COUNTY, Length: 5000, dtype: int64

In [425]:
df_static['REGION']

index
0        1
1        1
2        1
3        2
4        1
        ..
26052    1
26053    1
26054    6
26055    6
26056    1
Name: REGION, Length: 25978, dtype: int64

In [426]:
df_static['REGION'].head(5000)

index
0       1
1       1
2       1
3       2
4       1
       ..
5033    9
5034    8
5035    5
5036    3
5037    2
Name: REGION, Length: 5000, dtype: int64

In [427]:
df_static

Unnamed: 0_level_0,PARTY_ID,CREATED_ON,LAST_LOGIN,LOB_CODE,SIZE,PACKAGE,REGION,COUNTY
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,233823894,2017-03-22 14:30:00,2017-03-22 14:30:54,49414.0,5-9 fő,Inbox,1,12
1,233823896,2017-03-23 12:48:00,2017-03-23 12:48:31,,2000-4999 fő,Inbox,1,12
2,233823898,2017-03-23 13:46:00,2017-03-23 15:19:06,70224.0,1 fő,Inbox,1,12
3,233823900,2017-03-27 06:40:00,2017-03-27 06:44:06,81104.0,3-4 fő,Inbox,2,12
4,233823901,2017-03-27 06:43:00,2017-03-27 06:46:13,46514.0,5-9 fő,Inbox,1,12
...,...,...,...,...,...,...,...,...
26052,684845716,2020-10-16 13:40:00,2020-10-19 06:44:16,45114.0,2 fő,Inbox Extra Free (= Inbox Extra),1,12
26053,684845718,2020-10-16 15:58:00,2020-10-16 16:40:03,68104.0,0 fő,Inbox,1,12
26054,685181166,2020-10-18 10:38:00,2020-10-18 10:38:37,56104.0,3-4 fő,Inbox,6,4
26055,685181167,2020-10-18 10:40:00,2020-10-18 10:46:20,47114.0,3-4 fő,Promó,6,4


In [428]:
df_static["SIZE"] = df_static["SIZE"].astype('str')
print(df_static["SIZE"])

index
0              5-9 fő
1        2000-4999 fő
2                1 fő
3              3-4 fő
4              5-9 fő
             ...     
26052            2 fő
26053            0 fő
26054          3-4 fő
26055          3-4 fő
26056            0 fő
Name: SIZE, Length: 25978, dtype: object


In [429]:
df_static["SIZE"] = df_static["SIZE"].str.replace('fő','')
df_static['SIZE']

index
0              5-9 
1        2000-4999 
2                1 
3              3-4 
4              5-9 
            ...    
26052            2 
26053            0 
26054          3-4 
26055          3-4 
26056            0 
Name: SIZE, Length: 25978, dtype: object

In [430]:
df_static

Unnamed: 0_level_0,PARTY_ID,CREATED_ON,LAST_LOGIN,LOB_CODE,SIZE,PACKAGE,REGION,COUNTY
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,233823894,2017-03-22 14:30:00,2017-03-22 14:30:54,49414.0,5-9,Inbox,1,12
1,233823896,2017-03-23 12:48:00,2017-03-23 12:48:31,,2000-4999,Inbox,1,12
2,233823898,2017-03-23 13:46:00,2017-03-23 15:19:06,70224.0,1,Inbox,1,12
3,233823900,2017-03-27 06:40:00,2017-03-27 06:44:06,81104.0,3-4,Inbox,2,12
4,233823901,2017-03-27 06:43:00,2017-03-27 06:46:13,46514.0,5-9,Inbox,1,12
...,...,...,...,...,...,...,...,...
26052,684845716,2020-10-16 13:40:00,2020-10-19 06:44:16,45114.0,2,Inbox Extra Free (= Inbox Extra),1,12
26053,684845718,2020-10-16 15:58:00,2020-10-16 16:40:03,68104.0,0,Inbox,1,12
26054,685181166,2020-10-18 10:38:00,2020-10-18 10:38:37,56104.0,3-4,Inbox,6,4
26055,685181167,2020-10-18 10:40:00,2020-10-18 10:46:20,47114.0,3-4,Promó,6,4


In [431]:

df_static['SIZE']

index
0              5-9 
1        2000-4999 
2                1 
3              3-4 
4              5-9 
            ...    
26052            2 
26053            0 
26054          3-4 
26055          3-4 
26056            0 
Name: SIZE, Length: 25978, dtype: object

In [432]:
df_static['SIZE'] = df_static['SIZE'].str.split('-').str[1]
df_static['SIZE']

index
0           9 
1        4999 
2          NaN
3           4 
4           9 
         ...  
26052      NaN
26053      NaN
26054       4 
26055       4 
26056      NaN
Name: SIZE, Length: 25978, dtype: object

In [433]:
df_static['SIZE'].head(1000)

index
0          9 
1       4999 
2         NaN
3          4 
4          9 
        ...  
1004      49 
1005      NaN
1006       4 
1007      NaN
1008      49 
Name: SIZE, Length: 1000, dtype: object

In [434]:
df_static['SIZE'] = df_static['SIZE'].replace(np.nan,1)
df_static['SIZE']

index
0           9 
1        4999 
2            1
3           4 
4           9 
         ...  
26052        1
26053        1
26054       4 
26055       4 
26056        1
Name: SIZE, Length: 25978, dtype: object

In [435]:
df_static['SIZE'].astype(str).astype(int)

index
0           9
1        4999
2           1
3           4
4           9
         ... 
26052       1
26053       1
26054       4
26055       4
26056       1
Name: SIZE, Length: 25978, dtype: int64

In [436]:
df_static['SIZE'].head(25000)

index
0           9 
1        4999 
2            1
3           4 
4           9 
         ...  
25072        1
25073        1
25074        1
25075        1
25076        1
Name: SIZE, Length: 25000, dtype: object

In [437]:
df_static['PACKAGE']

index
0                                   Inbox
1                                   Inbox
2                                   Inbox
3                                   Inbox
4                                   Inbox
                       ...               
26052    Inbox Extra Free (= Inbox Extra)
26053                               Inbox
26054                               Inbox
26055                               Promó
26056    Inbox Extra Free (= Inbox Extra)
Name: PACKAGE, Length: 25978, dtype: object

In [438]:
print(f'missing value for the PACKAGE :{len(df_static[df_static.PACKAGE.isnull()])}')

missing value for the PACKAGE :107


In [439]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
# creating initial dataframe
#bridge_types = ('Arch','Beam','Truss','Cantilever','Tied Arch','Suspension','Cable')
#bridge_df = pd.DataFrame(bridge_types, columns=['REGION'])
# creating instance of labelencoder
labelencoder = LabelEncoder()
# Assigning numerical values and storing in another column
df_static['PACKAGE'] = labelencoder.fit_transform(df_static['PACKAGE'].astype(str))
df_static

Unnamed: 0_level_0,PARTY_ID,CREATED_ON,LAST_LOGIN,LOB_CODE,SIZE,PACKAGE,REGION,COUNTY
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,233823894,2017-03-22 14:30:00,2017-03-22 14:30:54,49414.0,9,3,1,12
1,233823896,2017-03-23 12:48:00,2017-03-23 12:48:31,,4999,3,1,12
2,233823898,2017-03-23 13:46:00,2017-03-23 15:19:06,70224.0,1,3,1,12
3,233823900,2017-03-27 06:40:00,2017-03-27 06:44:06,81104.0,4,3,2,12
4,233823901,2017-03-27 06:43:00,2017-03-27 06:46:13,46514.0,9,3,1,12
...,...,...,...,...,...,...,...,...
26052,684845716,2020-10-16 13:40:00,2020-10-19 06:44:16,45114.0,1,5,1,12
26053,684845718,2020-10-16 15:58:00,2020-10-16 16:40:03,68104.0,1,3,1,12
26054,685181166,2020-10-18 10:38:00,2020-10-18 10:38:37,56104.0,4,3,6,4
26055,685181167,2020-10-18 10:40:00,2020-10-18 10:46:20,47114.0,4,9,6,4


In [440]:
df_static['LOB_CODE'] 

index
0        49414.0
1            NaN
2        70224.0
3        81104.0
4        46514.0
          ...   
26052    45114.0
26053    68104.0
26054    56104.0
26055    47114.0
26056    47914.0
Name: LOB_CODE, Length: 25978, dtype: float64

In [441]:

mode = df_static['LOB_CODE'].mean()
mode

55174.165493492685

In [442]:

df_static['LOB_CODE'] = df_static['LOB_CODE'].replace(np.nan,mode)
df_static['LOB_CODE']

index
0        49414.000000
1        55174.165493
2        70224.000000
3        81104.000000
4        46514.000000
             ...     
26052    45114.000000
26053    68104.000000
26054    56104.000000
26055    47114.000000
26056    47914.000000
Name: LOB_CODE, Length: 25978, dtype: float64

In [443]:
print(f'missing value for the LOB_CODE :{len(df_static[df_static.LOB_CODE.isnull()])}')

missing value for the LOB_CODE :0


In [444]:
df_static

Unnamed: 0_level_0,PARTY_ID,CREATED_ON,LAST_LOGIN,LOB_CODE,SIZE,PACKAGE,REGION,COUNTY
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,233823894,2017-03-22 14:30:00,2017-03-22 14:30:54,49414.000000,9,3,1,12
1,233823896,2017-03-23 12:48:00,2017-03-23 12:48:31,55174.165493,4999,3,1,12
2,233823898,2017-03-23 13:46:00,2017-03-23 15:19:06,70224.000000,1,3,1,12
3,233823900,2017-03-27 06:40:00,2017-03-27 06:44:06,81104.000000,4,3,2,12
4,233823901,2017-03-27 06:43:00,2017-03-27 06:46:13,46514.000000,9,3,1,12
...,...,...,...,...,...,...,...,...
26052,684845716,2020-10-16 13:40:00,2020-10-19 06:44:16,45114.000000,1,5,1,12
26053,684845718,2020-10-16 15:58:00,2020-10-16 16:40:03,68104.000000,1,3,1,12
26054,685181166,2020-10-18 10:38:00,2020-10-18 10:38:37,56104.000000,4,3,6,4
26055,685181167,2020-10-18 10:40:00,2020-10-18 10:46:20,47114.000000,4,9,6,4


In [445]:
df_static =df_static.drop(columns=['CREATED_ON'])
df_static = df_static.drop(columns=['LAST_LOGIN'])
df_static

Unnamed: 0_level_0,PARTY_ID,LOB_CODE,SIZE,PACKAGE,REGION,COUNTY
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,233823894,49414.000000,9,3,1,12
1,233823896,55174.165493,4999,3,1,12
2,233823898,70224.000000,1,3,1,12
3,233823900,81104.000000,4,3,2,12
4,233823901,46514.000000,9,3,1,12
...,...,...,...,...,...,...
26052,684845716,45114.000000,1,5,1,12
26053,684845718,68104.000000,1,3,1,12
26054,685181166,56104.000000,4,3,6,4
26055,685181167,47114.000000,4,9,6,4


In [446]:
df_static['SIZE'] = df_static['SIZE'].astype(str).astype(int)

In [447]:
df_static['LOB_CODE'] = df_static['LOB_CODE'].astype(int)

In [448]:
df_static.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25978 entries, 0 to 26056
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   PARTY_ID  25978 non-null  int64
 1   LOB_CODE  25978 non-null  int64
 2   SIZE      25978 non-null  int64
 3   PACKAGE   25978 non-null  int64
 4   REGION    25978 non-null  int64
 5   COUNTY    25978 non-null  int64
dtypes: int64(6)
memory usage: 2.6 MB


In [449]:
df_static

Unnamed: 0_level_0,PARTY_ID,LOB_CODE,SIZE,PACKAGE,REGION,COUNTY
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,233823894,49414,9,3,1,12
1,233823896,55174,4999,3,1,12
2,233823898,70224,1,3,1,12
3,233823900,81104,4,3,2,12
4,233823901,46514,9,3,1,12
...,...,...,...,...,...,...
26052,684845716,45114,1,5,1,12
26053,684845718,68104,1,3,1,12
26054,685181166,56104,4,3,6,4
26055,685181167,47114,4,9,6,4


In [450]:
df_static.to_csv('static_data_v1.csv', index=True)