# Loading the Dataset

In [1]:
import pandas as pd
import numpy as np
import yaml

In [2]:
try:
    with open("../config2.yaml", "r") as file:
        config = yaml.safe_load(file)
except:
    print("The configuration file was not found!")

In [3]:
df_profile = pd.read_csv(config['data']['raw']['file1'])
df_digital1 = pd.read_csv(config['data']['raw']['file2'])
df_digital2 = pd.read_csv(config['data']['raw']['file3'])
df_experiment  = pd.read_csv(config['data']['raw']['file4'])

In [4]:
df_digital = pd.concat([df_digital1, df_digital2]).reset_index(drop = True)

In [5]:
print("Client Profiles:")
display(df_profile.head())
print("Digital Footprint:")
display(df_digital.head())
print("Experiment Roster:")
display(df_experiment.head())

Client Profiles:


Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth
0,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0
1,2304905,7.0,94.0,58.0,U,2.0,110860.3,6.0,9.0
2,1439522,5.0,64.0,32.0,U,2.0,52467.79,6.0,9.0
3,1562045,16.0,198.0,49.0,M,2.0,67454.65,3.0,6.0
4,5126305,12.0,145.0,33.0,F,2.0,103671.75,0.0,3.0


Digital Footprint:


Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04


Experiment Roster:


Unnamed: 0,client_id,Variation
0,9988021,Test
1,8320017,Test
2,4033851,Control
3,1982004,Test
4,9294070,Control


# Initial Exploration

### Client Profile Dataset

In [6]:
#Explore the client profile dataset
df_profile.head()

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth
0,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0
1,2304905,7.0,94.0,58.0,U,2.0,110860.3,6.0,9.0
2,1439522,5.0,64.0,32.0,U,2.0,52467.79,6.0,9.0
3,1562045,16.0,198.0,49.0,M,2.0,67454.65,3.0,6.0
4,5126305,12.0,145.0,33.0,F,2.0,103671.75,0.0,3.0


In [7]:
#Retrieving the number of rows and columns in the dataframe
print("The initial client profile dataframe has", df_profile.shape[0], 'rows and', df_profile.shape[1], 'columns')

The initial client profile dataframe has 70609 rows and 9 columns


In [8]:
# Displaying the data types of each column in the dataframe
df_profile.dtypes

client_id             int64
clnt_tenure_yr      float64
clnt_tenure_mnth    float64
clnt_age            float64
gendr                object
num_accts           float64
bal                 float64
calls_6_mnth        float64
logons_6_mnth       float64
dtype: object

In [9]:
#Cleaning column names
clean_profile = df_profile.copy()
clean_profile = df_profile.rename(columns = lambda x: x.strip().lower())

clean_profile.columns = ['client_id', 'tenure_yr', 'tenure_mnth', 'age', 'gender', 'num_acc', 'balance', 'calls', 'log_ons']
clean_profile.head()

Unnamed: 0,client_id,tenure_yr,tenure_mnth,age,gender,num_acc,balance,calls,log_ons
0,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0
1,2304905,7.0,94.0,58.0,U,2.0,110860.3,6.0,9.0
2,1439522,5.0,64.0,32.0,U,2.0,52467.79,6.0,9.0
3,1562045,16.0,198.0,49.0,M,2.0,67454.65,3.0,6.0
4,5126305,12.0,145.0,33.0,F,2.0,103671.75,0.0,3.0


In [10]:
#Using applymap() to convert all strings to lowercase
clean_profile = clean_profile.applymap(lambda x: x.lower() if isinstance(x, str) else x)
clean_profile

  clean_profile = clean_profile.applymap(lambda x: x.lower() if isinstance(x, str) else x)


Unnamed: 0,client_id,tenure_yr,tenure_mnth,age,gender,num_acc,balance,calls,log_ons
0,836976,6.0,73.0,60.5,u,2.0,45105.30,6.0,9.0
1,2304905,7.0,94.0,58.0,u,2.0,110860.30,6.0,9.0
2,1439522,5.0,64.0,32.0,u,2.0,52467.79,6.0,9.0
3,1562045,16.0,198.0,49.0,m,2.0,67454.65,3.0,6.0
4,5126305,12.0,145.0,33.0,f,2.0,103671.75,0.0,3.0
...,...,...,...,...,...,...,...,...,...
70604,7993686,4.0,56.0,38.5,u,3.0,1411062.68,5.0,5.0
70605,8981690,12.0,148.0,31.0,m,2.0,101867.07,6.0,6.0
70606,333913,16.0,198.0,61.5,f,2.0,40745.00,3.0,3.0
70607,1573142,21.0,255.0,68.0,m,3.0,475114.69,4.0,4.0


In [11]:
#Grouping gender into either 'm', 'f', or 'na'
def map_gender(value):
    if value in ['u', 'x']:
        return 'u'
    else:
        return value

clean_profile['gender'] = clean_profile['gender'].apply(map_gender)
clean_profile['gender'].unique()

array(['u', 'm', 'f', nan], dtype=object)

In [12]:
#Checking for null values
clean_profile.isna().any()

client_id      False
tenure_yr       True
tenure_mnth     True
age             True
gender          True
num_acc         True
balance         True
calls           True
log_ons         True
dtype: bool

In [13]:
#Dropping rows with any missing values
clean_profile.dropna(inplace = True)
clean_profile.head()

Unnamed: 0,client_id,tenure_yr,tenure_mnth,age,gender,num_acc,balance,calls,log_ons
0,836976,6.0,73.0,60.5,u,2.0,45105.3,6.0,9.0
1,2304905,7.0,94.0,58.0,u,2.0,110860.3,6.0,9.0
2,1439522,5.0,64.0,32.0,u,2.0,52467.79,6.0,9.0
3,1562045,16.0,198.0,49.0,m,2.0,67454.65,3.0,6.0
4,5126305,12.0,145.0,33.0,f,2.0,103671.75,0.0,3.0


In [14]:
clean_profile.isna().sum()

client_id      0
tenure_yr      0
tenure_mnth    0
age            0
gender         0
num_acc        0
balance        0
calls          0
log_ons        0
dtype: int64

In [15]:
clean_profile.duplicated().sum()

0

In [16]:
#Save the clean dataset to CSV format
clean_profile.to_csv("client_profile.csv")

### Digital Footprint Dataset

In [17]:
#Explore the digital footprint dataset
df_digital.head()

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04


In [18]:
#Retrieving the number of rows and columns in the initial dataset
print("The initial digital footprint dataframe has", df_digital.shape[0], 'rows and', df_digital.shape[1], 'columns')

The initial digital footprint dataframe has 755405 rows and 5 columns


In [19]:
# Displaying the data types of each column in the dataframe
df_digital.dtypes

client_id        int64
visitor_id      object
visit_id        object
process_step    object
date_time       object
dtype: object

In [20]:
#Checking for null values
df_digital.isna().any()

client_id       False
visitor_id      False
visit_id        False
process_step    False
date_time       False
dtype: bool

In [21]:
#Check for duplicates
df_digital.duplicated().sum()

10764

In [22]:
#Dropping the duplicated data and keeping first row
df_digital.drop_duplicates(inplace = True)

In [23]:
#Split the date_time column into two separate columns
df_digital['date'] = df_digital['date_time'].str[:10]
df_digital['time'] = df_digital['date_time'].str[11:]
display(df_digital.head())

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,date,time
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07,2017-04-17,15:27:07
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51,2017-04-17,15:26:51
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22,2017-04-17,15:19:22
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13,2017-04-17,15:19:13
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04,2017-04-17,15:18:04


In [24]:
df_digital['date_time'] = pd.to_datetime(df_digital['date_time'])
df_digital.dtypes

client_id                int64
visitor_id              object
visit_id                object
process_step            object
date_time       datetime64[ns]
date                    object
time                    object
dtype: object

In [25]:
#Save the clean dataset to CSV format
df_digital.to_csv("digital_footprint.csv")

### Experiment Roster

In [26]:
#Explore the experiment roster dataset
df_experiment.head()

Unnamed: 0,client_id,Variation
0,9988021,Test
1,8320017,Test
2,4033851,Control
3,1982004,Test
4,9294070,Control


In [27]:
#Cleaning column names
clean_experiment = df_experiment.copy()
clean_experiment = df_experiment.rename(columns = lambda x: x.strip().lower())

In [28]:
#Using applymap() to convert all strings to lowercase
clean_experiment = clean_experiment.applymap(lambda x: x.lower() if isinstance(x, str) else x)
clean_experiment

  clean_experiment = clean_experiment.applymap(lambda x: x.lower() if isinstance(x, str) else x)


Unnamed: 0,client_id,variation
0,9988021,test
1,8320017,test
2,4033851,control
3,1982004,test
4,9294070,control
...,...,...
70604,2443347,
70605,8788427,
70606,266828,
70607,1266421,


In [29]:
#Checking for null values
clean_experiment.isna().sum()

client_id        0
variation    20109
dtype: int64

In [30]:
#Dropping rows with missing values
clean_experiment.dropna(inplace = True)
clean_experiment.tail()

Unnamed: 0,client_id,variation
50495,393005,control
50496,2908510,control
50497,7230446,test
50498,5230357,test
50499,6334360,test


In [31]:
#Checking for null values
clean_experiment.isna().any()

client_id    False
variation    False
dtype: bool

In [32]:
#Check the variables under the column 'variable'
clean_experiment['variation'].unique()

array(['test', 'control'], dtype=object)

In [33]:
#Retrieving the number of rows and columns in the final dataset
print("The initial experiment roster dataframe has", clean_experiment.shape[0], 'rows and', clean_experiment.shape[1], 'columns')

The initial experiment roster dataframe has 50500 rows and 2 columns


In [34]:
#Save the clean dataset to CSV format
clean_experiment.to_csv("experiment_roster.csv")