# Loading the Dataset

In [33]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import yaml
import math

%matplotlib inline

In [2]:
try:
    with open("../config2.yaml", "r") as file:
        config = yaml.safe_load(file)
except:
    print("The configuration file was not found!")

In [3]:
df_profile = pd.read_csv(config['data']['raw']['file1'])
df_digital1 = pd.read_csv(config['data']['raw']['file2'])
df_digital2 = pd.read_csv(config['data']['raw']['file3'])
df_experiment  = pd.read_csv(config['data']['raw']['file4'])

In [4]:
df_digital = pd.concat([df_digital1, df_digital2]).reset_index(drop = True)
df_digital.head()

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04


In [5]:
df_profile.head()

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth
0,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0
1,2304905,7.0,94.0,58.0,U,2.0,110860.3,6.0,9.0
2,1439522,5.0,64.0,32.0,U,2.0,52467.79,6.0,9.0
3,1562045,16.0,198.0,49.0,M,2.0,67454.65,3.0,6.0
4,5126305,12.0,145.0,33.0,F,2.0,103671.75,0.0,3.0


In [6]:
df_experiment.head()

Unnamed: 0,client_id,Variation
0,9988021,Test
1,8320017,Test
2,4033851,Control
3,1982004,Test
4,9294070,Control


In [7]:
#Merge the 3 dataframes on 'client_id'
merged_df = pd.merge(df_digital, df_profile, on = 'client_id', how = 'outer')
merged_df = pd.merge(merged_df, df_experiment, on = 'client_id', how = 'outer')

# Initial Exploration

In [8]:
#Display first few rows of the dataset
merged_df.head()

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth,Variation
0,169,201385055_71273495308,749567106_99161211863_557568,confirm,2017-04-12 20:23:09,21.0,262.0,47.5,M,2.0,501570.72,4.0,4.0,
1,169,201385055_71273495308,749567106_99161211863_557568,step_3,2017-04-12 20:22:05,21.0,262.0,47.5,M,2.0,501570.72,4.0,4.0,
2,169,201385055_71273495308,749567106_99161211863_557568,step_2,2017-04-12 20:20:31,21.0,262.0,47.5,M,2.0,501570.72,4.0,4.0,
3,169,201385055_71273495308,749567106_99161211863_557568,step_1,2017-04-12 20:19:45,21.0,262.0,47.5,M,2.0,501570.72,4.0,4.0,
4,169,201385055_71273495308,749567106_99161211863_557568,start,2017-04-12 20:19:36,21.0,262.0,47.5,M,2.0,501570.72,4.0,4.0,


In [9]:
#Retrieving the number of rows and columns in the dataframe
print("The initial dataframe has", merged_df.shape[0], 'rows and', merged_df.shape[1], 'columns')

The initial dataframe has 755405 rows and 14 columns


In [10]:
# Displaying the data types of each column in the dataframe
merged_df.dtypes

client_id             int64
visitor_id           object
visit_id             object
process_step         object
date_time            object
clnt_tenure_yr      float64
clnt_tenure_mnth    float64
clnt_age            float64
gendr                object
num_accts           float64
bal                 float64
calls_6_mnth        float64
logons_6_mnth       float64
Variation            object
dtype: object

In [11]:
#Cleaning column names
clean_df = merged_df.copy()
clean_df = merged_df.rename(columns = lambda x: x.strip().lower())

clean_df.columns = ['client_id', 'visitor_id', 'visit_id', 'process_step', 'date_time', 'tenure_yr', 'tenure_mnth', 'age', 'gender', 'num_acc', 'balance', 'calls', 'log_ons', 'variation']
clean_df.head()

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,tenure_yr,tenure_mnth,age,gender,num_acc,balance,calls,log_ons,variation
0,169,201385055_71273495308,749567106_99161211863_557568,confirm,2017-04-12 20:23:09,21.0,262.0,47.5,M,2.0,501570.72,4.0,4.0,
1,169,201385055_71273495308,749567106_99161211863_557568,step_3,2017-04-12 20:22:05,21.0,262.0,47.5,M,2.0,501570.72,4.0,4.0,
2,169,201385055_71273495308,749567106_99161211863_557568,step_2,2017-04-12 20:20:31,21.0,262.0,47.5,M,2.0,501570.72,4.0,4.0,
3,169,201385055_71273495308,749567106_99161211863_557568,step_1,2017-04-12 20:19:45,21.0,262.0,47.5,M,2.0,501570.72,4.0,4.0,
4,169,201385055_71273495308,749567106_99161211863_557568,start,2017-04-12 20:19:36,21.0,262.0,47.5,M,2.0,501570.72,4.0,4.0,


In [22]:
#Using applymap() to convert all strings to lowercase
clean_df = clean_df.applymap(lambda x: x.lower() if isinstance(x, str) else x)
clean_df

  clean_df = clean_df.applymap(lambda x: x.lower() if isinstance(x, str) else x)


Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,tenure_yr,tenure_mnth,age,gender,num_acc,balance,calls,log_ons,variation
12,555,402506806_56087378777,637149525_38041617439_716659,confirm,2017-04-15 13:00:34,3.0,46.0,29.5,u,2.0,25454.66,2.0,6.0,test
13,555,402506806_56087378777,637149525_38041617439_716659,step_3,2017-04-15 13:00:14,3.0,46.0,29.5,u,2.0,25454.66,2.0,6.0,test
14,555,402506806_56087378777,637149525_38041617439_716659,step_2,2017-04-15 12:58:35,3.0,46.0,29.5,u,2.0,25454.66,2.0,6.0,test
15,555,402506806_56087378777,637149525_38041617439_716659,step_1,2017-04-15 12:58:03,3.0,46.0,29.5,u,2.0,25454.66,2.0,6.0,test
16,555,402506806_56087378777,637149525_38041617439_716659,start,2017-04-15 12:57:56,3.0,46.0,29.5,u,2.0,25454.66,2.0,6.0,test
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755376,9999729,834634258_21862004160,870243567_56915814033_814203,step_2,2017-05-08 16:08:40,10.0,124.0,31.0,f,3.0,107059.74,6.0,9.0,test
755377,9999729,834634258_21862004160,870243567_56915814033_814203,step_1,2017-05-08 16:08:30,10.0,124.0,31.0,f,3.0,107059.74,6.0,9.0,test
755378,9999729,834634258_21862004160,870243567_56915814033_814203,start,2017-05-08 16:08:25,10.0,124.0,31.0,f,3.0,107059.74,6.0,9.0,test
755391,9999832,145538019_54444341400,472154369_16714624241_585315,step_1,2017-05-16 16:46:11,23.0,281.0,49.0,f,2.0,431887.61,1.0,4.0,test


In [15]:
#Checking for null values
print(clean_df.isna().any())
print("")
print("The columns with null values are:", clean_df.columns[clean_df.isna().any()].tolist())
print("")

client_id       False
visitor_id      False
visit_id        False
process_step    False
date_time       False
tenure_yr        True
tenure_mnth      True
age              True
gender           True
num_acc          True
balance          True
calls            True
log_ons          True
variation        True
dtype: bool

The columns with null values are: ['tenure_yr', 'tenure_mnth', 'age', 'gender', 'num_acc', 'balance', 'calls', 'log_ons', 'variation']



In [23]:
#Dropping rows with any missing values
clean_df.dropna(how = 'any', inplace = True)
clean_df

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,tenure_yr,tenure_mnth,age,gender,num_acc,balance,calls,log_ons,variation
12,555,402506806_56087378777,637149525_38041617439_716659,confirm,2017-04-15 13:00:34,3.0,46.0,29.5,u,2.0,25454.66,2.0,6.0,test
13,555,402506806_56087378777,637149525_38041617439_716659,step_3,2017-04-15 13:00:14,3.0,46.0,29.5,u,2.0,25454.66,2.0,6.0,test
14,555,402506806_56087378777,637149525_38041617439_716659,step_2,2017-04-15 12:58:35,3.0,46.0,29.5,u,2.0,25454.66,2.0,6.0,test
15,555,402506806_56087378777,637149525_38041617439_716659,step_1,2017-04-15 12:58:03,3.0,46.0,29.5,u,2.0,25454.66,2.0,6.0,test
16,555,402506806_56087378777,637149525_38041617439_716659,start,2017-04-15 12:57:56,3.0,46.0,29.5,u,2.0,25454.66,2.0,6.0,test
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755376,9999729,834634258_21862004160,870243567_56915814033_814203,step_2,2017-05-08 16:08:40,10.0,124.0,31.0,f,3.0,107059.74,6.0,9.0,test
755377,9999729,834634258_21862004160,870243567_56915814033_814203,step_1,2017-05-08 16:08:30,10.0,124.0,31.0,f,3.0,107059.74,6.0,9.0,test
755378,9999729,834634258_21862004160,870243567_56915814033_814203,start,2017-05-08 16:08:25,10.0,124.0,31.0,f,3.0,107059.74,6.0,9.0,test
755391,9999832,145538019_54444341400,472154369_16714624241_585315,step_1,2017-05-16 16:46:11,23.0,281.0,49.0,f,2.0,431887.61,1.0,4.0,test


In [24]:
#Check to see if all null values have been removed from df
print(clean_df.isna().any())

client_id       False
visitor_id      False
visit_id        False
process_step    False
date_time       False
tenure_yr       False
tenure_mnth     False
age             False
gender          False
num_acc         False
balance         False
calls           False
log_ons         False
variation       False
dtype: bool


In [21]:
#Examine the number of unique values in each column of the df
clean_df.nunique()

client_id        50487
visitor_id       55994
visit_id         69183
process_step         5
date_time       283253
tenure_yr           53
tenure_mnth        469
age                158
gender               4
num_acc              7
balance          50311
calls                7
log_ons              7
variation            2
dtype: int64

In [25]:
#Examine the categories for gender
clean_df['gender'].unique()

array(['u', 'm', 'f', 'x'], dtype=object)

In [27]:
#Grouping gender into either 'm', 'f', or 'na'
def map_gender(value):
    if value in ['u', 'x']:
        return 'na'
    else:
        return value

clean_df['gender'] = clean_df['gender'].apply(map_gender)
clean_df['gender'].unique()

array(['na', 'm', 'f'], dtype=object)

In [31]:
clean_df['age'].unique()

array([29.5, 57.5, 51. , 36. , 48. , 22. , 54.5, 28.5, 42. , 46. , 45. ,
       58.5, 55. , 62.5, 46.5, 51.5, 26.5, 56. , 25. , 52. , 63. , 30. ,
       70.5, 48.5, 38.5, 58. , 50.5, 76. , 67. , 35.5, 64.5, 53. , 61.5,
       41.5, 33. , 44. , 50. , 55.5, 27. , 38. , 25.5, 34. , 40.5, 82. ,
       26. , 31.5, 60. , 53.5, 37. , 63.5, 27.5, 43. , 28. , 52.5, 19.5,
       36.5, 40. , 62. , 34.5, 69. , 37.5, 54. , 65. , 21. , 60.5, 74.5,
       68. , 61. , 30.5, 77.5, 41. , 43.5, 22.5, 75.5, 32. , 67.5, 47.5,
       49. , 23.5, 59.5, 33.5, 24. , 71. , 59. , 23. , 69.5, 66. , 49.5,
       20. , 32.5, 19. , 42.5, 57. , 65.5, 47. , 81.5, 31. , 44.5, 71.5,
       70. , 24.5, 45.5, 81. , 75. , 56.5, 86.5, 73. , 39.5, 73.5, 79.5,
       85. , 35. , 80. , 29. , 72.5, 78. , 18.5, 64. , 21.5, 66.5, 84. ,
       72. , 83. , 91. , 74. , 95.5, 82.5, 17.5, 76.5, 20.5, 68.5, 84.5,
       39. , 77. , 83.5, 78.5, 92. , 79. , 18. , 90. , 89. , 87.5, 80.5,
       86. , 93.5, 88. , 85.5, 88.5, 94. , 87. , 92

In [37]:
#Rounding age down to the lowest whole number
def round_down(num):
    return math.floor(num)

clean_df['age'] = clean_df['age'].apply(round_down)
clean_df['age'] = clean_df['age'].astype(int)
clean_df.dtypes

client_id         int64
visitor_id       object
visit_id         object
process_step     object
date_time        object
tenure_yr       float64
tenure_mnth     float64
age               int64
gender           object
num_acc         float64
balance         float64
calls           float64
log_ons         float64
variation        object
dtype: object