In [23]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.graph_objects as go
from datetime import datetime
from scipy.stats import ttest_ind

%matplotlib inline

In [24]:
# Dataset Successful cases:
df_completed = pd.read_csv('../data/clean/df_completed.csv', sep=',')
df_completed.head()

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,Variation,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth,confirm_reached
0,7338123,612065484_94198474375,100019538_17884295066_43909,start,2017-04-09 16:20:56,Test,7,88,23.5,M,2,26436.73,6,9,0
1,7338123,612065484_94198474375,100019538_17884295066_43909,step_1,2017-04-09 16:21:12,Test,7,88,23.5,M,2,26436.73,6,9,0
2,7338123,612065484_94198474375,100019538_17884295066_43909,step_2,2017-04-09 16:21:21,Test,7,88,23.5,M,2,26436.73,6,9,0
3,7338123,612065484_94198474375,100019538_17884295066_43909,step_1,2017-04-09 16:21:35,Test,7,88,23.5,M,2,26436.73,6,9,0
4,7338123,612065484_94198474375,100019538_17884295066_43909,step_1,2017-04-09 16:21:41,Test,7,88,23.5,M,2,26436.73,6,9,0


In [25]:
df_completed.shape

(196645, 15)

In [26]:
# Dataset Failed Cases:
df_incomplete = pd.read_csv('../data/clean/df_incomplete.csv', sep=',')
df_incomplete.head()

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,Variation,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth
0,105007,676020267_36602664238,100030127_47967100085_936361,start,2017-03-22 11:07:49,Control,9,118,35.0,F,2,34897.47,3,6
1,5623007,700426174_33289797318,100037962_47432393712_705583,start,2017-04-14 16:41:51,Control,16,202,78.0,M,2,146827.14,5,8
2,5623007,700426174_33289797318,100037962_47432393712_705583,start,2017-04-14 16:43:16,Control,16,202,78.0,M,2,146827.14,5,8
3,5623007,700426174_33289797318,100037962_47432393712_705583,step_1,2017-04-14 16:43:55,Control,16,202,78.0,M,2,146827.14,5,8
4,5623007,700426174_33289797318,100037962_47432393712_705583,start,2017-04-14 16:44:03,Control,16,202,78.0,M,2,146827.14,5,8


In [27]:
df_incomplete.shape

(106703, 14)

In [34]:
# Step 1: Define the numeric columns to test
numeric_columns = ['clnt_age', 'clnt_tenure_yr', 'clnt_tenure_mnth', 'bal', 'calls_6_mnth', 'logons_6_mnth']

# Step 2: Perform the t-test for each numeric column
results = {}

for col in numeric_columns:
    # Extract the data from each group
    successful_data = df_completed[col]
    failed_data = df_incomplete[col]
    
    # Perform the t-test
    t_stat, p_value = ttest_ind(successful_data, failed_data, equal_var=False)  # assuming unequal variances
    
    # Store the results
    results[col] = {'t-statistic': t_stat, 'p-value': p_value}

# Step 3: Convert results to a DataFrame for easier interpretation
results_df = pd.DataFrame(results).T

# Display the t-test results
print(results_df)

                  t-statistic        p-value
clnt_age                  NaN            NaN
clnt_tenure_yr     -18.476841   3.637921e-76
clnt_tenure_mnth   -18.755414   2.022476e-78
bal                       NaN            NaN
calls_6_mnth       -23.238819  2.586507e-119
logons_6_mnth      -24.526819  1.160935e-132


## Significant Differences (Very low p-values)

The p-values are extremely small (e.g., 3.63e-76, 1.16e-132, etc.), which means they are well below 0.05. This indicates that the differences between the successful and failed groups for these variables are statistically significant.

#### clnt_tenure_yr__ and __clnt_tenure_mnth:

- t-statistic: -18.47 for clnt_tenure_yr and -18.76 for clnt_tenure_mnth.
- p-value: Extremely small (3.63e-76 for clnt_tenure_yr and 2.02e-78 for clnt_tenure_mnth), which means these results are highly significant.

__Interpretation:__ The negative t-statistic indicates that the mean tenure for users in the successful group is significantly lower than in the failed group. This suggests that users who fail to complete the process tend to have been with Vanguard for a longer period.

#### calls_6_mnth:

- t-statistic: -23.24
- p-value: 2.59e-119 (extremely significant).

__Interpretation:__ The negative t-statistic suggests that users in the successful group have made significantly fewer calls in the past six months compared to the failed group. This might imply that users who reach out more often are having difficulties or friction during the process, leading to abandonment.


#### logons_6_mnth:

- t-statistic: -24.53
- p-value: 1.16e-132 (extremely significant).

__Interpretation__: The negative t-statistic indicates that users in the successful group log in less frequently than those in the failed group. This could suggest that users who log in more frequently are struggling more with the process, which could explain the higher failure rate for them.