# Hypothesis_Tests_and_IC using Bootstrapping

In [3]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# Relative path of the cleaned credit-card dataset
file_path = "../database/CreditCard_CLEANED.csv"

# Read the CSV into the DataFrame that the rest of the notebook works from
df = pd.read_csv(filepath_or_buffer=file_path)


In [4]:
# `data` is a second name for the same DataFrame object as `df` (no copy is made),
# so in-place column assignments through `data` are also visible through `df`.
data = df

In [5]:
# Parse "Total Charges" as numbers; values that cannot be parsed become NaN
data["Total Charges"] = pd.to_numeric(arg=data["Total Charges"], errors="coerce")

# H0: não existe diferença estatística entre os valores dos clientes ativos e inativos quanto ao valor de cobrança mensal
# H1: existe diferença estatisticamente relevante

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Bootstrap the mean "Monthly Charges" of churned ("Yes") and active ("No")
# customers, plot both bootstrap distributions and report a percentile
# confidence interval for each group.

# Use seaborn's white-grid theme for the figure
sns.set(style='whitegrid')

# Number of bootstrap resamples drawn per group
n_boot = 1000

# Confidence level, in percent
ci = 95

# Seeded generator: without a fixed seed the resampled means, the intervals
# and the figure change on every execution of the notebook.
rng = np.random.default_rng(42)

# Split "Monthly Charges" by churn status (kept as Series; the t-test cell reuses them)
yes_data = data[data["Churn Label"] == "Yes"]["Monthly Charges"]
no_data = data[data["Churn Label"] == "No"]["Monthly Charges"]

# Convert once, outside the resampling loops
yes_values = yes_data.to_numpy()
no_values = no_data.to_numpy()

# Resample each group with replacement (same size as the group) and record the mean.
# These stay plain Python lists because they are concatenated with `+` below;
# with NumPy arrays `+` would add element-wise instead.
yes_means = [rng.choice(yes_values, size=yes_values.size, replace=True).mean() for _ in range(n_boot)]
no_means = [rng.choice(no_values, size=no_values.size, replace=True).mean() for _ in range(n_boot)]

# Percentile bounds of the interval, e.g. 2.5 and 97.5 for ci = 95
lower_q = (100 - ci) / 2
upper_q = 100 - lower_q
yes_ci_low, yes_ci_high = np.percentile(yes_means, [lower_q, upper_q])
no_ci_low, no_ci_high = np.percentile(no_means, [lower_q, upper_q])

# Long-format table of the bootstrapped means: one row per resample, labelled by group
bootstrapped_means = pd.DataFrame({"Churn Label": ["Yes"] * n_boot + ["No"] * n_boot, "Monthly Charges": yes_means + no_means})

# Kernel density estimate of the bootstrapped means, one filled curve per group
g = sns.displot(
    data=bootstrapped_means,
    x="Monthly Charges",
    hue="Churn Label",
    kind="kde",
    common_norm=False,
    fill=True,
)

# Dashed vertical lines at the interval bounds ("Yes" in blue, "No" in orange)
plt.axvline(yes_ci_low, color='blue', linestyle='--', alpha=0.7)
plt.axvline(yes_ci_high, color='blue', linestyle='--', alpha=0.7)
plt.axvline(no_ci_low, color='orange', linestyle='--', alpha=0.7)
plt.axvline(no_ci_high, color='orange', linestyle='--', alpha=0.7)

# Axis labels and title
g.set_axis_labels("Mean Monthly Charges", "Density")
plt.title("Bootstrap Distribution of Mean Monthly Charges by Churn Label")

# Render the figure before the text output
plt.show()

# Report the confidence interval of each group
print(f"Clientes Ativos: IC -> ({no_ci_low:.2f}, {no_ci_high:.2f})")
print(f"Clientes Cancelados: IC -> ({yes_ci_low:.2f}, {yes_ci_high:.2f})")


In [None]:
from scipy import stats

# Two-sample t-test on the mean of the two churn groups.
# `yes_data` / `no_data` are not defined in this cell: it uses whatever the most
# recently executed bootstrap cell assigned (the "Monthly Charges" split above).
# equal_var=False selects Welch's t-test, which does not assume that the two
# groups share the same variance.
t_stat, p_value = stats.ttest_ind(yes_data, no_data, equal_var=False)

# Print the results; scientific notation keeps very small p-values readable
# instead of padding them to 50 decimals, which is beyond float precision
# and shows values below 1e-50 as zero.
print(f"T-statistic: {t_stat}")
print(f"P-value: {p_value:.3e}")

# H0 rejeitada! (implicações discutidas no relatório)


QUANTO AO CLTV (Score interno da empresa que mede a importância de cada cliente):

# H0: não existe diferença estatística entre os valores dos clientes ativos e inativos
# H1: existe diferença estatisticamente relevante

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Bootstrap the mean "CLTV" of churned ("Yes") and active ("No") customers,
# plot both bootstrap distributions and report a percentile confidence
# interval for each group.

# Use seaborn's white-grid theme for the figure
sns.set(style='whitegrid')

# Number of bootstrap resamples drawn per group
n_boot = 1000

# Confidence level, in percent
ci = 95

# Seeded generator: without a fixed seed the resampled means, the intervals
# and the figure change on every execution of the notebook.
rng = np.random.default_rng(42)

# Split "CLTV" by churn status (kept as Series; the t-test cell reuses them)
yes_data = data[data["Churn Label"] == "Yes"]["CLTV"]
no_data = data[data["Churn Label"] == "No"]["CLTV"]

# Convert once, outside the resampling loops
yes_values = yes_data.to_numpy()
no_values = no_data.to_numpy()

# Resample each group with replacement (same size as the group) and record the mean.
# These stay plain Python lists because they are concatenated with `+` below;
# with NumPy arrays `+` would add element-wise instead.
yes_means = [rng.choice(yes_values, size=yes_values.size, replace=True).mean() for _ in range(n_boot)]
no_means = [rng.choice(no_values, size=no_values.size, replace=True).mean() for _ in range(n_boot)]

# Percentile bounds of the interval, e.g. 2.5 and 97.5 for ci = 95
lower_q = (100 - ci) / 2
upper_q = 100 - lower_q
yes_ci_low, yes_ci_high = np.percentile(yes_means, [lower_q, upper_q])
no_ci_low, no_ci_high = np.percentile(no_means, [lower_q, upper_q])

# Long-format table of the bootstrapped means: one row per resample, labelled by group
bootstrapped_means = pd.DataFrame({"Churn Label": ["Yes"] * n_boot + ["No"] * n_boot, "CLTV": yes_means + no_means})

# Kernel density estimate of the bootstrapped means, one filled curve per group
g = sns.displot(
    data=bootstrapped_means,
    x="CLTV",
    hue="Churn Label",
    kind="kde",
    common_norm=False,
    fill=True,
)

# Dashed vertical lines at the interval bounds ("Yes" in blue, "No" in orange)
plt.axvline(yes_ci_low, color='blue', linestyle='--', alpha=0.7)
plt.axvline(yes_ci_high, color='blue', linestyle='--', alpha=0.7)
plt.axvline(no_ci_low, color='orange', linestyle='--', alpha=0.7)
plt.axvline(no_ci_high, color='orange', linestyle='--', alpha=0.7)

# Axis labels and title
g.set_axis_labels("Mean CLTV", "Density")
plt.title("Bootstrap Distribution of Mean CLTV by Churn Label")

# Show the plot
plt.show()

# Report the confidence interval of each group, as the other bootstrap cells do
print(f"Clientes Ativos: IC -> ({no_ci_low:.2f}, {no_ci_high:.2f})")
print(f"Clientes Cancelados: IC -> ({yes_ci_low:.2f}, {yes_ci_high:.2f})")



In [None]:
from scipy import stats

# Two-sample t-test on the mean of the two churn groups.
# `yes_data` / `no_data` are not defined in this cell: it uses whatever the most
# recently executed bootstrap cell assigned (the "CLTV" split above).
# equal_var=False selects Welch's t-test, which does not assume that the two
# groups share the same variance.
t_stat, p_value = stats.ttest_ind(yes_data, no_data, equal_var=False)

# Print the results; scientific notation keeps very small p-values readable
# instead of padding them to 50 decimals, which is beyond float precision
# and shows values below 1e-50 as zero.
print(f"T-statistic: {t_stat}")
print(f"P-value: {p_value:.3e}")

# H0: não existe diferença estatística entre os valores dos clientes ativos e inativos quanto ao Churn Score
# H1: existe diferença estatisticamente relevante

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Bootstrap the mean "Churn Score" of churned ("Yes") and active ("No")
# customers, plot both bootstrap distributions and report a percentile
# confidence interval for each group.

# Use seaborn's white-grid theme for the figure
sns.set(style='whitegrid')

# Number of bootstrap resamples drawn per group
n_boot = 1000

# Confidence level, in percent
ci = 95

# Seeded generator: without a fixed seed the resampled means, the intervals
# and the figure change on every execution of the notebook.
rng = np.random.default_rng(42)

# Split "Churn Score" by churn status (kept as Series; the t-test cell reuses them)
yes_data = data[data["Churn Label"] == "Yes"]["Churn Score"]
no_data = data[data["Churn Label"] == "No"]["Churn Score"]

# Convert once, outside the resampling loops
yes_values = yes_data.to_numpy()
no_values = no_data.to_numpy()

# Resample each group with replacement (same size as the group) and record the mean.
# These stay plain Python lists because they are concatenated with `+` below;
# with NumPy arrays `+` would add element-wise instead.
yes_means = [rng.choice(yes_values, size=yes_values.size, replace=True).mean() for _ in range(n_boot)]
no_means = [rng.choice(no_values, size=no_values.size, replace=True).mean() for _ in range(n_boot)]

# Percentile bounds of the interval, e.g. 2.5 and 97.5 for ci = 95
lower_q = (100 - ci) / 2
upper_q = 100 - lower_q
yes_ci_low, yes_ci_high = np.percentile(yes_means, [lower_q, upper_q])
no_ci_low, no_ci_high = np.percentile(no_means, [lower_q, upper_q])

# Long-format table of the bootstrapped means: one row per resample, labelled by group
bootstrapped_means = pd.DataFrame({"Churn Label": ["Yes"] * n_boot + ["No"] * n_boot, "Churn Score": yes_means + no_means})

# Kernel density estimate of the bootstrapped means, one filled curve per group
g = sns.displot(
    data=bootstrapped_means,
    x="Churn Score",
    hue="Churn Label",
    kind="kde",
    common_norm=False,
    fill=True,
)

# Dashed vertical lines at the interval bounds ("Yes" in blue, "No" in orange)
plt.axvline(yes_ci_low, color='blue', linestyle='--', alpha=0.7)
plt.axvline(yes_ci_high, color='blue', linestyle='--', alpha=0.7)
plt.axvline(no_ci_low, color='orange', linestyle='--', alpha=0.7)
plt.axvline(no_ci_high, color='orange', linestyle='--', alpha=0.7)

# Axis labels and title
g.set_axis_labels("Mean Churn Score", "Density")
plt.title("Bootstrap Distribution of Mean Churn Score by Churn Label")

# Show the plot
plt.show()

# Report the confidence interval of each group
print(f"Clientes Ativos: IC -> ({no_ci_low:.2f}, {no_ci_high:.2f})")
print(f"Clientes Cancelados: IC -> ({yes_ci_low:.2f}, {yes_ci_high:.2f})")


# Test - t para obtermos a T-statistic e p-value:

In [None]:
from scipy import stats

# Two-sample t-test on the mean of the two churn groups.
# `yes_data` / `no_data` are not defined in this cell: it uses whatever the most
# recently executed bootstrap cell assigned (the "Churn Score" split above).
# equal_var=False selects Welch's t-test, which does not assume that the two
# groups share the same variance.
t_stat, p_value = stats.ttest_ind(yes_data, no_data, equal_var=False)

# Print the results; scientific notation keeps very small p-values readable
# instead of padding them to 50 decimals, which is beyond float precision
# and shows values below 1e-50 as zero.
print(f"T-statistic: {t_stat}")
print(f"P-value: {p_value:.3e}")

# H0 rejeitada! (implicações discutidas no relatório)


# Quanto a cobrança total:  

# H0: não existe diferença estatística entre os valores dos clientes ativos e inativos
# H1: existe diferença estatisticamente relevante

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Bootstrap the mean "Total Charges" of churned ("Yes") and active ("No")
# customers, plot both bootstrap distributions and report a percentile
# confidence interval for each group.

# Use seaborn's white-grid theme for the figure
sns.set(style='whitegrid')

# Convert "Total Charges" to numeric; invalid parsing is set to NaN.
# Re-applying this to an already numeric column leaves it unchanged.
data["Total Charges"] = pd.to_numeric(data["Total Charges"], errors='coerce')

# Rows with a usable "Total Charges" value. Stored under its own name instead
# of rebinding `data`, so re-running an earlier cell still sees every row.
total_charges_data = data.dropna(subset=["Total Charges"])

# Number of bootstrap resamples drawn per group
n_boot = 1000

# Confidence level, in percent
ci = 95

# Seeded generator: without a fixed seed the resampled means, the intervals
# and the figure change on every execution of the notebook.
rng = np.random.default_rng(42)

# Split "Total Charges" by churn status (kept as Series; the t-test cell reuses them)
yes_data = total_charges_data[total_charges_data["Churn Label"] == "Yes"]["Total Charges"]
no_data = total_charges_data[total_charges_data["Churn Label"] == "No"]["Total Charges"]

# Convert once, outside the resampling loops
yes_values = yes_data.to_numpy()
no_values = no_data.to_numpy()

# Resample each group with replacement (same size as the group) and record the mean.
# These stay plain Python lists because they are concatenated with `+` below;
# with NumPy arrays `+` would add element-wise instead.
yes_means = [rng.choice(yes_values, size=yes_values.size, replace=True).mean() for _ in range(n_boot)]
no_means = [rng.choice(no_values, size=no_values.size, replace=True).mean() for _ in range(n_boot)]

# Percentile bounds of the interval, e.g. 2.5 and 97.5 for ci = 95
lower_q = (100 - ci) / 2
upper_q = 100 - lower_q
yes_ci_low, yes_ci_high = np.percentile(yes_means, [lower_q, upper_q])
no_ci_low, no_ci_high = np.percentile(no_means, [lower_q, upper_q])

# Long-format table of the bootstrapped means: one row per resample, labelled by group
bootstrapped_means = pd.DataFrame({"Churn Label": ["Yes"] * n_boot + ["No"] * n_boot, "Total Charges": yes_means + no_means})

# Kernel density estimate of the bootstrapped means, one filled curve per group
g = sns.displot(
    data=bootstrapped_means,
    x="Total Charges",
    hue="Churn Label",
    kind="kde",
    common_norm=False,
    fill=True,
)

# Dashed vertical lines at the interval bounds ("Yes" in blue, "No" in orange)
plt.axvline(yes_ci_low, color='blue', linestyle='--', alpha=0.7)
plt.axvline(yes_ci_high, color='blue', linestyle='--', alpha=0.7)
plt.axvline(no_ci_low, color='orange', linestyle='--', alpha=0.7)
plt.axvline(no_ci_high, color='orange', linestyle='--', alpha=0.7)

# Axis labels and title
g.set_axis_labels("Mean Total Charges", "Density")
plt.title("Bootstrap Distribution of Mean Total Charges by Churn Label")

# Show the plot
plt.show()

# Report the confidence interval of each group, as the other bootstrap cells do
print(f"Clientes Ativos: IC -> ({no_ci_low:.2f}, {no_ci_high:.2f})")
print(f"Clientes Cancelados: IC -> ({yes_ci_low:.2f}, {yes_ci_high:.2f})")


# H0 rejeitada! (implicações discutidas no relatório)

# Test - t para obtermos a T-statistic e p-value:

In [None]:
from scipy import stats

# Two-sample t-test on the mean of the two churn groups.
# `yes_data` / `no_data` are not defined in this cell: it uses whatever the most
# recently executed bootstrap cell assigned (the "Total Charges" split above).
# equal_var=False selects Welch's t-test, which does not assume that the two
# groups share the same variance.
t_stat, p_value = stats.ttest_ind(yes_data, no_data, equal_var=False)

# Print the results; scientific notation keeps very small p-values readable
# instead of padding them to 50 decimals, which is beyond float precision
# and shows values below 1e-50 as zero.
print(f"T-statistic: {t_stat}")
print(f"P-value: {p_value:.3e}")



In [None]:
# Rebind `data` to an independent copy of the five columns used in the cells below
data = df.loc[:, ['Total Charges', 'Monthly Charges', 'CLTV', 'Churn Score', 'Churn Value']].copy()

In [None]:
# Ratio of total to monthly charges, stored as a new column
# (presumably an estimate of how many months were billed; confirm with the report)
data['time_estimation'] = data['Total Charges'].div(data['Monthly Charges'])

In [None]:
# Pairwise Pearson correlation between the columns of `data` (displayed as the cell output)
data.corr(method="pearson")

In [None]:
# Independent copy of the churn score and the churn flag columns
oi = df.loc[:, ['Churn Score', 'Churn Value']].copy()

In [None]:
# Independent copy of the churn score and the churn flag columns
oi = df.loc[:, ['Churn Score', 'Churn Value']].copy()

# Churn scores of the rows whose "Churn Value" equals 1, as a NumPy array
ola = np.array(oi.loc[oi['Churn Value'] == 1, 'Churn Score'])


In [None]:
# Standard error of the mean of `ola`: sample standard deviation over sqrt(n).
# ddof=1 is required because ndarray.std() defaults to ddof=0 (the population
# formula), which underestimates the spread when `ola` is a sample.
ola.std(ddof=1) / np.sqrt(len(ola))