In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt

from statsmodels.stats.weightstats import ztest

# YouTube A/B Test

In [None]:
# Group A --> treatment group, shown 2 ads per ad-break
# Group B --> control group, shown only 1 ad per ad-break
# Let us compare the mean watch time of the two groups
# H0: mu_control = mu_treatment
# H1: mu_control != mu_treatment

In [None]:
# Download the A/B test dataset from Google Drive and save it locally as
# ab_test_data.csv (-O sets the output file name). Requires network access.
!wget "https://docs.google.com/uc?export=download&id=1Hl96n6BWdl3ruJgCo_gaAWEb0kEYg__H" -O ab_test_data.csv

In [None]:
# Load the A/B test data from the CSV written by the download cell.
ab_test_data = pd.read_csv("ab_test_data.csv")

# Preview a small random sample of rows. A fixed random_state keeps the
# preview identical across "Restart Kernel -> Run All" runs, and a handful of
# rows is enough to inspect the columns without flooding the output.
ab_test_data.sample(10, random_state=42)

In [None]:
# (n_rows, n_columns) of the raw data, before any outlier filtering.
ab_test_data.shape

In [None]:
# Number of rows in each customer segment (group sizes of the experiment).
ab_test_data['customer_segmnt'].value_counts()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the raw data.
ab_test_data.describe()

In [None]:
# Watch time at the 99.9th percentile (upper-tail check).
ab_test_data["watch_time_hrs"].quantile(0.999)

In [None]:
# Watch time at the 99.8th percentile (upper-tail check).
ab_test_data["watch_time_hrs"].quantile(0.998)

In [None]:
# Trim the extreme upper tail: keep only rows whose watch time is strictly
# below the 99.8th percentile of the raw data.
q998 = ab_test_data["watch_time_hrs"].quantile(0.998)
is_below_q998 = ab_test_data["watch_time_hrs"] < q998
ab_test_data_no_out = ab_test_data[is_below_q998]

In [None]:
# Distribution of watch time after the outlier trim. The explicit fig/ax
# interface is used so the title and axis label attach to this figure and the
# plot can be understood on its own when the notebook is skimmed.
fig, ax = plt.subplots()
sns.histplot(ab_test_data_no_out['watch_time_hrs'], bins=100, ax=ax)
ax.set(
    title="Watch time distribution (outliers removed)",
    xlabel="Watch time (hrs)",
)
plt.show()

In [None]:
# Split the trimmed data into the two experiment arms by segment label.
segment = ab_test_data_no_out["customer_segmnt"]
ab_test_control_data = ab_test_data_no_out[segment == "control"]
ab_test_treatment_data = ab_test_data_no_out[segment == "treatment"]

In [None]:
# (n_rows, n_columns) of the control group after outlier removal.
ab_test_control_data.shape

In [None]:
# (n_rows, n_columns) of the treatment group after outlier removal.
ab_test_treatment_data.shape

In [None]:
# Observed effect: mean watch time of control minus mean watch time of
# treatment (positive => control watched more on average).
diff_means = (
    ab_test_control_data["watch_time_hrs"].mean()
    - ab_test_treatment_data["watch_time_hrs"].mean()
)
diff_means

## Z Test

In [None]:
# Two-sample z-test, two-sided: H1 is mean(control) != mean(treatment).
# Returns (z statistic, p-value).
ztest(
    ab_test_control_data["watch_time_hrs"],
    ab_test_treatment_data["watch_time_hrs"],
    alternative="two-sided",
)

In [None]:
# One-sided z-test: H1 is mean(control) > mean(treatment).
ztest(
    ab_test_control_data["watch_time_hrs"],
    ab_test_treatment_data["watch_time_hrs"],
    alternative="larger",
)

In [None]:
# One-sided z-test: H1 is mean(control) < mean(treatment).
ztest(
    ab_test_control_data["watch_time_hrs"],
    ab_test_treatment_data["watch_time_hrs"],
    alternative="smaller",
)

## T Test

In [None]:
# Disabled code, kept for reference: degrees of freedom of a two-sample
# t-test with pooled variance, i.e. n_control + n_treatment - 2.
# dof = ab_test_control_data.shape[0] + ab_test_treatment_data.shape[0] - 2
# dof

In [None]:
# Recomputes the control-minus-treatment difference in mean watch time; this
# is the same expression already evaluated before the Z test section.
diff_means = ab_test_control_data["watch_time_hrs"].mean() - ab_test_treatment_data["watch_time_hrs"].mean()
diff_means

In [None]:
# Independent two-sample t-test, two-sided (the default alternative).
# equal_var=True is SciPy's default (pooled-variance Student's t-test); it is
# spelled out so the equal-variance assumption is visible to the reader.
stats.ttest_ind(
    ab_test_control_data["watch_time_hrs"],
    ab_test_treatment_data["watch_time_hrs"],
    equal_var=True,
)

# Paired T-test: Problem solving

In [None]:
# Load the paired-test data. NOTE(review): no download step for this file is
# visible in this notebook; it must already exist in the working directory.
df_ps = pd.read_csv("problem_solving.csv")

In [None]:
# First 20 rows of the problem-solving data.
df_ps.head(20)

In [None]:
# Summary statistics of the problem-solving data.
df_ps.describe()

In [None]:
# Paired t-test (rows are treated as paired observations), two-sided:
# H1 is mean(test_1) != mean(test_2).
stats.ttest_rel(
    df_ps["test_1"],
    df_ps["test_2"],
    alternative="two-sided",
)

In [None]:
# Paired t-test, one-sided: H1 is mean(test_1) > mean(test_2).
stats.ttest_rel(
    df_ps["test_1"],
    df_ps["test_2"],
    alternative="greater",
)

In [None]:
# Paired t-test, one-sided: H1 is mean(test_1) < mean(test_2).
stats.ttest_rel(
    df_ps["test_1"],
    df_ps["test_2"],
    alternative="less",
)

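Below we swap the argument order (test 2 first, test 1 second). The t-statistic changes sign, and the one-sided alternatives now refer to test 2 relative to test 1.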

In [None]:
# Paired t-test with the arguments swapped, two-sided:
# H1 is mean(test_2) != mean(test_1).
stats.ttest_rel(
    df_ps["test_2"],
    df_ps["test_1"],
    alternative="two-sided",
)

In [None]:
# Paired t-test with the arguments swapped, one-sided:
# H1 is mean(test_2) > mean(test_1).
stats.ttest_rel(
    df_ps["test_2"],
    df_ps["test_1"],
    alternative="greater",
)

## One sample t-test on difference

In [None]:
# One-sample t-test on the per-row differences (test_2 - test_1) against a
# mean of 0, two-sided by default. This is the same test as the paired
# t-test stats.ttest_rel(test_2, test_1).
stats.ttest_1samp(
    df_ps["test_2"] - df_ps["test_1"],
    popmean=0,
)

In [None]:
# NOTE(review): identical to the earlier paired "greater" call on
# (test_1, test_2); presumably repeated here for comparison with the
# one-sample result above.
stats.ttest_rel(df_ps["test_1"], df_ps["test_2"], alternative="greater")

In [None]:
# NOTE(review): identical to the earlier paired "less" call on
# (test_1, test_2); presumably repeated here for comparison with the
# one-sample result above.
stats.ttest_rel(df_ps["test_1"], df_ps["test_2"], alternative="less")

# KS Test

In [None]:
# Load the drug 1 recovery sample. NOTE(review): no download step for this
# file is visible in this notebook; it must exist in the working directory.
d1 = pd.read_csv("drug_1_recovery.csv")

In [None]:
# First 5 rows of the drug 1 sample.
d1.head()

In [None]:
# Load the drug 2 recovery sample. NOTE(review): no download step for this
# file is visible in this notebook; it must exist in the working directory.
d2 = pd.read_csv("drug_2_recovery.csv")

In [None]:
# First 5 rows of the drug 2 sample.
d2.head()

In [None]:
# Two-sample Kolmogorov-Smirnov test: H0 is that the drug_1 and drug_2
# values are drawn from the same distribution. Returns (statistic, p-value).
stats.ks_2samp(
    d1["drug_1"],
    d2["drug_2"],
)

In [None]:
# Overlay the binned empirical CDFs of the two samples: cumulative=True with
# density=True makes each step curve rise from 0 to 1.
fig, ax = plt.subplots()
ax.grid(True)

# Each curve gets its own label and a legend is drawn, so the two drugs can
# be told apart in the figure.
ax.hist(d1["drug_1"], bins=100, cumulative=True, density=True, histtype='step', label='drug_1')
ax.hist(d2["drug_2"], bins=100, cumulative=True, density=True, histtype='step', label='drug_2')

ax.set(
    title="Empirical CDF of recovery: drug_1 vs drug_2",
    xlabel="Recovery",
    ylabel="Cumulative proportion",
)
ax.legend(loc="upper left")
plt.show()