In [30]:
import pandas as pd
import scipy.stats as sps

In [31]:
def ztest_1sample(pop_mean, sample_mean, sample_std, sample_size, alternative='two-sided'):
    statistic = (sample_mean - pop_mean)/(sample_std / sample_size**0.5)
    if alternative == 'less':
        pvalue = sps.norm.cdf(statistic)
    if alternative == 'greater':
        pvalue = 1 - sps.norm.cdf(statistic)
    if alternative == 'two-sided':
        pvalue = 2 * (1 - sps.norm.cdf(abs(statistic)))
    return statistic, pvalue

def ztest_1sample_data(pop_mean, sample, alternative='two-sided'):
    return ztest_1sample(pop_mean, sample_mean=sample.mean(),
                         sample_std=sample.std(), sample_size=len(sample),
                         alternative=alternative)

In [32]:
def ztest_2sample(sample1_mean, sample2_mean, sample1_std, sample2_std, sample1_size, sample2_size, alternative='two-sided'):
    statistic = (sample1_mean - sample2_mean) / (sample1_std ** 2 / sample1_size + sample2_std ** 2 / sample2_size) ** 0.5
    if alternative == 'less':
        pvalue = sps.norm.cdf(statistic)
    if alternative == 'greater':
        pvalue = 1 - sps.norm.cdf(statistic)
    if alternative == 'two-sided':
        pvalue = 2 * (1 - sps.norm.cdf(abs(statistic)))
    return statistic, pvalue

def ztest_2sample_data(sample1, sample2, alternative='two-sided'):
    return ztest_2sample(
        sample1_mean=sample1.mean(), sample2_mean=sample2.mean(),
        sample1_std=sample1.std(), sample2_std=sample2.std(),
        sample1_size=len(sample1), sample2_size=len(sample2),
        alternative=alternative
    )

In [33]:
df = pd.read_csv('https://raw.githubusercontent.com/zalig/cu-datasets/main/golden_mine.csv')
df.replace(["FEMALE", "Female"], "female", inplace=True)
df = df[((df['Profit'] > -20000) & (df['Profit'] < 110000))]
df.head()

Unnamed: 0,Payments,Costs,Profit,Age,Gender,AgeGroup
0,9400,8100,1300,19,male,18-24
1,26600,13900,12700,29,male,25-39
2,9750,6750,3000,20,male,18-24
3,29700,11200,18500,18,female,18-24
4,49700,10450,39250,21,female,18-24


Blue

Задача 1

In [34]:
result = pd.DataFrame({
    'Количество клиентов': [df['Profit'].count()],
    'Средняя прибыль': [df['Profit'].mean()]
})
result

Unnamed: 0,Количество клиентов,Средняя прибыль
0,8713,15755.095834


In [35]:
ztest_1sample_data(0, df['Profit'] - 15000, alternative='greater')

(3.5021147923588165, 0.00023079035103212853)

p-value < 0.05 => отвергаем $H_0$ => есть статистически значимая выгода

Задача 2

In [36]:
male_df = df[(df['Gender'] == 'male') & (df['AgeGroup'] == '18-24')]
female_df = df[(df['Gender'] == 'female') & (df['AgeGroup'] == '18-24')]

In [42]:
result = pd.DataFrame({
    'male' : [male_df['Profit'].mean()],
    'female' : [female_df['Profit'].mean()]
})

result.style.format(precision=2).set_caption('Средняя прибыль с клиента по полу, до 24 лет')

Unnamed: 0,male,female
0,13155.76,26278.37


$H_0: \mu_1 = \mu_2$

$H_1: \mu_1 \neq \mu_2$

In [38]:
ztest_2sample_data(sample1=male_df['Profit'], sample2=female_df['Profit'], alternative='two-sided')

(-27.09280730417151, 0.0)

p-value < 0.05 => отвергаем $H_0$ => разница в прибыльности между этими двумя категориями статистически значима.

с таргетом:

$H_0: \mu = 0$

$H_1: \mu > 0$

In [39]:
stat_male, p_value_male = ztest_1sample_data(0, male_df['Profit'] - 15000, alternative='greater')
stat_male, p_value_male

(-4.065077065970156, 0.9999759917034207)

In [40]:
stat_female, p_value_female =ztest_1sample_data(0, female_df['Profit'] - 15000, alternative='greater')
stat_female, p_value_female

(66.48342590449387, 0.0)

In [41]:
result = pd.DataFrame({
    'male' : [p_value_male],
    'female' : [p_value_female]
})

result.style.format(precision=2).set_caption('P-value значимости прибыльности по полу для людей до 24 лет')

Unnamed: 0,male,female
0,1.0,0.0
