In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

1. Dataset “Babyboom” (переменные Time of birth recorded on the 24-hour clock, Sex of the child (1 = girl, 2 = boy), Birth weight in grams, Number of minutes after midnight of each birth):
* Проверьте гипотезу, что средний вес девочек такой же, как вес мальчиков. 
* Проверьте гипотезу, что дисперсия веса девочек такая же, как и веса мальчиков. 
   

In [3]:
 # Определение ширины каждого поля
colspecs = [
    (0, 8),    # Time of birth
    (8, 16),   # Sex of the child
    (16, 24),  # Birth weight in grams
    (24, 32)   # Number of minutes after midnight of each birth
]

# Определение имен колонок
column_names = [
    "Time_of_birth", "Sex", "Birth_weight", "Minutes_after_midnight"
]

# Загрузка данных
data = pd.read_fwf('data/babyboom.dat.txt', colspecs=colspecs, header=None, names=column_names)

data

Unnamed: 0,Time_of_birth,Sex,Birth_weight,Minutes_after_midnight
0,5,1,3837,5
1,104,1,3334,64
2,118,2,3554,78
3,155,2,3838,115
4,257,2,3625,177
5,405,1,2208,245
6,407,1,1745,247
7,422,2,2846,262
8,431,2,3166,271
9,708,2,3520,428


In [4]:
from scipy import stats

# Разделение данных на девочек и мальчиков
girls_weights = data[data['Sex'] == 1]['Birth_weight']
boys_weights = data[data['Sex'] == 2]['Birth_weight']

# Проведение t-теста
t_stat, p_value = stats.ttest_ind(girls_weights, boys_weights)

print(f"T-statistic: {t_stat}, P-value: {p_value}")

T-statistic: -1.5228564442562815, P-value: 0.1352891891054555


In [5]:
# Sample data
girls_weights = data[data['Sex'] == 1]['Birth_weight']
boys_weights = data[data['Sex'] == 2]['Birth_weight']

# Calculate variances
var1 = np.var(girls_weights, ddof=1)
var2 = np.var(boys_weights, ddof=1)

# Compute F-statistic
f_stat = var1 / var2

# Degrees of freedom
df1 = len(girls_weights) - 1
df2 = len(boys_weights) - 1

# Calculate p-value
p_value = 2 * min(stats.f.cdf(f_stat, df1, df2), 1 - stats.f.cdf(f_stat, df1, df2))

print(f"F-statistic: {f_stat}")
print(f"P-value: {p_value}")

F-statistic: 2.1771042882107263
P-value: 0.07526261914285004


___
2. Dataset “Euroweight” (переменные weight, batch):
* Проверить гипотезы о том, что среднее значение веса монеты одинаково в разных пакетах (попарно и все вместе). 




In [6]:
# Определение имен колонок
column_names = [
    "ID", "Weight", "Batch"
]

# Загрузка данных из переменной lines
data = pd.read_csv('data/euroweight.dat.txt', sep='\t', header=None, names=column_names)

data

Unnamed: 0,ID,Weight,Batch
0,1,7.512,1
1,2,7.502,1
2,3,7.461,1
3,4,7.562,1
4,5,7.528,1
...,...,...,...
1995,1996,7.514,8
1996,1997,7.519,8
1997,1998,7.606,8
1998,1999,7.547,8


In [7]:
from itertools import combinations
from scipy.stats import ttest_ind

# Get unique batches
batches = data['Batch'].unique()

# Perform pairwise t-tests
results = {}
for batch1, batch2 in combinations(batches, 2):
    weights1 = data[data['Batch'] == batch1]['Weight']
    weights2 = data[data['Batch'] == batch2]['Weight']
    t_stat, p_value = ttest_ind(weights1, weights2)
    results[(batch1, batch2)] = (t_stat, p_value)

# Print results
for (batch1, batch2), (t_stat, p_value) in results.items():
    print(f"Batch {batch1} vs Batch {batch2}: T-statistic = {t_stat}, P-value = {p_value}")

Batch 1 vs Batch 2: T-statistic = -1.1241810509353891, P-value = 0.26147780679017946
Batch 1 vs Batch 3: T-statistic = 3.1644998631182344, P-value = 0.001648507542831034
Batch 1 vs Batch 4: T-statistic = -4.0016907491459435, P-value = 7.245319251258257e-05
Batch 1 vs Batch 5: T-statistic = -4.0914574540531685, P-value = 4.9993849166524045e-05
Batch 1 vs Batch 6: T-statistic = 1.4565887588620556, P-value = 0.1458601194945584
Batch 1 vs Batch 7: T-statistic = -1.115153466577227, P-value = 0.26532250649811384
Batch 1 vs Batch 8: T-statistic = 0.9226820370831669, P-value = 0.3566197335516895
Batch 2 vs Batch 3: T-statistic = 4.199462222751052, P-value = 3.169999483351773e-05
Batch 2 vs Batch 4: T-statistic = -2.7223104844542156, P-value = 0.006710009918197492
Batch 2 vs Batch 5: T-statistic = -2.814325820935377, P-value = 0.00508142614345485
Batch 2 vs Batch 6: T-statistic = 2.5714317115838488, P-value = 0.010416963435836283
Batch 2 vs Batch 7: T-statistic = 0.04959673381902426, P-value = 

In [8]:
from scipy.stats import friedmanchisquare

# Prepare data for the Friedman test
weights_by_batch = [data[data['Batch'] == batch]['Weight'].values for batch in batches]

# Perform the Friedman test
stat, p_value = friedmanchisquare(*weights_by_batch)

print(f"Friedman test statistic: {stat}, P-value: {p_value}")

Friedman test statistic: 91.82516113630935, P-value: 5.217220021541185e-17


___
3. Dataset “iris.txt” (прочитайте описание данных в файле «iris_description.txt», переменные sepal length, sepal width, petal length, petal width, class):
* Проверить гипотезы о равенстве распределений характеристик цветков разных типов. 
* Проверьте гипотезы о равенстве средних и дисперсий различных характеристик цветов разных типов.

In [9]:
data = pd.read_csv('data/iris.txt', sep=',', header=None, names=["SepalLength", "SepalWidth", "PetalLength", "PetalWidth", "Class"])

data

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [10]:
from scipy.stats import kruskal

# Разделение данных по классам
setosa = data[data['Class'] == 'Iris-setosa']
versicolor = data[data['Class'] == 'Iris-versicolor']
virginica = data[data['Class'] == 'Iris-virginica']

# Проверка гипотез о равенстве распределений для каждой характеристики
for feature in ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']:
    stat, p_value = kruskal(setosa[feature], versicolor[feature], virginica[feature])
    print(f"{feature}: Kruskal-Wallis H-statistic = {stat}, P-value = {p_value}")

SepalLength: Kruskal-Wallis H-statistic = 96.93743600064833, P-value = 8.91873433246198e-22
SepalWidth: Kruskal-Wallis H-statistic = 62.49463010053111, P-value = 2.6882119006774528e-14
PetalLength: Kruskal-Wallis H-statistic = 130.41406912893967, P-value = 4.7967237479455454e-29
PetalWidth: Kruskal-Wallis H-statistic = 131.09335318823713, P-value = 3.415387999117524e-29
