# Probability and Statistics course (CTU in Prague)

## Task 1 - Basic analysis of a dataset

In [2]:
import pandas as pd

# Send every DataFrame.plot.* call in this notebook to Plotly.
pd.set_option("plotting.backend", "plotly")

# The file has no header row, so its single column is named "Value" here.
data = pd.read_csv(
    "packing_machine.csv",
    header=None,
    names=["Value"],
)
print(data)

       Value
0   24.52586
1   24.17119
2   24.54486
3   24.44240
4   23.93455
5   24.20389
6   24.19974
7   24.34851
8   23.94024
9   24.21022
10  24.87474
11  25.06155
12  25.48924
13  25.32572
14  23.71721
15  24.61622
16  25.06676
17  24.90055
18  24.36213
19  24.98580
20  24.80591
21  24.20853
22  24.72623
23  24.64437
24  24.70405
25  23.97645
26  25.29837
27  24.46910
28  24.99453
29  25.42994
30  24.66147
31  24.75773
32  25.03970
33  24.44901
34  25.13285
35  24.40205
36  24.78721
37  23.83656
38  24.17186
39  23.65390
40  24.48244
41  24.68550
42  24.22988
43  23.83956
44  24.09777
45  24.52098
46  24.89240
47  24.25332
48  24.14259
49  25.12906


In [3]:
# Structural overview: index type, column dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Value   50 non-null     float64
dtypes: float64(1)
memory usage: 532.0 bytes


In [4]:
# Summary statistics for the column: count, mean, std, min, quartiles and max.
data.describe()

Unnamed: 0,Value
count,50.0
mean,24.546894
std,0.458528
min,23.6539
25%,24.20505
50%,24.52342
75%,24.887985
max,25.48924


In [5]:
# Five-number summary plus the mean, rounded to two decimals for reading.
values = data["Value"]
print(
    f"Min: {values.min():.2f}",
    f"1st Quartile: {values.quantile(0.25):.2f}",
    f"Median: {values.median():.2f}",
    f"Mean: {values.mean():.2f}",
    f"3rd Quartile: {values.quantile(0.75):.2f}",
    f"Max: {values.max():.2f}",
    sep="\n",
)

Min: 23.65
1st Quartile: 24.21
Median: 24.52
Mean: 24.55
3rd Quartile: 24.89
Max: 25.49


In [6]:
# Shape statistics from pandas. Kurtosis is reported as excess kurtosis
# (Fisher's definition, where a normal distribution scores 0).
values = data["Value"]
print(
    f"Skewness: {values.skew()}",
    f"Kurtosis: {values.kurtosis()}",
    sep="\n",
)

Skewness: 0.09593675056378767
Kurtosis: -0.6732486351795197


In [7]:
# Each value plotted against its row index, i.e. the order it appears in the file.
data.plot.scatter(
    x=data.index,
    y="Value",
    title="Scatter Plot of Data Values",
)

In [8]:
# Box plot of the single "Value" column.
data.plot.box(
    y="Value",
    title="Box Plot of Data Values",
)

In [9]:
# Quick histogram with the backend's default binning.
data.plot.hist(
    title="Histogram of Data Values",
)

In [10]:
import numpy as np
import scipy.stats as stats
import plotly.graph_objects as go
import plotly.express as px

# Shared inputs for the normality plots and tests that follow.
x = data["Value"]
mu = x.mean()
std = x.std()  # pandas default ddof=1, i.e. the sample standard deviation
# 100 evenly spaced points spanning the observed range, used to draw smooth curves.
x_range = np.linspace(start=x.min(), stop=x.max(), num=100)

In [11]:
# Compare the sample with a normal model on one density-scaled figure:
# binned data, the normal curve using the sample mean/std, and a Gaussian KDE.
pdf = stats.norm.pdf(x_range, loc=mu, scale=std)
kde = stats.gaussian_kde(x)  # bandwidth left at SciPy's default

fig = go.Figure(
    data=[
        # "probability density" scaling puts the bars on the same y-scale as
        # the two curves; nbinsx is an upper bound on the number of bins.
        go.Histogram(
            x=x,
            histnorm="probability density",
            nbinsx=8,
            name="Histogram",
            marker=dict(color="yellow"),
            opacity=0.7,
        ),
        # Normal distribution curve
        go.Scatter(
            x=x_range,
            y=pdf,
            mode="lines",
            name="Normal Distribution",
            line_color="red",
            line_width=2,
        ),
        # Kernel density estimate
        go.Scatter(
            x=x_range,
            y=kde(x_range),
            mode="lines",
            name="Density",
            line_color="black",
        ),
    ]
)

fig.update_layout(
    title="Coffee Packing Machine",
    xaxis_title="Package Weight in Decagrams",
    yaxis_title="Density",
)
fig.show()

In [12]:
# Empirical CDF of the sample, with the normal CDF (sample mean/std) drawn over it.
fig = px.ecdf(data, x="Value", title="Empirical Distribution Function")

# Theoretical curve evaluated on the same grid used for the density plot.
cdf = stats.norm.cdf(x_range, loc=mu, scale=std)
fig.add_scatter(
    x=x_range,
    y=cdf,
    mode="lines",
    name="Normal CDF",
    line=dict(color="red", width=2),
)

fig.update_layout(yaxis_title="Probability")
fig.show()

In [13]:
# Normal Q-Q plot: ordered sample values against theoretical normal quantiles.
# probplot returns ((theoretical, ordered sample), (slope, intercept, r)).
qq = stats.probplot(x, dist="norm")
theoretical_quantiles, sample_quantiles = qq[0]
slope, intercept = qq[1][:2]

# Least-squares reference line, drawn between the extreme theoretical quantiles.
line_x = np.array([theoretical_quantiles.min(), theoretical_quantiles.max()])
line_y = intercept + slope * line_x

fig = go.Figure(
    data=[
        go.Scatter(
            x=theoretical_quantiles,
            y=sample_quantiles,
            mode="markers",
            name="Data",
        ),
        go.Scatter(
            x=line_x,
            y=line_y,
            mode="lines",
            name="Normal Line",
            line_color="red",
        ),
    ]
)

fig.update_layout(
    title="Q-Q Plot (Normal)",
    xaxis_title="Theoretical Quantiles",
    yaxis_title="Sample Quantiles",
)
fig.show()

In [14]:
# Three normality tests on the same sample; the null hypothesis of each is
# that the data come from a normal distribution.
shapiro_stat, shapiro_p = stats.shapiro(x)
print(
    "Shapiro-Wilk normality test",
    f"data: x, W = {shapiro_stat:.5f}, p-value = {shapiro_p:.4f}",
    sep="\n",
)

# No p-value is printed for Anderson-Darling: compare A with the critical
# value listed for the chosen significance level (in percent).
ad_result = stats.anderson(x, dist="norm")
print(
    "\nAnderson-Darling normality test",
    f"data: x, A = {ad_result.statistic:.5f}",
    sep="\n",
)
print("Critical values:", ad_result.critical_values)
print("Significance levels:", ad_result.significance_level)

# NOTE(review): SciPy documents Jarque-Bera as valid only for large samples
# (>2000); with a sample this small, treat the p-value as indicative.
jb_stat, jb_p = stats.jarque_bera(x)
print(
    "\nJarque-Bera Normality Test",
    f"data: x, JB = {jb_stat:.4f}, p-value = {jb_p:.4f}",
    sep="\n",
)

Shapiro-Wilk normality test
data: x, W = 0.98406, p-value = 0.7307

Anderson-Darling normality test
data: x, A = 0.19883
Critical values: [0.538 0.613 0.736 0.858 1.021]
Significance levels: [15.  10.   5.   2.5  1. ]

Jarque-Bera Normality Test
data: x, JB = 1.1685, p-value = 0.5575
