# 0 - Preparations

In [2]:
from pathlib import Path
import pandas as pd

import plotly.express as px

import statistics

# Set the plotly theme
px.defaults.template = 'simple_white'

In [2]:
# Load the dataset
data = pd.read_csv('dataset_management/dataset_details_and_split.csv')
data.head(3)

Unnamed: 0,scan_id,patient_id,location,side,examination,quality,path,filesize_MB,min,max,mean,z,y,x,split
0,D0004037,A01-0253,Radius,R,Ex01,VGS3,/data/trainers/PPS_NPZs/Radius/A01-0253/Ex01/D...,40.3,-2215,5824,348,168,459,322,test
1,D0004036,A01-0253,Radius,L,Ex01,VGS1,/data/trainers/PPS_NPZs/Radius/A01-0253/Ex01/D...,43.22,-1643,4902,356,168,452,355,test
2,D0004385,A01-0253,Radius,R,Ex02,VGS1,/data/trainers/PPS_NPZs/Radius/A01-0253/Ex02/D...,39.05,-2294,5437,379,168,458,312,test


In [3]:
data.shape

(1293, 15)

# 1 - Univariate Frequency Distribution
https://mciwing.github.io/statistics/univariate/Frequency/

In [4]:
data.columns

Index(['scan_id', 'patient_id', 'location', 'side', 'examination', 'quality',
       'path', 'filesize_MB', 'min', 'max', 'mean', 'z', 'y', 'x', 'split'],
      dtype='object')

| name | scale | note | distribution |
| - | - | - | - |
| scan_id | nominal | unique values |
| patient_id | nominal | unique values |
| location | nominal | | OK |
| side | nominal | | OK |
| examination | nominal |
| quality | ordinal | | OK |
| path | nominal | unique values |
| filesize_MB | numeric | | OK |
| min | numeric | |
| max | numeric | |
| mean | numeric | |
| z | numeric | |
| y | numeric | |
| x | numeric | |
| split | nominal | |

### Nominal

In [5]:
# create a barplot with the frequency of locations
location_freq = data['location'].value_counts().reset_index()
location_freq.columns = ['location', 'count']  # rename columns for clarity

fig = px.bar(
    location_freq,
    x='location',
    y='count',
    title='Frequency of locations',
    width=500,
    )
fig.update_xaxes(title_text='Location')
fig.update_yaxes(title_text='Count')

fig.show()

In [6]:
# create a pie chart with the frequency of sides
side_freq = data['side'].value_counts().reset_index()
side_freq.columns = ['side', 'count']  # rename columns for clarity

fig = px.pie(
    side_freq,
    names='side',
    values='count',
    title='Frequency of sides',
    width=500,
    )

fig.show()

### Ordinal

In [7]:
# create a barplot with the frequency of quality
quality_freq = data['quality'].value_counts().reset_index()
quality_freq.columns = ['quality', 'count']  # rename columns for clarity

fig = px.bar(
    quality_freq,
    x='quality',
    y='count',
    title='Frequency of quality',
    width=500,
    )

fig.show()

### Numeric

In [8]:
fig = px.histogram(
    data,
    x='filesize_MB',
    nbins=25,
    title='Distribution of file sizes for individual files',
    labels={'filesize_MB': 'Filesize / MB'},
    width=500,
)

fig.show()

In [9]:
fig = px.histogram(
    data,
    x='filesize_MB',
    nbins=25,
    facet_col='location',
    title='Distribution of file sizes for individual files',
    labels={'filesize_MB': 'Filesize / MB'},
    width=800,
)

fig.show()

> die verteilungen wurde bei den verschiedenen Datentypen ermittelt und dargestellt.

# 2 - Measure of Central Tendency
https://mciwing.github.io/statistics/univariate/CentralTend/

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1293 entries, 0 to 1292
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   scan_id      1293 non-null   object 
 1   patient_id   1293 non-null   object 
 2   location     1293 non-null   object 
 3   side         1293 non-null   object 
 4   examination  1293 non-null   object 
 5   quality      1293 non-null   object 
 6   path         1293 non-null   object 
 7   filesize_MB  1293 non-null   float64
 8   min          1293 non-null   int64  
 9   max          1293 non-null   int64  
 10  mean         1293 non-null   int64  
 11  z            1293 non-null   int64  
 12  y            1293 non-null   int64  
 13  x            1293 non-null   int64  
 14  split        1211 non-null   object 
dtypes: float64(1), int64(6), object(8)
memory usage: 151.7+ KB


### Nominal

In [11]:
print('side')
print('----------------')

print(f'Mode: {data['side'].mode().values[0]}')

side
----------------
Mode: L


### Ordinal

In [12]:
print('quality')
print('----------------')

print(f'Mode:   {data['quality'].mode().values[0]}')
print(f'Median: {statistics.median(data['quality'])}')


quality
----------------
Mode:   VGS1
Median: VGS1


### Numeric

In [13]:
print('filesize_MB')
print('----------------')

print(f'Mode:   {round(data['filesize_MB'].mode().values[0], 2)}')
print(f'Median: {round(data['filesize_MB'].median(), 2)}')
print(f'Mean:   {round(data['filesize_MB'].mean(), 2)}')

filesize_MB
----------------
Mode:   48.62
Median: 80.47
Mean:   85.79


In [14]:
# create a function to calculate the mode, median and mean for a given column
def calculate_measure_central_tendency(data, column):
    print(column)
    print('----------------')

    print(f'Mode:   {round(data[column].mode().values[0], 2)}')
    print(f'Median: {round(statistics.median(data[column]), 2)}')
    print(f'Mean:   {round(data[column].mean(), 2)}')

calculate_measure_central_tendency(data, 'filesize_MB')

filesize_MB
----------------
Mode:   48.62
Median: 80.47
Mean:   85.79


In [15]:
calculate_measure_central_tendency(data, 'x')
print()
calculate_measure_central_tendency(data, 'y')

x
----------------
Mode:   370
Median: 534
Mean:   531.8

y
----------------
Mode:   604
Median: 569
Mean:   569.22


In [16]:
data[['x', 'y', 'z']].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
x,1293.0,531.802011,147.566523,246.0,400.0,534.0,656.0,983.0
y,1293.0,569.221964,64.864584,395.0,529.0,569.0,611.0,872.0
z,1293.0,167.999227,0.02781,167.0,168.0,168.0,168.0,168.0


> abhängig vom typ der daten wurden die measueres of central tendency berechnet.

# 3 - Measure of Dispersion
https://mciwing.github.io/statistics/univariate/Dispersion/

In [17]:
data_numeric = data.select_dtypes(include=['number'])
data_numeric.head(3)

Unnamed: 0,filesize_MB,min,max,mean,z,y,x
0,40.3,-2215,5824,348,168,459,322
1,43.22,-1643,4902,356,168,452,355
2,39.05,-2294,5437,379,168,458,312


### Range

In [18]:
# define a function to calculate the range for a given column
def calculate_range(data, column):
    return data[column].max() - data[column].min()

calculate_range(data, 'x')

737

In [19]:
# Create a DataFrame to store measures of dispersion
measure_dispersion_df = pd.DataFrame(index=data_numeric.columns)

# Calculate range for all numeric columns
measure_dispersion_df['range'] = data_numeric.apply(lambda x: calculate_range(data_numeric, x.name))
measure_dispersion_df

Unnamed: 0,range
filesize_MB,212.49
min,10051.0
max,11396.0
mean,931.0
z,1.0
y,477.0
x,737.0


### Interquartile Range

In [20]:
# define a function to calculate the interquartile range for a given column
def calculate_iqr(data, column):
    q1 = data[column].quantile(0.25)
    q3 = data[column].quantile(0.75)
    return q3 - q1

calculate_iqr(data, 'x')

256.0

In [21]:
# calculate IQR for all numeric columns
measure_dispersion_df['iqr'] = data_numeric.apply(lambda x: calculate_iqr(data_numeric, x.name))
measure_dispersion_df

Unnamed: 0,range,iqr
filesize_MB,212.49,46.9
min,10051.0,525.0
max,11396.0,666.0
mean,931.0,140.0
z,1.0,0.0
y,477.0,82.0
x,737.0,256.0


### Variance

In [22]:
# define a function to calculate the variance for a given column
def calculate_variance(data, column, round_to=3):
    return round(data[column].var(), round_to)

calculate_variance(data, 'x')

21775.879

In [23]:
# calculate variance for all numeric columns
measure_dispersion_df['variance'] = data_numeric.apply(lambda x: calculate_variance(data_numeric, x.name))
measure_dispersion_df

Unnamed: 0,range,iqr,variance
filesize_MB,212.49,46.9,990.304
min,10051.0,525.0,355572.002
max,11396.0,666.0,649062.625
mean,931.0,140.0,10728.338
z,1.0,0.0,0.001
y,477.0,82.0,4207.414
x,737.0,256.0,21775.879


### Standard Deviation

In [24]:
# define a function to calculate the standard deviation for a given column
def calculate_standard_deviation(data, column, round_to=3):
    return round(data[column].std(), round_to)

calculate_standard_deviation(data, 'x')

147.567

In [25]:
# calculate standard deviation for all numeric columns
measure_dispersion_df['std_deviation'] = data_numeric.apply(lambda x: calculate_standard_deviation(data_numeric, x.name))
measure_dispersion_df

Unnamed: 0,range,iqr,variance,std_deviation
filesize_MB,212.49,46.9,990.304,31.469
min,10051.0,525.0,355572.002,596.299
max,11396.0,666.0,649062.625,805.644
mean,931.0,140.0,10728.338,103.578
z,1.0,0.0,0.001,0.028
y,477.0,82.0,4207.414,64.865
x,737.0,256.0,21775.879,147.567


### Coefficient of Variation

In [26]:
# define a function to calculate the coefficient of variation for a given column
def calculate_coefficient_of_variation(data, column, round_to=3):
    std_dev = data[column].std()
    mean = data[column].mean()
    return round(std_dev / mean, round_to)

calculate_coefficient_of_variation(data, 'min')

-0.23

In [27]:
# calculate coefficient of variation for all numeric columns
measure_dispersion_df['coeff_variation'] = data_numeric.apply(lambda x: calculate_coefficient_of_variation(data_numeric, x.name))
measure_dispersion_df

Unnamed: 0,range,iqr,variance,std_deviation,coeff_variation
filesize_MB,212.49,46.9,990.304,31.469,0.367
min,10051.0,525.0,355572.002,596.299,-0.23
max,11396.0,666.0,649062.625,805.644,0.145
mean,931.0,140.0,10728.338,103.578,0.244
z,1.0,0.0,0.001,0.028,0.0
y,477.0,82.0,4207.414,64.865,0.114
x,737.0,256.0,21775.879,147.567,0.277


> für alle numerischen daten wurden die measures of disperson ermittelt und in einem dataframe zusammengefasst. Es fällt auf, dass der coefficient of variation für 'min' negativ ist, da alle werte und damit auch der mittelwert negativ sind. es ist auch deutlich zu erkennen, dass der coefficient of variantions im gegensatz zu den anderen nicht vom wertebereich abhängig ist und damit einen besseren vergleich zwischen den verschiedenen spalten ermöglicht.

# 4 - Bivariate Frequency Distribution
https://mciwing.github.io/statistics/bivariate/Frequency/

### Histogram

In [28]:
data.head(3)

Unnamed: 0,scan_id,patient_id,location,side,examination,quality,path,filesize_MB,min,max,mean,z,y,x,split
0,D0004037,A01-0253,Radius,R,Ex01,VGS3,/data/trainers/PPS_NPZs/Radius/A01-0253/Ex01/D...,40.3,-2215,5824,348,168,459,322,test
1,D0004036,A01-0253,Radius,L,Ex01,VGS1,/data/trainers/PPS_NPZs/Radius/A01-0253/Ex01/D...,43.22,-1643,4902,356,168,452,355,test
2,D0004385,A01-0253,Radius,R,Ex02,VGS1,/data/trainers/PPS_NPZs/Radius/A01-0253/Ex02/D...,39.05,-2294,5437,379,168,458,312,test


In [29]:
fig = px.density_heatmap(
    data,
    x='filesize_MB',
    y='location',
    title='Density heatmap of file sizes by location',
    color_continuous_scale='Inferno',
    histnorm='percent',
    width=500,
    )

fig.show()

> in der heatmap ist erkennbar, dass sich die verteilung der dateigröße, abhängig von der art des knochens deutlich unterscheidet. die dateigröße bei tibia ist deutlich erkennbar größer als jene vom radius.

In [30]:
fig = px.density_heatmap(
    data,
    x='x',
    y='y',
    nbinsx=25,
    nbinsy=25,
    title='Density heatmap of x and y sizes',
    color_continuous_scale='Inferno',
    histnorm='percent',
    width=500,
    )

fig.show()

> in der heatmap ist die abhängigkeit der x und y abmessung klar erkennbar. der zusammenhang wird in den nächsten kapiteln weiter untersucht.

### Crosstab

In [31]:
pd.crosstab(
    data['location'],
    data['quality'],
    margins=True,
    margins_name='Total',
    )

quality,VGS1,VGS2,VGS3,VGS5,Total
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Radius,277,212,94,52,635
Tibia,455,113,60,30,658
Total,732,325,154,82,1293


In [32]:
pd.crosstab(
    data['location'], 
    data['quality'],
    margins=True,
    margins_name='Total',
    normalize='all',
    ).round(3)

quality,VGS1,VGS2,VGS3,VGS5,Total
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Radius,0.214,0.164,0.073,0.04,0.491
Tibia,0.352,0.087,0.046,0.023,0.509
Total,0.566,0.251,0.119,0.063,1.0


### Conditional frequency

In [33]:
cond_frequency_df = pd.crosstab(
    data['location'], 
    data['quality'],
    margins=True,
    margins_name='Total',
    normalize='index',
    ).round(3)

# add a column with the total values for each row
cond_frequency_df['Total'] = cond_frequency_df.sum(axis=1)

cond_frequency_df

quality,VGS1,VGS2,VGS3,VGS5,Total
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Radius,0.436,0.334,0.148,0.082,1.0
Tibia,0.691,0.172,0.091,0.046,1.0
Total,0.566,0.251,0.119,0.063,0.999


In [34]:
cond_frequency_df = pd.crosstab(
    data['location'], 
    data['quality'],
    margins=True,
    margins_name='Total',
    normalize='columns',
    ).round(3)

# add a row with the total values for each column
cond_frequency_df.loc['Total'] = cond_frequency_df.sum()

cond_frequency_df

quality,VGS1,VGS2,VGS3,VGS5,Total
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Radius,0.378,0.652,0.61,0.634,0.491
Tibia,0.622,0.348,0.39,0.366,0.509
Total,1.0,1.0,1.0,1.0,1.0


> in den verschiedenen crosstabs ist erkennbar, dass es für den radius deutlich mehr bilder mit guter qualität (VGS1) gibt im vergleich zum tibia. da die qualität vom scan stark von der bewegung während der durchführung abhängig ist, könnte das daher kommen, dass das bein (tibia) besser fixiert werden kann als der arm (radius).

# 5 - Measure of Correlation
https://mciwing.github.io/statistics/bivariate/Correlation/

### Covariance

In [35]:
# define a function to calculate the covariance between two columns
def calculate_covariance(data, column1, column2, round_to=3):
    return round(data[column1].cov(data[column2]), round_to)

calculate_covariance(data, 'x', 'y')

5751.153

### Pearson Correlation Coefficient

In [36]:
# define a function to calculate the pearson correlation coefficient between two columns
def calculate_pearson_correlation(data, column1, column2, round_to=3):
    return round(data[column1].corr(data[column2], method='pearson'), round_to)

calculate_pearson_correlation(data, 'x', 'y')

0.601

### Spearman Correlation Coefficient

In [37]:
# define a function to calculate the spearman correlation coefficient between two columns
def calculate_spearman_correlation(data, column1, column2, round_to=3):
    return round(data[column1].corr(data[column2], method='spearman'), round_to)

calculate_spearman_correlation(data, 'x', 'y')

0.627

In [38]:
# define a function to calculate all three correlation coefficients between two columns
def calculate_correlation_coefficients(data, column1, column2):
    print(f'Covariance: {calculate_covariance(data, column1, column2)}')
    print(f'Pearson:    {calculate_pearson_correlation(data, column1, column2)}')
    print(f'Spearman:   {calculate_spearman_correlation(data, column1, column2)}')

calculate_correlation_coefficients(data, 'x', 'y')

Covariance: 5751.153
Pearson:    0.601
Spearman:   0.627


> die verschiedenen meassures of correlations werden berechnet und zu einer funktion zusammengefasst um diese für ein paar von numerischen variablen gleichzeitig zu berechnen.

### Scatter Plot

#### Number of voxels vs. filesize

In [39]:
# Calculate the number of voxels in each file
data['n_voxels'] = data['x'] * data['y'] * data['z']

data.head()

Unnamed: 0,scan_id,patient_id,location,side,examination,quality,path,filesize_MB,min,max,mean,z,y,x,split,n_voxels
0,D0004037,A01-0253,Radius,R,Ex01,VGS3,/data/trainers/PPS_NPZs/Radius/A01-0253/Ex01/D...,40.3,-2215,5824,348,168,459,322,test,24830064
1,D0004036,A01-0253,Radius,L,Ex01,VGS1,/data/trainers/PPS_NPZs/Radius/A01-0253/Ex01/D...,43.22,-1643,4902,356,168,452,355,test,26957280
2,D0004385,A01-0253,Radius,R,Ex02,VGS1,/data/trainers/PPS_NPZs/Radius/A01-0253/Ex02/D...,39.05,-2294,5437,379,168,458,312,test,24006528
3,D0004384,A01-0253,Radius,L,Ex02,VGS2,/data/trainers/PPS_NPZs/Radius/A01-0253/Ex02/D...,37.42,-2285,5216,370,168,453,303,test,23059512
4,D0004790,A01-0253,Radius,L,Ex03,VGS1,/data/trainers/PPS_NPZs/Radius/A01-0253/Ex03/D...,39.12,-1983,4811,308,168,464,312,test,24321024


In [40]:
fig = px.scatter(
    data,
    x='filesize_MB',
    y='n_voxels',
    title='Filesize vs. number of voxels',
    labels={'filesize_MB': 'Filesize / MB', 'n_voxels': 'Number of voxels'},
    width=500,
)

fig.show()

In [47]:
calculate_correlation_coefficients(data, 'filesize_MB', 'n_voxels')

Covariance: 585043072.57
Pearson:    0.998
Spearman:   0.998


> Wie erwartet gibt es einen sehr deutlichen zusammenhang zwischen der Anzahl der Voxel und der Dateigröße. Der einzelne Ausreißer ist im Detail zu betrachten um zu untersuchen, ob die Abmessungen bzw. Dateigröße korrekt ermittelt wurden.

#### Minimum vs. maximum intensity

In [41]:
fig = px.scatter(
    data,
    x='min',
    y='max',
    facet_col='location',
    title='Minimum vs. maximum intensity values',
    labels={'min': 'Minimum intensity value', 'max': 'Maximum intensity value'},
    width=800,
)

fig.show()

In [48]:
calculate_correlation_coefficients(data, 'min', 'max')

Covariance: -397618.818
Pearson:    -0.828
Spearman:   -0.702


> Durch die Ausreiser ist der interessante Bereich nur schwer erkennbar. Für eine bessere Darstellung werden diese entfernt. Diese Bilder sollten getrennt betrachtet werden. In vorliegender Arbeit ist dies nicht möglich, da die Rohdaten nicht weitergegeben werden dürfen. Die Ausreiser werden deshalb für die weitere Berechnung ausgeschlossen.

In [49]:
data[data['max'] > 10000]['scan_id']

341     D0005448
344     D0004734
585     D0004405
1238    D0005851
1279    D0005851
Name: scan_id, dtype: object

In [50]:
# Filter out the outliers for better visualization
fig = px.scatter(
    data[data['max'] < 10000],
    x='min',
    y='max',
    facet_col='location',
    title='Minimum vs. maximum intensity values',
    labels={'min': 'Minimum intensity value', 'max': 'Maximum intensity value'},
    width=800,
)

fig.show()

In [51]:
calculate_correlation_coefficients(data[data['max'] < 10000], 'min', 'max')

Covariance: -142285.701
Pearson:    -0.707
Spearman:   -0.698


Es ist eine negative korreletion erkennbar, bedeutet, dass bilder mit einem höheren maximalen wert auch einen niederern minimalen wert haben uns somit eine insgesamt höhere spannweite. Zwischen radius und Tibia ist kein Unterschied erkennbar.

In [52]:
calculate_correlation_coefficients(data, 'min', 'max')

Covariance: -397618.818
Pearson:    -0.828
Spearman:   -0.702


> es ist erkennbar, dass der betrag der korrelation größer ist wenn die ausreiser nicht ausgeschlossen werden. dies scheint plausibel, da diese im scatterplott auf der linie liegen.

#### X-Dimension vs. Y-Dimension

In [45]:
px.scatter(
    data,
    x='x',
    y='y',
    color='location',
    opacity=0.2,
    trendline='ols',
    title='X vs. Y dimensions',
    labels={'x': 'x-size / voxel', 'y': 'Y-size / voxel'},
    width=500,
)

> bei radius ist die trendlinie steiler, dies bedeutet, dass kleine x-abmessungen größeren y-abmessungen zugeordnet sind, was wiederum einer rechteckigeren form der Bilder entspricht. Die Bilder der Tibia weißen eine ehr quardatische form auf, was durch die nahezu im 45° winkel verlaufende trendlinie ersichtlich ist.

In [58]:
print('Total dataset')
print('----------------')
calculate_correlation_coefficients(data, 'x', 'y')

Total dataset
----------------
Covariance: 5751.153
Pearson:    0.601
Spearman:   0.627


In [59]:
print('Radius')
print('----------------')
calculate_correlation_coefficients(data[data['location']=='Radius'], 'x', 'y')
print()
print('Tibia')
print('----------------')
calculate_correlation_coefficients(data[data['location']=='Tibia'], 'x', 'y')

Radius
----------------
Covariance: 2425.54
Pearson:    0.806
Spearman:   0.811

Tibia
----------------
Covariance: 4032.867
Pearson:    0.859
Spearman:   0.857


> es ist zu erkennen, dass die korrelation deutlich größer ist, wenn die knochen getrennt betrachtet werden. so wie es auch im scatterplot dargestellt ist.

# 6 - Probability
https://mciwing.github.io/statistics/probability/General/

# 7 - Sampling
https://mciwing.github.io/statistics/probability/Sampling/

In [1]:
data.head()

NameError: name 'data' is not defined

# 8 - Law of Large Numbers

Ab hier optional

# 9 - Central Limit Theorem

# 10 - T-Test

# 11 - Confidence Intervals

# 12 - ANOVA

# 13 - Linear Regression