In [18]:
import pandas as pd
from pandas.api.types import is_numeric_dtype
from datetime import timezone
import numpy as np
import plotly.express as px
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler

## 1. Task a

Load the 'MNIST' dataset using the python methods from scikit-learn (package: fetch_mldata) or use the parquet file from Ilias.

Note: mldata.org is not existing anymore, so the fetch_mldata function was removed in version v0.22, so 'fetch_openml' will be used.
 see https://github.com/scikit-learn/scikit-learn/commit/af4247b152350b4fd0ac8bb9395833bd84e827d2#diff-a87b98b2e7985785e06076c570467543


In [19]:
def load_from_local():
    path = './datasets/MNIST.parquet'
    return pd.read_parquet(path)

def load_with_fetch_openml(): 
    return fetch_openml('mnist_784', version=1, return_X_y=True, parser="pandas")

mnist_data = load_from_local()
print(mnist_data.info())
print(mnist_data.describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Columns: 785 entries, pixel1 to class
dtypes: float64(784), int32(1)
memory usage: 419.0 MB
None
        pixel1   pixel2   pixel3   pixel4   pixel5   pixel6   pixel7   pixel8  \
count  70000.0  70000.0  70000.0  70000.0  70000.0  70000.0  70000.0  70000.0   
mean       0.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
std        0.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
min        0.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
25%        0.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
50%        0.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
75%        0.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
max        0.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   

        pixel9  pixel10  ...      pixel776      pixel777      pixel778  \
count  70000.0  70000.0

## 2. Task b

Preprocess the data so, that all elements have elementwise (not: featurewise) Zero-Mean-Unit-Variance! 

In [20]:
# apply the z-score method in Pandas using the .mean() and .std() methods
def z_score(df):
    # copy the dataframe
    df_std = df.copy()
    # apply the z-score method
    for column in df_std.columns:
        df_std[column] = (df_std[column] - df_std[column].mean()) / df_std[column].std()
        
    return df_std
    
print("Mean: ", np.mean(mnist_data))
print("Standard derivation: ", np.std(mnist_data))

# call the z_score function
mnist_data_standardized = z_score(mnist_data)

print(mnist_data_standardized)
print("Mean: ", np.mean(mnist_data_standardized))
print("Standard derivation: ", np.std(mnist_data_standardized))

Mean:  33.349106733393995
Standard derivation:  pixel1      0.000000
pixel2      0.000000
pixel3      0.000000
pixel4      0.000000
pixel5      0.000000
              ...   
pixel781    0.000000
pixel782    0.000000
pixel783    0.000000
pixel784    0.000000
class       2.890174
Length: 785, dtype: float64
       pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  pixel8  pixel9  \
0         NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
1         NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
2         NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
3         NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
4         NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
...       ...     ...     ...     ...     ...     ...     ...     ...     ...   
69995     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
69996     NaN     NaN     NaN     NaN     NaN

SECOND OPTION
Source: https://towardsdatascience.com/data-normalization-with-pandas-and-scikit-learn-7c1cc6ed6475

In [21]:
# create a scaler object
std_scaler = StandardScaler()
std_scaler
# fit and transform the data
df_std = pd.DataFrame(std_scaler.fit_transform(mnist_data), columns=mnist_data.columns)

print(df_std)

       pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  pixel8  pixel9  \
0         0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
1         0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2         0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3         0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4         0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
...       ...     ...     ...     ...     ...     ...     ...     ...     ...   
69995     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
69996     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
69997     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
69998     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
69999     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

       pixel10  ...  pixel7

## 3. Task c

Split the data into a train-, test-, and validation-set!


In [22]:
# Split the data into three partitions
# First into traings data and the test/validation data and then split the test/validation data
# The data, test and validation partitions will be equally sized
from sklearn.model_selection import train_test_split
data_train, data_test, data_validation = train_test_split(
    mnist_data, test_size=0.2, shuffle=False)

print("Training data:")
print(data_train)

print("Test data:")
print(data_test)

print("Validation data:")
print(data_validation)

ValueError: not enough values to unpack (expected 3, got 2)