In [12]:
import numpy as np
import pandas as pd
from sklearn import pipeline
from sklearn import preprocessing
from ydata_profiling import ProfileReport

In [13]:
# Column -> dtype schema handed to pandas when the CSV is parsed.
# The subject attributes are spelled out; the x/y/z readings numbered 1-4
# are all int64, so they are generated in the order x1, y1, z1, x2, ... z4.
dtypes = {
    "user": str,
    "gender": str,
    "age": np.int64,
    "how_tall_in_meters": np.float64,
    "weight": np.int64,
    "body_mass_index": np.float64,
    **{f"{axis}{sensor}": np.int64 for sensor in range(1, 5) for axis in "xyz"},
    "class": str,
}


def errorproofZ4(x):
    """Convert a raw ``z4`` CSV field to ``np.int64``, or NaN if it is malformed.

    Used as a ``pd.read_csv`` converter, so ``x`` is the raw cell text.

    Args:
        x: The field value to convert.

    Returns:
        ``np.int64(x)`` when the value parses as an integer, otherwise
        ``np.nan`` so the offending row can be dropped later.
    """
    try:
        return np.int64(x)
    except (ValueError, OverflowError):
        # ValueError: not an integer literal; OverflowError: outside int64 range.
        # `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
        return np.nan


# Columns that need custom parsing: the two float columns use a decimal comma,
# and z4 goes through errorproofZ4 so a malformed entry becomes NaN.
column_converters = {
    "how_tall_in_meters": lambda x: np.float64(x.replace(",", ".")),
    "body_mass_index": lambda x: np.float64(x.replace(",", ".")),
    "z4": errorproofZ4,
}

df = pd.read_csv(
    "dataset/dataset-HAR-PUC-Rio.csv",
    sep=";",
    # pandas ignores `dtype` for any column that also has a converter and emits
    # a ParserWarning for each one, so only pass dtypes for the other columns.
    dtype={
        column: kind
        for column, kind in dtypes.items()
        if column not in column_converters
    },
    converters=column_converters,
)

# Profile only the sensor readings: x1, y1, z1, x2, ... z4 (same order as the CSV).
profile = ProfileReport(
    df[[f"{axis}{sensor}" for sensor in range(1, 5) for axis in "xyz"]],
    title="HAR-PUC measurements profile",
)

  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(


In [14]:
# Switch for the profiling cells below; they are skipped while this is False.
enableProfiling: bool = False

In [15]:
# Render the sensor-column profile via `to_widgets()` only when profiling is enabled.
if enableProfiling:
    profile.to_widgets()

In [16]:
# Remove every row that contains a NaN; per the original note this is the
# single bad value. Rebinding `df` keeps the step a plain expression.
df = df.dropna()

In [17]:
# Optional: profile the whole NaN-free frame, show it via `to_widgets()` and
# also export it to documents/full_data_report.html.
if enableProfiling:
    full_profile = ProfileReport(df, title="Full HAR-PUC profile")
    full_profile.to_widgets()
    full_profile.to_file("documents/full_data_report.html")

## Scaling
It is clearly visible that the data comes from only a few people.
The scales of the variables differ widely, but most of them are not very skewed; they mainly follow a Gaussian distribution or a mixture of Gaussian distributions.
Some of the data seems to be truncated.
## Standard Scaling 
To bring all of the data into the same range, the measured variables could be standardized.
The user attributes should also be standardized, as they are related to the scale of the measured variables. However, because there are so few users, a single variable is enough to represent them, so variables such as gender, age, or name could easily be dropped.

## Class conversion
Classes should be converted to integers as well.

In [18]:
# Display the column names of the loaded frame for reference.
df.columns

Index(['user', 'gender', 'age', 'how_tall_in_meters', 'weight',
       'body_mass_index', 'x1', 'y1', 'z1', 'x2', 'y2', 'z2', 'x3', 'y3', 'z3',
       'x4', 'y4', 'z4', 'class'],
      dtype='object')

In [19]:
# Map each activity label to an integer code.
# The labels are sorted first so the codes do not depend on the order in which
# the classes happen to appear in the CSV; the mapping is then reproducible.
ids = {label: code for code, label in enumerate(sorted(df["class"].unique()))}
ids

{'sitting': 0, 'sittingdown': 1, 'standing': 2, 'standingup': 3, 'walking': 4}

In [20]:
# Build the modelling frame in one expression, leaving `df` untouched:
#   1. keep only the sensor parameters by dropping the subject attributes,
#   2. replace the textual class label with its integer code from `ids`.
# `Series.map` performs the dictionary lookup without a per-row Python lambda.
cleaned = df.drop(
    columns=[
        "user",
        "gender",
        "age",
        "how_tall_in_meters",
        "weight",
        "body_mass_index",
    ]
).assign(**{"class": df["class"].map(ids)})

In [21]:
# Persist the label-encoded, sensor-only frame before any scaling is applied.
cleaned.to_parquet("dataset/unscaled_train_data.parquet")

In [22]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler

# Standard-scaled variant of the training data.
# Only the sensor columns (x1, y1, z1, ... z4) are scaled; "class" is kept as is.
value_columns = [f"{axis}{sensor}" for sensor in range(1, 5) for axis in "xyz"]

scaler = StandardScaler()
scaled_cleaned = cleaned.copy()
scaled_cleaned[value_columns] = scaler.fit_transform(cleaned[value_columns])

scaled_cleaned.to_parquet("dataset/scaled_train_data.parquet")


In [23]:
# Min-max-scaled variant of the training data.
# The cell starts from a fresh copy of `cleaned` and takes the scaler from the
# top-level `preprocessing` import, so it no longer relies on state left behind
# by the standard-scaling cell. The frame written to disk is unchanged.
value_columns = [f"{axis}{sensor}" for sensor in range(1, 5) for axis in "xyz"]

scaler = preprocessing.MinMaxScaler()
scaled_cleaned = cleaned.copy()
scaled_cleaned[value_columns] = scaler.fit_transform(cleaned[value_columns])

scaled_cleaned.to_parquet("dataset/minmaxscaled_train_data.parquet")
