In [3]:
import os
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image

In [18]:
BASE_DIR = Path('data')
TRAIN_DIR = BASE_DIR / 'train'
TEST_FILE = BASE_DIR / 'test.csv'
META_FILE = BASE_DIR / 'meta_data.txt'

In [19]:
def print_dir_structure(path: Path, max_level=2, level=0):
    if level > max_level:
        return
    indent = ' ' * 4 * level
    for item in sorted(path.iterdir()):
        print(f"{indent}- {item.name}{'/' if item.is_dir() else ''}")
        if item.is_dir():
            print_dir_structure(item, max_level, level + 1)

print_dir_structure(BASE_DIR)

- meta_data.txt
- test.csv
- train/
    - sbj_0.csv
    - sbj_0_2.csv
    - sbj_1.csv
    - sbj_10.csv
    - sbj_11.csv
    - sbj_12.csv
    - sbj_13.csv
    - sbj_14.csv
    - sbj_14_2.csv
    - sbj_15.csv
    - sbj_16.csv
    - sbj_17.csv
    - sbj_18.csv
    - sbj_19.csv
    - sbj_2.csv
    - sbj_20.csv
    - sbj_21.csv
    - sbj_3.csv
    - sbj_4.csv
    - sbj_5.csv
    - sbj_6.csv
    - sbj_7.csv
    - sbj_8.csv
    - sbj_9.csv


In [23]:
sbj_files = sorted(TRAIN_DIR.glob('sbj_*.csv'))
print(len(sbj_files))
train_dfs = []
for file in sbj_files:
    sbj_id = file.stem  # z.B. 'sbj_0_2'
    df = pd.read_csv(file)
    df['subject'] = sbj_id
    train_dfs.append(df)

train_df = pd.concat(train_dfs, ignore_index=True)
print(f"Train-Data shape: {train_df.shape}")
train_df.head()

24


  df = pd.read_csv(file)


Train-Daten Gesamtshape: (3466400, 15)


Unnamed: 0,sbj_id,right_arm_acc_x,right_arm_acc_y,right_arm_acc_z,right_leg_acc_x,right_leg_acc_y,right_leg_acc_z,left_leg_acc_x,left_leg_acc_y,left_leg_acc_z,left_arm_acc_x,left_arm_acc_y,left_arm_acc_z,label,subject
0,0,1.11172,0.06455,0.042595,0.982813,0.113823,0.152618,0.978287,-0.111472,0.103445,-0.956057,-0.407509,-0.090828,,sbj_0
1,0,1.177174,0.231861,-0.003207,0.988545,0.143807,0.14488,0.979794,-0.100042,0.086984,-1.122597,-0.232949,-0.106347,,sbj_0
2,0,1.124248,0.264661,-0.027485,0.982286,0.155609,0.135133,0.982915,-0.095342,0.083391,-1.167835,-0.088288,-0.131609,,sbj_0
3,0,1.032746,0.236885,-0.066591,0.981618,0.141745,0.134765,0.981991,-0.126157,0.088388,-1.143375,0.015024,-0.145414,,sbj_0
4,0,0.974202,0.26205,-0.116387,0.988576,0.120453,0.143329,0.977757,-0.124765,0.113024,-1.063969,0.08097,-0.140224,,sbj_0


In [24]:
test_df = pd.read_csv(TEST_FILE)
print(f"Test-Data shape: {test_df.shape}")
test_df.head()

Test-Daten shape: (48936, 6)


Unnamed: 0,id,sbj_id,sensor_location,x_axis,y_axis,z_axis
0,0,22,right_arm,"[0.2428425714285714, 0.2134530714285714, 0.174...","[0.8463437142857143, 0.854221738095238, 0.8428...","[-0.4574545714285714, -0.4578691904761905, -0...."
1,1,22,right_arm,"[0.6752909910531751, 0.4247757439107542, 0.715...","[-0.9366750995461923, -0.6223881559309377, -0....","[-0.6175092749440796, -0.5824192453686647, -0...."
2,2,23,right_arm,"[0.04528874289978267, 0.02636863686942854, -0....","[0.5794056316368844, 0.5853422923083632, 0.646...","[0.5778143538332879, 0.5922718075175648, 0.873..."
3,3,23,right_arm,"[-0.4294609406487871, -0.6656510068112931, -0....","[-0.147632980836954, -0.014413570611691117, -0...","[-1.0071089537700262, -0.8174943548296542, -0...."
4,4,23,right_arm,"[2.476312835574577, 1.6733647285880453, 1.7557...","[-0.5983137218068052, -0.10164738944146791, -0...","[-2.0541462483911856, -1.6842387016850981, -1...."


In [25]:
with open(META_FILE, 'r') as f:
    lines = [ln.strip() for ln in f if ln.strip()]

subjects_meta = {}
current = None
for line in lines:
    if line.startswith('- sbj_'):
        key = line.split()[1].rstrip(':')
        subjects_meta[key] = {}
        current = key
    elif current and ':' in line:
        k, v = map(str.strip, line.split(':', 1))
        subjects_meta[current][k] = v

meta_df = pd.DataFrame.from_dict(subjects_meta, orient='index')
print(f"Metadaten shape: {meta_df.shape}")
meta_df.head()


Metadaten shape: (4, 12)


Unnamed: 0,- Duration,- Number of activities,- Month,- Time-of-day,- Location ID,- Weather conditions,Location Information,- Location 3,- Location 12,- Location 13,- Location 14,- Location 15
sbj_22,02:57.000,1,late-February,afternoon,13,"sunny, around 10°C",,,,,,
sbj_23,01:47.000,1,early-February,afternoon,14,"sunny, around 10°C",,,,,,
sbj_24,15:50.000,9,late-February,afternoon,15,"cloudy, around 5°C",,,,,,
sbj_25,15:49.000,9,early-March,midday,3,"cloudy, windy, around 5°C",,,,,,


In [26]:
eprint = print
print = eprint
print("subjects:", train_df['subject'].nunique())

train_df.sample(5)

num_cols = train_df.select_dtypes(include='number').columns.tolist()
print("columns", num_cols)
train_df[num_cols].describe()

if 'class' in train_df.columns:
    counts = train_df['class'].value_counts().sort_index()
    plt.figure()
    counts.plot(kind='bar')
    plt.title('Klassenverteilung')
    plt.xlabel('Klasse')
    plt.ylabel('Anzahl')
    plt.tight_layout()
    plt.show()
else:
    print("class not found")


Einzigartige Subjekte: 24
Numerische Spalten: ['sbj_id', 'right_arm_acc_x', 'right_arm_acc_y', 'right_arm_acc_z', 'right_leg_acc_x', 'right_leg_acc_y', 'right_leg_acc_z', 'left_leg_acc_x', 'left_leg_acc_y', 'left_leg_acc_z', 'left_arm_acc_x', 'left_arm_acc_y', 'left_arm_acc_z']
Spalte 'class' nicht gefunden.


In [27]:
missing = train_df.isna().sum()
missing[missing > 0]

left_arm_acc_x      51392
left_arm_acc_y      51392
left_arm_acc_z      51392
label             1377362
dtype: int64