In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, accuracy_score

In [2]:
from pathlib import Path

DATASET_KEYS = ("d0s0", "d0s1", "d1s0", "d1s1")
SAVED_DATAFRAME_BASE = Path("/content/")
URL = "https://drive.usercontent.google.com/download?id=1nzrtQpfaHL0OgJ_eXzA7VuEj7XotrSWO&export=download&authuser=0"
OUTPUT = Path("/content/homework.zip")
CSV_DATA_PATH  = Path("/content/data")

In [3]:
if not OUTPUT.is_file():
  !wget -O $OUTPUT $URL

if OUTPUT.is_file() and not CSV_DATA_PATH.is_dir():
  !unzip -q -o $OUTPUT

--2024-02-25 11:49:12--  https://drive.usercontent.google.com/download?id=1nzrtQpfaHL0OgJ_eXzA7VuEj7XotrSWO
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 108.177.120.132, 2607:f8b0:4001:c18::84
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|108.177.120.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4211746 (4.0M) [application/octet-stream]
Saving to: ‘/content/homework.zip’


2024-02-25 11:49:41 (30.1 MB/s) - ‘/content/homework.zip’ saved [4211746/4211746]



In [4]:
if CSV_DATA_PATH.is_dir():
  class_path = list(CSV_DATA_PATH.iterdir())
  class_list = list(d.name for d in class_path)
  print(class_list)

['stairs', 'running', 'idle', 'walking']


In [5]:
def gen_saved_name(key):
  return SAVED_DATAFRAME_BASE.joinpath(f"data-{key}.father")

In [6]:
df_set = {}
X_set = {}
y_set = {}
reports = {}

In [7]:
skip_load = False
for key in DATASET_KEYS:
  filename = gen_saved_name(key)
  if not skip_load and filename.is_file():
    df_set[key] = pd.read_feather(filename)
    X_set[key] = df_set[key].iloc[:,:-1]
    y_set[key] = df_set[key].iloc[:,-1]
    print(f"DATASET {key}. Loaded. shape: {df_set[key].shape}")
  else:
    df_set[key] = pd.DataFrame()

# Побудова датафрейму

In [8]:
def save_dataset(key,df):
  filename = gen_saved_name(key)
  if not df.empty and not filename.is_file():
    df.to_feather(filename)
def flatten_frame(frame):
        columns=[f"{col}_{i}" for i in range(frame.shape[0]) for col in frame.columns]
        return pd.DataFrame(frame.values.reshape(1, -1), columns=columns)
def add_stat_feature_frame(frame, rows):
  features = []
  for col_id in range(0,3):
    col = frame.iloc[:,col_id]
    features.append(pd.DataFrame([col.mean()] * rows,columns=[f'{col.name}_mean']))
    features.append(pd.DataFrame([col.max()] * rows,columns=[f'{col.name}_max']))
    features.append(pd.DataFrame([col.min()] * rows,columns=[f'{col.name}_min']))
    features.append(pd.DataFrame([col.quantile(0.75)-col.quantile(0.25)] * rows,columns=[f'{col.name}_interquartile_range']))
    features.append(pd.DataFrame([col.idxmin()] * rows,columns=[f'{col.name}_index_of_minimum_value']))
    features.append(pd.DataFrame([np.mean(np.abs(col - col.mean()))] * rows,columns=[f'{col.name}_mean_of_absolute_deviation']))
    features.append(pd.DataFrame([col.median()] * rows,columns=[f'{col.name}_median']))
    features.append(pd.DataFrame([col.std()] * rows,columns=[f'{col.name}_standard_deviation']))
    features.append(pd.DataFrame([np.sqrt(np.mean(col)**2)] * rows,columns=[f'{col.name}_root_mean_square_error']))

  result = pd.concat(features, axis=1)
  return result
def prepare_dataset(class_path: list[Path], flatten = True, stat_feture = True, limit_frames = None):
  dfws = []
  for class_id, work_class_path in enumerate(class_path):
    list_files = list(sorted(work_class_path.glob('*.csv'), key=lambda path: int(path.stem.rsplit("-", 1)[1])))
    print(f"Importing class '{work_class_path.name:7}' : {class_id}. Frames: {len(list_files)}")
    for i, filename in enumerate(list_files):
      df_w: pd.DataFrame = pd.read_csv(filename)
      addon_features = [df_w]
      if flatten:
        addon_features = [flatten_frame(df_w)]
      if stat_feture:
        addon_features.append(add_stat_feature_frame(df_w,addon_features[0].shape[0]))
      df_w = pd.concat(addon_features, axis=1)
      df_w['class'] = class_id
      dfws.append(df_w)
      if limit_frames and (i > limit_frames):
        break
  df = pd.concat(dfws, axis=0, ignore_index=True)
  print(df.shape)
  return df

In [9]:
limit_frames = None
for f in range(2):
  for s in range(2):
    key_set = f"d{f}s{s}"
    print(f"\nDATASET {key_set}. Where flatten={bool(f)}, stat_feture={bool(s)}")
    if df_set.get(key_set) is not None and (not df_set[key_set].empty):
      print("Alredy loaded, skipped")
      continue
    df_set[key_set] = prepare_dataset(class_path, flatten = f, stat_feture = s, limit_frames = limit_frames)
    save_dataset(key_set, df_set[key_set])
    X_set[key_set] = df_set[key_set].iloc[:,:-1]
    y_set[key_set] = df_set[key_set].iloc[:,-1]


DATASET d0s0. Where flatten=False, stat_feture=False
Importing class 'stairs ' : 0. Frames: 165
Importing class 'running' : 1. Frames: 3408
Importing class 'idle   ' : 2. Frames: 1039
Importing class 'walking' : 3. Frames: 1850
(193860, 4)

DATASET d0s1. Where flatten=False, stat_feture=True
Importing class 'stairs ' : 0. Frames: 165
Importing class 'running' : 1. Frames: 3408
Importing class 'idle   ' : 2. Frames: 1039
Importing class 'walking' : 3. Frames: 1850
(193860, 31)

DATASET d1s0. Where flatten=True, stat_feture=False
Importing class 'stairs ' : 0. Frames: 165
Importing class 'running' : 1. Frames: 3408
Importing class 'idle   ' : 2. Frames: 1039
Importing class 'walking' : 3. Frames: 1850
(6462, 91)

DATASET d1s1. Where flatten=True, stat_feture=True
Importing class 'stairs ' : 0. Frames: 165
Importing class 'running' : 1. Frames: 3408
Importing class 'idle   ' : 2. Frames: 1039
Importing class 'walking' : 3. Frames: 1850
(6462, 118)


In [10]:
key_set = list(df_set.keys())[0]
for x in df_set[key_set]['class'].unique():
  count = df_set[key_set]['class'][df_set[key_set]['class'] == x].count()
  prop = count / df_set[key_set].shape[0]
  print(f"class: {x}, rows: {count:7}, {class_list[x]:7}, prop: {prop:.4}" )

class: 0, rows:    4950, stairs , prop: 0.02553
class: 1, rows:  102240, running, prop: 0.5274
class: 2, rows:   31170, idle   , prop: 0.1608
class: 3, rows:   55500, walking, prop: 0.2863


# Порівняння

In [11]:
models = {"SVC": lambda: SVC(),
          "SVC_Linear": lambda: SVC(kernel="linear"),
          "RandomForestClassifier": lambda: RandomForestClassifier()}
reports = {}
skip_models = {
    "SVC_Linear": ["d0s0"]
}

In [12]:
SEED = 42
print("Models fit and prepare report")
for key in X_set.keys():
  print("-"*80)
  print(f"DATASET {key}. shape: {df_set[key].shape}")
  X = X_set[key]
  y = y_set[key]
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED, stratify=y)
  if reports.get(key) is None:
    reports[key] = {}
  for model, classification in models.items():
    print(f"\n- classification: {model}")
    if reports[key].get(model):
      print("   alredy fit, skipped")
      continue
    if key in skip_models.get(model,[]):
      print("   skip this model")
      continue
    clf = classification()
    %time clf.fit(X_train, y_train)
    %time y_test_predict = clf.predict(X_test)
    reports[key][model] = classification_report(y_test, y_test_predict, digits=4, target_names=class_list)

Models fit and prepare report
--------------------------------------------------------------------------------
DATASET d0s0. shape: (193860, 4)

- classification: SVC
CPU times: user 4min 8s, sys: 576 ms, total: 4min 9s
Wall time: 4min 11s
CPU times: user 1min 47s, sys: 61.4 ms, total: 1min 47s
Wall time: 1min 47s

- classification: SVC_Linear
   skip this model

- classification: RandomForestClassifier
CPU times: user 13.2 s, sys: 4.91 ms, total: 13.2 s
Wall time: 13.3 s
CPU times: user 1.1 s, sys: 3.97 ms, total: 1.11 s
Wall time: 1.13 s
--------------------------------------------------------------------------------
DATASET d0s1. shape: (193860, 31)

- classification: SVC
CPU times: user 30.4 s, sys: 137 ms, total: 30.5 s
Wall time: 30.6 s
CPU times: user 17.6 s, sys: 13 ms, total: 17.6 s
Wall time: 17.6 s

- classification: SVC_Linear
CPU times: user 1min 45s, sys: 205 ms, total: 1min 45s
Wall time: 1min 45s
CPU times: user 7.15 s, sys: 4.97 ms, total: 7.15 s
Wall time: 7.19 s

- c

# Класифікація

In [13]:
print(f"{limit_frames=}")
for dset in reports.keys():
  for model in reports[dset].keys():
    print("-"*80)
    print(f"Data set: {dset}, shape: {df_set[dset].shape}, model: {model}")
    print(reports[dset][model])

limit_frames=None
--------------------------------------------------------------------------------
Data set: d0s0, shape: (193860, 4), model: SVC
              precision    recall  f1-score   support

      stairs     1.0000    0.0027    0.0054      1485
     running     0.9327    0.8986    0.9153     30672
        idle     0.9549    0.9843    0.9694      9351
     walking     0.7944    0.9047    0.8460     16650

    accuracy                         0.8913     58158
   macro avg     0.9205    0.6976    0.6840     58158
weighted avg     0.8984    0.8913    0.8809     58158

--------------------------------------------------------------------------------
Data set: d0s0, shape: (193860, 4), model: RandomForestClassifier
              precision    recall  f1-score   support

      stairs     1.0000    0.9953    0.9976      1485
     running     0.9996    0.9999    0.9998     30672
        idle     0.9996    0.9994    0.9995      9351
     walking     0.9998    0.9996    0.9997     16650



# Висновки

Різні набори даних були створені з файлів CSV, завантажених із різних папок, кожна з яких названа відповідно до діяльності.

У наборі даних «d0s0» модель SVC-Linear була пропущена під час роботи, тому обробка її може вимагати багато часу.

Для аналізу використовувалися моделі SVC, SVC-Linear і RandomForestClassifier.

Найкращий результат для всіх наборів даних показала модель RandomForestClassifier.

Набір даних "d0s1" досяг ідеальної точності 1,0000, що робить його найкращим серед наданих наборів даних.


Ended