<a href="https://colab.research.google.com/github/SiracencoSerghei/DataScienceHW/blob/main/example_kaggle/les_5/les_5_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
import zipfile

import scipy.io as scio
from scipy.fftpack import fft, rfft
from sklearn.svm import SVC

from sklearn.inspection import permutation_importance
from sklearn.datasets import make_classification

from sklearn import svm, datasets, preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, classification_report, confusion_matrix, f1_score, accuracy_score

---
# LOAD DATASET
---

In [None]:
# from google.colab import files
# uploaded = files.upload()

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that the archive
# stored under 'My Drive' can be read by the unzip cell below.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# !ls '/content/drive/My Drive'

In [None]:
!unzip '/content/drive/My Drive/homework.zip' -d /content/homework  # Colab RAM
# !unzip '/content/drive/My Drive/homework.zip' -d '/content/drive/My Drive/homework'  # my google Drive

In [None]:
# Root folder of the extracted data set; the shell listing is a quick check
# that the extraction produced the expected entries.
base_path = '/content/homework/data'
!ls {base_path}

In [None]:
# os.listdir returns entries in arbitrary order; sorting makes the list (and
# everything derived from it) identical on every run.
# NOTE(review): if `activities` is later used as confusion-matrix tick labels,
# alphabetical order is also what scikit-learn uses for class labels — confirm.
activities = sorted(os.listdir(base_path))
print(activities)

In [None]:

# Report how many entries each activity folder contains.
for act in activities:
    path = os.path.join(base_path, act)

    # Entries that are not folders cannot be listed: warn and move on.
    if not os.path.isdir(path):
        print(f'Warning: "{act}" is not a valid directory!')
        continue

    frames = os.listdir(path)
    print(f'Folder "{act}" has: {len(frames)} files')

In [None]:
%%time
# Root directory that holds one sub-folder per activity.
folder = Path(base_path)

# Collect the per-file frames first and concatenate once at the end: calling
# pd.concat inside the loop re-copies every accumulated row on each iteration,
# which makes loading quadratic in the number of files.
loaded_frames = []

# sorted() fixes the traversal order (iterdir yields entries in arbitrary
# order), so the row order of data_set is the same on every run.
for activity_folder in sorted(folder.iterdir()):
    if not activity_folder.is_dir():
        continue

    # Walk through all files of the activity folder, keeping only CSV files.
    for file in sorted(activity_folder.iterdir()):
        if file.suffix != '.csv':
            continue

        # Read the CSV file and label every row with the name of its folder.
        df = pd.read_csv(file)
        df['activity'] = activity_folder.name
        loaded_frames.append(df)

# pd.concat raises on an empty list; keep the original result (an empty frame)
# when no CSV file was found.
data_set = pd.concat(loaded_frames, ignore_index=True) if loaded_frames else pd.DataFrame()

data_set

In [None]:

def get_stat_features(frame):
    """Compute statistical features of the three accelerometer axes.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the numeric columns ``accelerometer_X``,
        ``accelerometer_Y`` and ``accelerometer_Z``. Any other column (for
        example the text label ``activity``) is ignored.

    Returns
    -------
    dict
        39 entries mapping feature name to value, in this order:
        ``skew``, ``kurt``, ``max``, ``min``, ``mean``, ``std``, ``var``,
        ``median``, ``idxmax``, ``idxmin`` (each with ``_X``, ``_Y``, ``_Z``),
        then ``corr_XY``, ``corr_XZ``, ``corr_YZ``, then ``mae_X..Z`` and
        ``rmse_X..Z`` (mean absolute / root-mean-square deviation of every
        axis from its own mean).

    Raises
    ------
    KeyError
        If one of the three accelerometer columns is missing.
    """
    axes = ('X', 'Y', 'Z')
    axis_columns = [f'accelerometer_{axis}' for axis in axes]

    # Select the signal columns explicitly instead of "everything except
    # 'activity'": the feature names below are tied to X, Y, Z in this order,
    # so an extra or reordered column must not shift values under wrong names.
    frame = frame[axis_columns]

    mean_values = frame.mean(axis=0)

    # Per-axis statistics; every Series is indexed by axis_columns, in order.
    per_axis_stats = [
        ('skew', frame.skew(axis=0)),
        ('kurt', frame.kurt(axis=0)),
        ('max', frame.max(axis=0)),
        ('min', frame.min(axis=0)),
        ('mean', mean_values),
        ('std', frame.std(axis=0)),
        ('var', frame.var(axis=0)),
        ('median', frame.median(axis=0)),
        ('idxmax', frame.idxmax(axis=0)),
        ('idxmin', frame.idxmin(axis=0)),
    ]
    feature_names = [f'{stat}_{axis}' for stat, _ in per_axis_stats for axis in axes]
    feature_values = [series.values for _, series in per_axis_stats]

    # Pairwise correlation between the accelerometer axes.
    correlation = frame.corr()
    axis_pairs = [('X', 'Y'), ('X', 'Z'), ('Y', 'Z')]
    feature_names += [f'corr_{first}{second}' for first, second in axis_pairs]
    feature_values.append(np.array([
        correlation.loc[f'accelerometer_{first}', f'accelerometer_{second}']
        for first, second in axis_pairs
    ]))

    # MAE and RMSE of every axis around its mean. The subtraction is done in
    # floating point; the previous np.full_like(column, mean) inherited the
    # column dtype and therefore truncated the mean for integer columns.
    deviations = frame - mean_values
    mae = deviations.abs().mean(axis=0).values
    rmse = np.sqrt((deviations ** 2).mean(axis=0).values)
    feature_names += [f'{metric}_{axis}' for metric in ('mae', 'rmse') for axis in axes]
    feature_values += [mae, rmse]

    # Flatten into one array (as before, this gives all values a common
    # dtype) and pair every value with its name.
    features = np.concatenate(feature_values, axis=0)
    return dict(zip(feature_names, features))

In [None]:
# Smoke test on the whole data set: the result should hold 39 features
# (30 per-axis statistics + 3 correlations + 6 MAE/RMSE values).
len(get_stat_features(data_set))

In [None]:
# get_stat_features(data_set)

---
# Data Preparation
---

In [None]:
# Summary statistics of the numeric columns.
data_set.describe()

In [None]:
# Column dtypes, non-null counts and memory footprint.
data_set.info()

---
# Навчання моделі без часових ознак
---

In [None]:
# Helper that draws the confusion matrices of several classifiers
def plot_confusion_matrix(conf_matrices, classifiers, activities, axes):
    """Draw one annotated confusion-matrix heatmap per classifier.

    Parameters
    ----------
    conf_matrices : sequence
        Confusion matrices; entry ``i`` belongs to the i-th classifier.
        Cell values are rendered with integer formatting.
    classifiers : iterable
        Classifier names, used in the subplot titles.
    activities : sequence
        Tick labels used on both axes of every heatmap.
    axes : sequence
        Target axes; entry ``i`` receives the heatmap of the i-th classifier.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    palette = sns.color_palette("ch:s=.5,r=-.75", as_cmap=True)

    # Options shared by every heatmap.
    heatmap_options = dict(
        annot=True,
        fmt='d',
        cmap=palette,
        xticklabels=activities,
        yticklabels=activities,
    )

    for position, name in enumerate(classifiers):
        matrix = conf_matrices[position]
        target_ax = axes[position]

        sns.heatmap(matrix, ax=target_ax, **heatmap_options)

        target_ax.set_title(f"Confusion Matrix for {name} Classifier")
        target_ax.set_xlabel("Predicted activity")
        target_ax.set_ylabel("True activity")

    plt.show()

In [None]:
# Основний код
features = data_set.columns[:-1]
X = data_set[features]
y = data_set[data_set.columns[-1]]

scaler = StandardScaler()
normalized_data = scaler.fit_transform(X)

norm_data = data_set.copy()
norm_data[features] = normalized_data

In [None]:
X_train, X_test, y_train, y_test = train_test_split(norm_data[features], norm_data['activity'], test_size=0.3, stratify=norm_data['activity'])

In [None]:
# NOTE(review): X_fft_stat is not defined in any cell above this one, so on a
# fresh kernel (Restart & Run All) this line raises NameError. Presumably it is
# created in a later FFT-feature section — TODO move this check below that
# definition, or remove it.
np.array_equal(X, X_fft_stat)

In [None]:
# Report the shapes of the training and test partitions.
print("Розмір тренувальної вибірки:", X_train.shape)
print("Розмір тестової вибірки:", X_test.shape)


In [None]:
# Model training: RBF-kernel SVM.
# probability=True enables predict_proba; scikit-learn fits those probability
# estimates with an internal randomized procedure, so random_state is fixed to
# make the probabilities (and everything derived from them) reproducible.
model_svm = SVC(
    decision_function_shape='ovo',
    kernel='rbf',
    gamma=0.005,
    probability=True,
    random_state=42,
)
model_svm.fit(X_train, y_train)

In [None]:
# True activity labels of the test rows.
y_test

In [None]:
# Decision-function scores of the SVM for the test rows
# (despite the variable name, these are scores, not predicted labels).
y_test_pred = model_svm.decision_function(X_test)
y_test_pred

In [None]:
# Per-class probability estimates for the test rows.
y_test_proba = model_svm.predict_proba(X_test)
y_test_proba

In [None]:
# Index of the most probable class for every test row.
# The probabilities are recomputed from the model instead of being read from
# the previous value of y_test_proba: the cell used to overwrite its own input,
# so running it a second time failed on the already reduced 1-D array.
y_test_proba = np.argmax(model_svm.predict_proba(X_test), axis=1)
y_test_proba

In [None]:
# Number of test rows assigned to each of the class indices 0..3.
# NOTE(review): the index -> activity mapping (0 idle, 1 running, 2 stairs,
# 3 walking) is taken from the variable names — confirm against model_svm.classes_.
count_idle, count_running, count_stairs, count_walking = (
    int((y_test_proba == class_index).sum()) for class_index in range(4)
)

In [None]:
# Number of test rows assigned to class index 0.
count_idle

In [None]:
# Number of test rows assigned to class index 1.
count_running

In [None]:
# Number of test rows assigned to class index 2.
count_stairs

In [None]:
# Number of test rows assigned to class index 3.
count_walking

In [None]:
# y_test holds activity names, while y_test_proba holds integer class indices
# (argmax over the predict_proba columns). Translate the indices to labels via
# model_svm.classes_ before scoring; comparing names with integers never matches.
accuracy = accuracy_score(y_test, model_svm.classes_[y_test_proba])
accuracy

In [None]:
%%time
# Random forest baseline; random_state fixes the bootstrap samples and feature
# draws so that the reported scores are reproducible across runs.
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)

In [None]:
# Accuracy evaluation: mean accuracy of both models on the test partition.
score_svm = model_svm.score(X_test, y_test)
score_rf = model_rf.score(X_test, y_test)

print("Точність моделі SVM:", score_svm)
print("Точність моделі Random Forest:", score_rf)

In [None]:
# Prediction: class labels predicted by both models for the test rows.
y_pred_svm = model_svm.predict(X_test)
y_pred_rf = model_rf.predict(X_test)

In [None]:
# Обчислення confusion matrix
conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)

# Візуалізація confusion matrix
fig, axes = plt.subplots(1, 2, figsize=(11, 4))