<a href="https://colab.research.google.com/github/VadymBoyko/DS-HW01/blob/main/HW5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [110]:
import os
import zipfile
import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis, entropy
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

In [69]:
zip_ref = zipfile.ZipFile('homework.zip', 'r')
zip_ref.extractall('')

In [70]:
# формуємо фрейми з різними видами діяльності
def load_raw_data(folder_path):
    df_raw_data = pd.DataFrame()
    file_list = os.listdir(folder_path)
    for file_index, file_name in enumerate(file_list):
        if file_name.endswith('.csv'):
            file_path = os.path.join(folder_path, file_name)
            df = pd.read_csv(file_path)
            df['File_Index'] = file_index
            df_raw_data = pd.concat([df_raw_data, df], ignore_index=True)
    return df_raw_data

df_raw_idle = load_raw_data('/content/data/idle/')
df_raw_running = load_raw_data('/content/data/running/')
df_raw_stairs = load_raw_data('/content/data/stairs/')
df_raw_walking = load_raw_data('/content/data/walking/')

In [89]:
# розрахування фічів для фреймів
df_idle_features = pd.DataFrame()
df_running_features = pd.DataFrame()
df_stairs_features = pd.DataFrame()
df_walking_features = pd.DataFrame()

def calculate_features(group):
    features = {}
    features['mean_X'] = group['accelerometer_X'].mean()
    features['mean_Y'] = group['accelerometer_Y'].mean()
    features['mean_Z'] = group['accelerometer_Z'].mean()
    features['var_X'] = group['accelerometer_X'].var()
    features['var_Y'] = group['accelerometer_Y'].var()
    features['var_Z'] = group['accelerometer_Z'].var()
    features['std_X'] = group['accelerometer_X'].std()
    features['std_Y'] = group['accelerometer_Y'].std()
    features['std_Z'] = group['accelerometer_Z'].std()
    features['median_X'] = group['accelerometer_X'].median()
    features['median_Y'] = group['accelerometer_Y'].median()
    features['median_Z'] = group['accelerometer_Z'].median()
    features['max_X'] = group['accelerometer_X'].max()
    features['max_Y'] = group['accelerometer_Y'].max()
    features['max_Z'] = group['accelerometer_Z'].max()
    features['min_X'] = group['accelerometer_X'].min()
    features['min_Y'] = group['accelerometer_Y'].min()
    features['min_Z'] = group['accelerometer_Z'].min()
    #features['entropy_X'] = entropy(group['accelerometer_X'])
    #features['entropy_Y'] = entropy(group['accelerometer_Y'])
    #features['entropy_Z'] = entropy(group['accelerometer_Z'])
    features['skew_X'] = skew(group['accelerometer_X'])
    features['skew_Y'] = skew(group['accelerometer_Y'])
    features['skew_Z'] = skew(group['accelerometer_Z'])
    features['kurt_X'] = kurtosis(group['accelerometer_X'])
    features['kurt_Y'] = kurtosis(group['accelerometer_Y'])
    features['kurt_Z'] = kurtosis(group['accelerometer_Z'])
    features['iqr_X'] = group['accelerometer_X'].quantile(0.75) - group['accelerometer_X'].quantile(0.25)
    features['iqr_Y'] = group['accelerometer_Y'].quantile(0.75) - group['accelerometer_Y'].quantile(0.25)
    features['iqr_Z'] = group['accelerometer_Z'].quantile(0.75) - group['accelerometer_Z'].quantile(0.25)
    features['mad_X'] = np.mean(np.abs(group['accelerometer_X'] - group['accelerometer_X'].mean()))
    features['mad_Y'] = np.mean(np.abs(group['accelerometer_Y'] - group['accelerometer_Y'].mean()))
    features['mad_Z'] = np.mean(np.abs(group['accelerometer_Z'] - group['accelerometer_Z'].mean()))
    features['corr_XY'] = group['accelerometer_X'].corr(group['accelerometer_Y'])
    features['corr_XZ'] = group['accelerometer_X'].corr(group['accelerometer_Z'])
    features['corr_YZ'] = group['accelerometer_Y'].corr(group['accelerometer_Z'])
    return pd.Series(features)

# Розрахунок ознак для кожного значення File_Index
df_running_features = df_raw_running.groupby('File_Index').apply(calculate_features)
df_running_features['activity_type'] = 0
df_stairs_features = df_raw_stairs.groupby('File_Index').apply(calculate_features)
df_stairs_features['activity_type'] = 1
df_walking_features = df_raw_walking.groupby('File_Index').apply(calculate_features)
df_walking_features['activity_type'] = 2
df_idle_features = df_raw_idle.groupby('File_Index').apply(calculate_features)
df_idle_features['activity_type'] = 3

In [90]:
df = pd.concat([df_running_features, df_stairs_features, df_walking_features, df_idle_features]).reset_index(drop=True)

In [91]:
X = np.array(df.iloc[:,0:-1])
y = np.array(df.iloc[:,-1])

In [92]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y)

**SVM**

In [93]:
svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_accuracy = svm_model.score(X_test, y_test)

**Random Forest**

In [94]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
rf_accuracy = rf_model.score(X_test, y_test)

**Перевірка**

In [95]:
print("SVM Accuracy:", svm_accuracy)
print("Random Forest Accuracy:", rf_accuracy)

SVM Accuracy: 0.974477958236659
Random Forest Accuracy: 0.9992266047950503


на підставі даних навчання та перевірки відправцювання моделі, можна зробити висновок, що метод Random Forest показав кращі показники прогнозування/класифікації вида активності

**намагаємось покращити результати**

In [101]:
C = 4
svc_rbf = SVC(
    kernel="rbf",
    C=C,
    decision_function_shape='ovr',
    probability=True,
    gamma=0.05,
    cache_size=500).fit(X_train, y_train)

params = {
    "C": [0.01, 1, 10, 20.],
    "gamma": [0.01, 0.05, 0.2, 0.4,],
}

svc_opt=GridSearchCV(svc_rbf, params, scoring='accuracy', cv=5, verbose=True).fit(X_train, y_train)
svc_opt.best_params_

Fitting 5 folds for each of 16 candidates, totalling 80 fits


{'C': 10, 'gamma': 0.01}

In [102]:
svc_best = SVC(kernel='rbf', C=10, gamma=0.01)
svc_best.fit(X_train, y_train)

In [103]:
svс_best_accuracy = svc_best.score(X_test, y_test)

In [105]:
print("SVM Accuracy:", svm_accuracy)
print("SVM best Accuracy:", svс_best_accuracy)
print("Random Forest Accuracy:", rf_accuracy)

SVM Accuracy: 0.974477958236659
SVM best Accuracy: 0.9938128383604021
Random Forest Accuracy: 0.9992266047950503


In [106]:
rf_best_model = RandomForestClassifier()
param_grid = {'n_estimators': [10, 50, 100, 200, 500]}
grid_search = GridSearchCV(estimator=rf_best_model, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
print("Best n_estimators:", grid_search.best_params_['n_estimators'])

Best n_estimators: 100


In [108]:
rf_best_model = RandomForestClassifier(n_estimators = 100)
rf_best_model.fit(X_train, y_train)
rf_best_accuracy = rf_best_model.score(X_test, y_test)

In [109]:
print("SVM Accuracy:", svm_accuracy)
print("SVM best Accuracy:", svс_best_accuracy)
print("Random Forest Accuracy:", rf_accuracy)
print("Random best Forest Accuracy:", rf_best_accuracy)

SVM Accuracy: 0.974477958236659
SVM best Accuracy: 0.9938128383604021
Random Forest Accuracy: 0.9992266047950503
Random best Forest Accuracy: 1.0


In [116]:
y_pred = rf_best_model.predict(X_test)

f1 = f1_score(y_test, y_pred, average='macro')

print("F1-score:", f1)

F1-score: 1.0


**ВИСНОВОК**

навіть без усяких настроювань, для даних наборів даних метод Random Forest дає 99,9% коретних відповідей, після налаштувань з n_estimators відсоток піднявся до 100%. Метод SVM без налаштувань давав 97,4% з налаштуваннями 99,4%