In [1]:
import pandas as pd
import numpy as np
import zipfile
import os

from scipy.stats import entropy, skew
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report

In [2]:
# Name of the archive to unpack; it is resolved relative to the working directory.
zip_files = 'homework.zip'

# Unpack everything into 'unzip_folder'. The context manager closes the
# archive handle even if extraction fails part-way through.
with zipfile.ZipFile(zip_files, mode='r') as archive:
    archive.extractall(path='unzip_folder')

In [3]:
def load_data_frame(folder_path):
    """Summarise every CSV file in ``folder_path`` into one feature row.

    Each ``*.csv`` file in the folder is read with ``pd.read_csv`` and reduced
    to the max, min, mean and standard deviation of its ``accelerometer_X``,
    ``accelerometer_Y`` and ``accelerometer_Z`` columns. Files with any other
    extension are ignored.

    Parameters
    ----------
    folder_path : str
        Directory holding the CSV files. Its last path component is used as
        the value of the ``type_activity`` column; a trailing path separator
        is tolerated.

    Returns
    -------
    pandas.DataFrame
        One row per CSV file with columns ``max_x, min_x, mean_x, std_x``,
        the same four for ``y`` and ``z``, and ``type_activity``. Rows follow
        the alphabetical order of the file names and are indexed ``0..n-1``.
        The frame is empty (no columns) when the folder has no CSV files.

    Raises
    ------
    FileNotFoundError
        If ``folder_path`` does not exist (raised by ``os.listdir``).
    KeyError
        If a CSV file lacks one of the three accelerometer columns.
    """
    # normpath drops a trailing separator, so 'data/walking/' is still
    # labelled 'walking' instead of the empty string.
    type_activity = os.path.basename(os.path.normpath(folder_path))

    records = []
    # os.listdir returns names in arbitrary, filesystem-dependent order;
    # sorting makes the row order (and any seeded split built on top of it)
    # identical from run to run and machine to machine.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.csv'):
            continue

        samples = pd.read_csv(os.path.join(folder_path, filename))

        record = {}
        for axis in ('X', 'Y', 'Z'):
            column = samples[f'accelerometer_{axis}']
            suffix = axis.lower()
            record[f'max_{suffix}'] = column.max()
            record[f'min_{suffix}'] = column.min()
            record[f'mean_{suffix}'] = column.mean()
            record[f'std_{suffix}'] = column.std()
        record['type_activity'] = type_activity
        records.append(record)

    # Build the frame once from all records: concatenating inside the loop
    # re-copies every previous row on each iteration (quadratic in file count).
    return pd.DataFrame(records)

In [4]:
# One folder per activity; each is passed to load_data_frame below.
walk_path, stairs_path, run_path, idle_path = (
    'unzip_folder/data/walking',
    'unzip_folder/data/stairs',
    'unzip_folder/data/running',
    'unzip_folder/data/idle',
)

# Load the folders in the same order as the paths above, one table per activity.
walk_df, stairs_df, run_df, idle_df = [
    load_data_frame(activity_path)
    for activity_path in (walk_path, stairs_path, run_path, idle_path)
]

In [5]:
# Stack the four per-activity tables into a single frame with a fresh 0..n-1 index.
df = pd.concat(
    [walk_df, stairs_df, run_df, idle_df],
    ignore_index=True,
)

In [6]:
# Show the combined table built by the pd.concat call in the previous cell.
df

Unnamed: 0,max_x,min_x,mean_x,std_x,max_y,min_y,mean_y,std_y,max_z,min_z,mean_z,std_z,type_activity
0,2.250550,-18.598158,-4.820326,5.467887,7.584831,-25.866955,-10.978692,7.111740,19.685127,-20.738573,-0.810676,6.969859,walking
1,6.737284,-9.844957,-2.228842,3.556321,-0.344765,-19.632454,-10.042080,4.690954,18.698715,-12.607866,0.374134,5.853862,walking
2,6.737284,-9.892841,-2.237621,3.245055,-1.288081,-15.863980,-9.840169,4.115973,18.698715,-14.255077,-0.753535,6.163275,walking
3,1.853112,-14.456190,-4.655764,4.760885,-1.403002,-19.895817,-10.132740,5.080454,37.138855,-15.691598,-0.681071,10.253374,walking
4,0.914585,-21.346703,-6.795064,6.167943,0.349553,-25.905262,-11.950578,6.242697,12.775460,-17.659632,-3.784914,7.412519,walking
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6457,0.301669,0.181959,0.249157,0.029370,0.038307,-0.076614,-0.022665,0.026315,9.792285,9.739613,9.766907,0.015562,idle
6458,0.670377,-0.531513,0.003671,0.245185,6.957550,4.132392,5.714162,0.798776,9.097966,6.042965,7.742210,0.709080,idle
6459,3.830723,-0.507571,0.511561,0.873667,8.207323,-2.595315,2.178245,2.960407,10.084377,5.583278,8.802682,1.339663,idle
6460,2.341529,-1.728614,0.219469,0.590283,6.904878,-0.995988,0.438299,1.702251,10.515334,6.502652,9.625808,0.669446,idle


In [7]:

# Target is the activity label; features are all remaining columns.
y = df['type_activity']
X = df.drop(columns='type_activity')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Random Forest
# random_state pins the bootstrap samples and per-split feature sub-sampling,
# so the report printed below is the same on every "Restart & Run All".
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)
predictions_rf = model_rf.predict(X_test)
print("Random Forest:")
print(classification_report(y_test, predictions_rf))

# SVM
# Default SVC (RBF kernel) trained on the same split for comparison.
model_svm = svm.SVC()
model_svm.fit(X_train, y_train)
predictions_svm = model_svm.predict(X_test)
print("SVM:")
print(classification_report(y_test, predictions_svm))

Random Forest:
              precision    recall  f1-score   support

        idle       1.00      1.00      1.00       190
     running       1.00      1.00      1.00       677
      stairs       1.00      1.00      1.00        35
     walking       1.00      1.00      1.00       391

    accuracy                           1.00      1293
   macro avg       1.00      1.00      1.00      1293
weighted avg       1.00      1.00      1.00      1293

SVM:
              precision    recall  f1-score   support

        idle       1.00      1.00      1.00       190
     running       1.00      1.00      1.00       677
      stairs       0.90      0.74      0.81        35
     walking       0.98      0.99      0.98       391

    accuracy                           0.99      1293
   macro avg       0.97      0.93      0.95      1293
weighted avg       0.99      0.99      0.99      1293



Обидві моделі добре підходять для цього типу задачі. На цьому наборі даних RandomForest показала себе краще, ніж SVM: вона демонструє відмінні точність, повноту та F1-score (1.00 після округлення) для всіх класів, тоді як SVM помітно гірше розпізнає клас stairs (F1-score 0.81).

In [10]:
# Random Forest
# Candidate values sampled by the randomized search.
param_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Two sources of randomness are seeded so "best parameters" is reproducible:
# the forest itself, and the search's choice of which 10 combinations to try.
random_rf = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_rf,
    n_iter=10,
    cv=5,
    random_state=42,
)
random_rf.fit(X_train, y_train)

print("Best parameters for Random Forest: ", random_rf.best_params_)

# SVM
# Candidate values sampled by the randomized search.
param_svm = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}

# Seed the search so the same 10 of the 64 combinations are evaluated each run.
random_svm = RandomizedSearchCV(
    svm.SVC(),
    param_svm,
    n_iter=10,
    cv=5,
    random_state=42,
)
random_svm.fit(X_train, y_train)

print("Best parameters for SVM: ", random_svm.best_params_)

Best parameters for Random Forest:  {'n_estimators': 100, 'min_samples_split': 5, 'max_depth': 20}
Best parameters for SVM:  {'kernel': 'poly', 'gamma': 1, 'C': 100}


RandomizedSearchCV працює подібно до GridSearchCV, але замість того, щоб перебирати всі можливі комбінації параметрів, він випадковим чином вибирає задану кількість комбінацій (параметр n_iter). Це може бути набагато швидше, особливо якщо параметрів для налаштування багато.