In [65]:
import pandas as pd
import os
import numpy as np
import zipfile
import matplotlib.pyplot as plt
import graphviz
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [56]:
# Location of the homework archive and the directory it is unpacked into.
zip_file_path = '/homework.zip'
target_folder = '/content'
os.makedirs(target_folder, exist_ok=True)

# Unpack the whole archive; the CSV files are expected under <target_folder>/data.
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
  zip_ref.extractall(target_folder)

work_data = os.path.join(target_folder, 'data')

# Collect one frame per CSV and concatenate once at the end: calling
# pd.concat inside the loop re-copies all accumulated rows on every
# iteration (quadratic) and starts from an empty DataFrame.
activity_frames = []

# sorted() makes the row order independent of the filesystem's listing order.
for folder in sorted(os.listdir(work_data)):
  folder_name = os.path.join(work_data, folder)

  # Only sub-directories are read; each directory name becomes the label.
  if not os.path.isdir(folder_name):
    continue

  for file in sorted(os.listdir(folder_name)):
    file_path = os.path.join(folder_name, file)

    if not file.endswith('.csv'):
      continue

    df = pd.read_csv(file_path)
    df['activity'] = folder
    activity_frames.append(df)

# Keep the original fallback: an empty DataFrame when no CSV file was found
# (pd.concat raises ValueError on an empty list).
if activity_frames:
  all_data = pd.concat(activity_frames, ignore_index=True)
else:
  all_data = pd.DataFrame()

In [58]:
# Shuffle all rows (frac=1 keeps every row) and rebuild a 0..n-1 index.
# A fixed random_state keeps the shuffle reproducible on a fresh
# Restart & Run All; without it every run produces a different ordering.
all_data = all_data.sample(frac=1, random_state=42).reset_index(drop=True)
all_data.head(5)

Unnamed: 0,accelerometer_X,accelerometer_Y,accelerometer_Z,activity
0,31.172506,36.11414,15.361198,running
1,0.881066,16.294937,12.133814,running
2,0.268151,-0.023942,9.744401,idle
3,-3.433285,-14.355634,-5.027823,running
4,-8.322244,-2.839523,-12.018892,walking


In [4]:
# Sanity check: (number of rows, number of columns) of the combined frame.
all_data.shape

(193860, 4)

In [59]:
# Feature matrix: every column of the combined frame except the label.
X = all_data.drop(columns=['activity'])
X

Unnamed: 0,accelerometer_X,accelerometer_Y,accelerometer_Z
0,31.172506,36.114140,15.361198
1,0.881066,16.294937,12.133814
2,0.268151,-0.023942,9.744401
3,-3.433285,-14.355634,-5.027823
4,-8.322244,-2.839523,-12.018892
...,...,...,...
193855,8.164228,-11.631032,-1.034295
193856,20.580557,6.363788,0.804452
193857,2.628834,-4.697424,-3.892972
193858,4.050989,-9.299079,-8.269572


In [8]:
# Target vector: the activity label assigned to each row while loading.
y = all_data.loc[:, 'activity']
y

0         walking
1         walking
2         walking
3         running
4         walking
           ...   
193855    walking
193856       idle
193857    running
193858    walking
193859    running
Name: activity, Length: 193860, dtype: object

- Розділення на тренувальну та тестову вибірки

In [9]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.2,
  random_state=42,
)

- Часові ознаки

In [10]:
# Add a `time_mean` column holding the per-row mean of the three accelerometer axes.
# NOTE(review): this averages across axes within a single reading, not over a
# time window — confirm that this is the intended "time feature".
axis_columns = ['accelerometer_X', 'accelerometer_Y', 'accelerometer_Z']

# Averaging only the explicit axis columns keeps the cell idempotent: with a
# bare `.mean(axis=1)`, re-running the cell would fold the previously computed
# `time_mean` into the new mean. `.assign` returns a new frame, which also
# avoids a SettingWithCopyWarning on the frames returned by train_test_split.
X_train = X_train.assign(time_mean=X_train[axis_columns].mean(axis=1))
X_test = X_test.assign(time_mean=X_test[axis_columns].mean(axis=1))
X_train, X_test

(        accelerometer_X  accelerometer_Y  accelerometer_Z  time_mean
 186383         3.122039        -4.481945        -3.509899  -1.623268
 85476         -1.517924        -5.214571         1.034295  -1.899400
 91093        -12.220005       -16.132132         5.329493  -7.674215
 89073         -1.508347       -10.467450        -1.034295  -4.336697
 124211         3.964798        -0.253785        -0.090980   1.206678
 ...                 ...              ...              ...        ...
 119879        -7.920019         0.651223        32.455800   8.395668
 103694         9.768343        30.166940        -0.325611  13.203224
 131932        -1.426944         6.493075        -3.122039   0.648031
 146867        -0.081403         0.220267         9.773131   3.303998
 121958         0.229843         0.023942         9.792285   3.348690
 
 [155088 rows x 4 columns],
         accelerometer_X  accelerometer_Y  accelerometer_Z  time_mean
 171248        -0.416591       -11.080365        -2.097321  

- Моделі класифікації

In [20]:
# Two classifiers to compare, both with library-default hyperparameters.
svm_model = SVC()
# Random forests are stochastic (bootstrap samples, random feature subsets);
# a fixed seed makes the fitted model and its scores reproducible across runs.
rfc_model = RandomForestClassifier(random_state=42)

- Тренування SVC

In [23]:
%%time
# Fit the SVC on the training split; %%time reports the cell's CPU and wall time.
svm_model.fit(X_train, y_train)



CPU times: user 6min 17s, sys: 531 ms, total: 6min 17s
Wall time: 6min 20s


- Тренування RFC

In [24]:
%%time
# Fit the random forest on the same training split, timed for comparison with the SVC.
rfc_model.fit(X_train, y_train)

CPU times: user 22.5 s, sys: 41.8 ms, total: 22.5 s
Wall time: 22.6 s


- Прогнозування на тестових даних

In [29]:
%%time
# Predict activity labels for the held-out test split with the SVC (timed).
svm_prediction = svm_model.predict(X_test)

CPU times: user 1min 36s, sys: 91.1 ms, total: 1min 36s
Wall time: 1min 37s


In [28]:
%%time
# Predict activity labels for the held-out test split with the random forest (timed).
rfc_prediction = rfc_model.predict(X_test)

CPU times: user 784 ms, sys: 2.97 ms, total: 787 ms
Wall time: 939 ms


- Оцінка результатів

In [31]:
# Share of test rows whose predicted label matches the true label, per model.
svm_accuracy, rfc_accuracy = (
  accuracy_score(y_test, predicted)
  for predicted in (svm_prediction, rfc_prediction)
)

for caption, score in (
  ("Accuracy (SVM):", svm_accuracy),
  ("Accuracy (Random Forest):", rfc_accuracy),
):
  print(caption, score)

Accuracy (SVM): 0.8928350355926957
Accuracy (Random Forest): 0.9997936655318271


In [37]:
# Per-class precision / recall / F1 tables for both models on the test split.
svm_report = classification_report(y_test, svm_prediction)
rfc_report = classification_report(y_test, rfc_prediction)

for heading, report in (
  ("SVM Results:", svm_report),
  ("RFC Results:", rfc_report),
):
  print(heading)
  print(report)



SVM Results:
              precision    recall  f1-score   support

        idle       0.96      0.99      0.97      6220
     running       0.93      0.90      0.92     20519
      stairs       1.00      0.00      0.00      1002
     walking       0.79      0.91      0.85     11031

    accuracy                           0.89     38772
   macro avg       0.92      0.70      0.68     38772
weighted avg       0.90      0.89      0.88     38772

RFC Results:
              precision    recall  f1-score   support

        idle       1.00      1.00      1.00      6220
     running       1.00      1.00      1.00     20519
      stairs       1.00      1.00      1.00      1002
     walking       1.00      1.00      1.00     11031

    accuracy                           1.00     38772
   macro avg       1.00      1.00      1.00     38772
weighted avg       1.00      1.00      1.00     38772



На підставі розрахунків:


*   Навчання моделі SVC тривало 6 хвилин 20 секунд.

*   Навчання моделі Random Forest тривало 22,6 секунди.


Прогнозування тестових даних зайняло:

*   Для моделі SVC - 1 хвилину 37 секунд
*   Для випадкового лісу - менше секунди

Також слід зазначити, що за метрикою accuracy Random Forest дає точніші результати прогнозування, ніж SVC (≈0,9998 проти ≈0,8928); крім того, SVC майже не розпізнає клас stairs (recall 0,00).