<a href="https://colab.research.google.com/github/Pelmenoff/data_science/blob/main/hw5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import zipfile
import os
import pandas as pd

zip_path = 'homework.zip'
extract_path = 'homework'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

data_dict = {}
for root, dirs, files in os.walk(extract_path):
    for file in files:
        if file.endswith('.csv'):
            file_path = os.path.join(root, file)
            user_id = file.split('.')[0]
            df = pd.read_csv(file_path)
            data_dict[user_id] = df

print(data_dict.keys())

dict_keys(['stairs-60', 'stairs-163', 'stairs-46', 'stairs-13', 'stairs-95', 'stairs-74', 'stairs-86', 'stairs-77', 'stairs-116', 'stairs-57', 'stairs-65', 'stairs-84', 'stairs-107', 'stairs-101', 'stairs-160', 'stairs-148', 'stairs-125', 'stairs-32', 'stairs-164', 'stairs-104', 'stairs-51', 'stairs-132', 'stairs-72', 'stairs-151', 'stairs-28', 'stairs-78', 'stairs-1', 'stairs-108', 'stairs-133', 'stairs-58', 'stairs-150', 'stairs-41', 'stairs-45', 'stairs-134', 'stairs-33', 'stairs-62', 'stairs-26', 'stairs-122', 'stairs-130', 'stairs-136', 'stairs-127', 'stairs-158', 'stairs-34', 'stairs-25', 'stairs-113', 'stairs-5', 'stairs-52', 'stairs-102', 'stairs-97', 'stairs-15', 'stairs-157', 'stairs-123', 'stairs-73', 'stairs-105', 'stairs-10', 'stairs-43', 'stairs-79', 'stairs-129', 'stairs-17', 'stairs-119', 'stairs-53', 'stairs-100', 'stairs-50', 'stairs-111', 'stairs-56', 'stairs-149', 'stairs-91', 'stairs-93', 'stairs-35', 'stairs-69', 'stairs-63', 'stairs-110', 'stairs-48', 'stairs-36'

In [5]:
import numpy as np

def extract_time_domain_features(data):
    features = {}
    for axis in ['X', 'Y', 'Z']:
        values = data[f'accelerometer_{axis}']
        features[f'mean_{axis}'] = np.mean(values)
        features[f'median_{axis}'] = np.median(values)
        features[f'std_{axis}'] = np.std(values)
        features[f'min_{axis}'] = np.min(values)
        features[f'max_{axis}'] = np.max(values)
        features[f'range_{axis}'] = np.max(values) - np.min(values)
        features[f'rms_{axis}'] = np.sqrt(np.mean(np.square(values)))
        features[f'var_{axis}'] = np.var(values)
    return features

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

all_features = []
all_labels = []

for user_id, df in data_dict.items():
    features = extract_time_domain_features(df)
    activity = user_id.split('-')[0]
    all_features.append(features)
    all_labels.append(activity)

label_encoder = LabelEncoder()
all_labels_encoded = label_encoder.fit_transform(all_labels)

X_train, X_test, y_train, y_test = train_test_split(all_features, all_labels_encoded, test_size=0.2, random_state=42)

X_train_list = [list(features.values()) for features in X_train]
X_test_list = [list(features.values()) for features in X_test]


X_train_array = np.array(X_train_list)
X_test_array = np.array(X_test_list)

In [9]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

svm_model = SVC(kernel='linear')
svm_model.fit(X_train_array, y_train)
svm_predictions = svm_model.predict(X_test_array)

svm_report = classification_report(y_test, svm_predictions, target_names=label_encoder.classes_)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_array, y_train)
rf_predictions = rf_model.predict(X_test_array)

rf_report = classification_report(y_test, rf_predictions, target_names=label_encoder.classes_)

print("SVM Classification Report:")
print(svm_report)
print("\nRandom Forest Classification Report:")
print(rf_report)

SVM Classification Report:
              precision    recall  f1-score   support

        idle       1.00      1.00      1.00       194
     running       1.00      1.00      1.00       673
      stairs       0.94      0.72      0.82        40
     walking       0.97      0.99      0.98       386

    accuracy                           0.99      1293
   macro avg       0.98      0.93      0.95      1293
weighted avg       0.99      0.99      0.99      1293


Random Forest Classification Report:
              precision    recall  f1-score   support

        idle       1.00      1.00      1.00       194
     running       1.00      1.00      1.00       673
      stairs       1.00      0.93      0.96        40
     walking       0.99      1.00      1.00       386

    accuracy                           1.00      1293
   macro avg       1.00      0.98      0.99      1293
weighted avg       1.00      1.00      1.00      1293



In [13]:
example_data = {
    'accelerometer_X': [2.978387, 0.43095600000000006, 13.924676999999999, 1.58975, 1.733402],
    'accelerometer_Y': [-0.948104000000000, 8.872911, 6.224925, -6.933608, 7.268797],
    'accelerometer_Z': [2.221819, 2.475605, 0.8906430000000001, -4.003105000000001, 2.662352]
}

example_features = extract_time_domain_features(pd.DataFrame(example_data))
example_features = extract_time_domain_features(pd.DataFrame(example_data, index=[1, 2, 3, 4, 5]))

example_features_list = list(example_features.values())
example_features_array = np.array([example_features_list])

svm_prediction = svm_model.predict(example_features_array)

rf_prediction = rf_model.predict(example_features_array)

predicted_activity_svm = label_encoder.inverse_transform(svm_prediction)
predicted_activity_rf = label_encoder.inverse_transform(rf_prediction)

print("SVM Model Prediction:", predicted_activity_svm[0])
print("Random Forest Model Prediction:", predicted_activity_rf[0])

SVM Model Prediction: running
Random Forest Model Prediction: running
