<a href="https://colab.research.google.com/github/Oleksandr190378/data-computing/blob/main/Hm5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Support Vector Machine vs. Random Forest Model: A Comparative Analysis

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score

from scipy.signal import resample
import os
from sklearn.metrics import classification_report

In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
def calculate_features(data):

    data = data.astype({'accelerometer_X': 'float64', 'accelerometer_Y': 'float64', 'accelerometer_Z': 'float64'})

    features = {}

    for axis, col in zip(['x', 'y', 'z'], ['accelerometer_X', 'accelerometer_Y', 'accelerometer_Z']):
        axis_data = data[col]
        features[f'Mean_{axis}'] = axis_data.mean()
        features[f'StdDev_{axis}'] = axis_data.std()
        features[f'Median_{axis}'] = axis_data.median()
        features[f'Range_{axis}'] = axis_data.max() - axis_data.min()


    features_df = pd.DataFrame([features])

    return features_df

In [4]:
def print_classification_report(y_true, y_pred, model_name):
    report = classification_report(y_true, y_pred)
    print(f"Classification Report for {model_name}:\n{report}")

Extracting domain features from motion sensor dataset

In [5]:
columns = ['Mean_x', 'StdDev_x',  'Median_x', 'Range_x',
           'Mean_y', 'StdDev_y',  'Median_y', 'Range_y',
           'Mean_z', 'StdDev_z',  'Median_z', 'Range_z',  'Activity']
features_df = pd.DataFrame(columns=columns)


In [6]:
base_path = '/content/drive/My Drive/Colab Notebooks/homework/data/'

activities = ['running', 'walking', 'idle', 'stairs']

for activity in activities:
    activity_path = os.path.join(base_path, activity)
    for file in os.listdir(activity_path):
        file_path = os.path.join(activity_path, file)
        df = pd.read_csv(file_path)

        features = calculate_features(df)
        features['Activity'] = activity
        features_df = pd.concat([features_df, features], ignore_index=True)


features_df.head()

Unnamed: 0,Mean_x,StdDev_x,Median_x,Range_x,Mean_y,StdDev_y,Median_y,Range_y,Mean_z,StdDev_z,Median_z,Range_z,Activity
0,5.955018,7.603872,4.161123,29.458258,8.974426,10.611267,11.310209,34.672828,0.105824,4.659165,-0.720655,17.640479,running
1,9.899386,9.753004,8.410831,31.713595,8.763257,11.674557,11.820173,36.966474,2.400746,8.854463,1.505953,38.632838,running
2,5.400202,9.230349,2.30801,38.891412,6.868486,10.66236,7.098808,49.90953,3.335921,7.70878,1.194707,37.033511,running
3,6.010244,7.558234,3.409344,36.08541,3.385561,9.057878,5.726931,29.166165,0.857443,6.217254,0.260968,22.960394,running
4,3.264893,6.464272,2.626439,30.765491,8.623117,14.395496,8.058883,47.907977,-2.116953,4.792169,-2.303222,18.511968,running


In [7]:
features_df.shape

(6462, 13)

In [8]:
features_df.iloc[:, :-1].corr()

Unnamed: 0,Mean_x,StdDev_x,Median_x,Range_x,Mean_y,StdDev_y,Median_y,Range_y,Mean_z,StdDev_z,Median_z,Range_z
Mean_x,1.0,0.671501,0.972467,0.601298,0.681804,0.628903,0.702828,0.5645,0.148997,0.198499,-0.004149,0.039225
StdDev_x,0.671501,1.0,0.633196,0.970118,0.572934,0.937052,0.570793,0.919821,-0.415652,0.641217,-0.551836,0.474454
Median_x,0.972467,0.633196,1.0,0.571636,0.68028,0.602364,0.696456,0.537406,0.160738,0.185207,0.013546,0.027107
Range_x,0.601298,0.970118,0.571636,1.0,0.53218,0.906041,0.5274,0.895895,-0.440743,0.659595,-0.568656,0.506542
Mean_y,0.681804,0.572934,0.68028,0.53218,1.0,0.584515,0.990337,0.481209,0.311414,-0.024253,0.179,-0.192274
StdDev_y,0.628903,0.937052,0.602364,0.906041,0.584515,1.0,0.578939,0.973409,-0.455825,0.620153,-0.580234,0.462335
Median_y,0.702828,0.570793,0.696456,0.5274,0.990337,0.578939,1.0,0.474938,0.317733,-0.027714,0.185343,-0.195245
Range_y,0.5645,0.919821,0.537406,0.895895,0.481209,0.973409,0.474938,1.0,-0.523301,0.677872,-0.635728,0.531827
Mean_z,0.148997,-0.415652,0.160738,-0.440743,0.311414,-0.455825,0.317733,-0.523301,1.0,-0.644718,0.960391,-0.653618
StdDev_z,0.198499,0.641217,0.185207,0.659595,-0.024253,0.620153,-0.027714,0.677872,-0.644718,1.0,-0.737691,0.942503


In [23]:
X = features_df.drop('Activity', axis=1)
y = features_df['Activity']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [24]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


Support Vector Machine

In [25]:
svm_model = svm.SVC()
cv_scores = cross_val_score(svm_model, X_train, y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())
svm_model.fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)
print_classification_report(y_test, svm_predictions, "SVM")

Cross-validation scores: [0.98865979 0.98968008 0.98452012 0.9876161  0.9876161 ]
Mean cross-validation score: 0.9876184396710392
Classification Report for SVM:
              precision    recall  f1-score   support

        idle       1.00      1.00      1.00       235
     running       1.00      1.00      1.00       886
      stairs       0.94      0.50      0.65        30
     walking       0.97      1.00      0.98       465

    accuracy                           0.99      1616
   macro avg       0.98      0.87      0.91      1616
weighted avg       0.99      0.99      0.99      1616



Random Forest Model

In [26]:
rf_model = RandomForestClassifier()
rf_scores = cross_val_score(rf_model, X_train, y_train, cv=5)
print("Cross-validation scores:", rf_scores)
print("Mean cross-validation score:", rf_scores.mean())
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
print_classification_report(y_test, rf_predictions, "Random Forest")

Cross-validation scores: [0.99793814 0.99896801 0.99896801 1.         0.99793602]
Mean cross-validation score: 0.9987620354707264
Classification Report for Random Forest:
              precision    recall  f1-score   support

        idle       1.00      1.00      1.00       235
     running       1.00      1.00      1.00       886
      stairs       1.00      0.90      0.95        30
     walking       0.99      1.00      1.00       465

    accuracy                           1.00      1616
   macro avg       1.00      0.97      0.99      1616
weighted avg       1.00      1.00      1.00      1616



Conclusion: The cross-validation scores and classification reports for both the Support Vector Machine (SVM) and Random Forest models indicate high performance across all metrics. The SVM model achieved a mean cross-validation score of approximately 0.9876, with high precision, recall, and F1-scores for all classes except "stairs," where the recall was notably lower at 0.50. The overall accuracy of the SVM model was 0.99.

In contrast, the Random Forest model performed exceptionally well, achieving a mean cross-validation score of approximately 0.9986. It demonstrated perfect precision, recall, and F1-scores for all classes, including "stairs," where the recall improved to 0.90. The overall accuracy of the Random Forest model was 1.00.

Based on these results, the Random Forest model appears to be superior to the SVM model, as it not only achieves higher cross-validation scores but also shows more balanced and improved performance across all classes, including the challenging "stairs" class.

