# Model Building
The dataset has a total of 561 features, which is a lot. We therefore first rank the features by importance using a `RandomForestClassifier`, keep only the most important ones, and then train a multiclass classification model on the reduced feature set.

In [3]:
! pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.3-cp311-cp311-macosx_14_0_arm64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.0-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-macosx_12_0_arm64.whl (11.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.1/11.1 MB[0m [31m977.0 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading joblib-1.5.0-py3-none-any.whl (307 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.7/307.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDow

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
import pandas as pd

# Read the processed training split from disk.
df = pd.read_csv('../data/processed/train.csv')

# Columns excluded from the feature matrix; "Activity" is used as the target.
drop_selected = ["subject", "Activity", "ActivityName"]
y_train = df.loc[:, "Activity"]
x_train = df.drop(drop_selected, axis=1)

In [12]:
# Fit a 100-tree random forest on x_train / y_train with a fixed random_state.
forest_params = {"n_estimators": 100, "random_state": 42}
rfc = RandomForestClassifier(**forest_params)
rfc.fit(x_train, y_train)

In [13]:
import numpy as np

# Per-feature importance scores from the fitted forest, and the column
# positions ordered from the highest score to the lowest.
importances = rfc.feature_importances_
indices = np.flip(np.argsort(importances))

In [15]:
# Number of highest-ranked features to keep.
N = 100

# Map the first N ranked column positions back to column names, then
# restrict the training frame to just those columns.
top_indices = indices[:N]
top_feature_names = x_train.columns[top_indices]
x_train_selected = x_train.loc[:, top_feature_names]

# NOTE(review): the saved output of this cell lists both
# 'fBodyAccJerk-bandsEnergy()-1,16' and 'fBodyAccJerk-bandsEnergy()-1,16.1'.
# A '.1' suffix like that is what pandas appends to repeated CSV headers, so
# the source data may contain duplicate column names - confirm.
print("Top", N, "features:")
print(list(top_feature_names))

Top 100 features:
['tGravityAcc-mean()-X', 'tGravityAcc-max()-X', 'angle(X,gravityMean)', 'tGravityAcc-mean()-Y', 'tGravityAcc-energy()-X', 'angle(Y,gravityMean)', 'tGravityAcc-min()-X', 'tGravityAcc-min()-Y', 'tGravityAcc-max()-Y', 'tGravityAcc-energy()-Y', 'tBodyAccJerk-std()-X', 'angle(Z,gravityMean)', 'tGravityAcc-arCoeff()-Z,2', 'fBodyAccMag-energy()', 'tGravityAccMag-std()', 'fBodyAccJerk-bandsEnergy()-1,16', 'tGravityAcc-energy()-Z', 'fBodyAccJerk-bandsEnergy()-1,16.1', 'fBodyAccJerk-max()-X', 'tBodyAccMag-std()', 'fBodyAcc-mean()-X', 'tGravityAcc-mean()-Z', 'tGravityAcc-min()-Z', 'fBodyAccMag-mad()', 'tGravityAcc-arCoeff()-Z,1', 'fBodyAccJerk-energy()-X', 'fBodyAccJerk-bandsEnergy()-1,24', 'tBodyAccJerkMag-energy()', 'fBodyAcc-bandsEnergy()-1,24', 'fBodyAcc-entropy()-X', 'tBodyAccJerk-sma()', 'tBodyGyroJerk-iqr()-Z', 'tGravityAcc-entropy()-Y', 'tBodyAcc-correlation()-X,Y', 'fBodyAcc-mad()-X', 'tGravityAcc-arCoeff()-Y,1', 'tGravityAcc-arCoeff()-X,2', 'tBodyAccJerk-mad()-Y', 'fBo

In [16]:
# Write the selected feature names to a text file, one name per line.
output_path = 'top_features.txt'

# An explicit encoding keeps the written bytes independent of the platform's
# locale default.
with open(output_path, 'w', encoding='utf-8') as f:
    f.writelines(f"{feature}\n" for feature in top_feature_names)

# Report the same path that was actually written.
print(f"Top features saved to '{output_path}'.")

Top features saved to 'top_features.txt'.
