# Modeling

---


In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score, precision_score
import pickle

### 1. Data Preparation

In [14]:
df = pd.read_pickle("../data/interim/03_data_features.pkl")

df_train = df.drop(["participant", "category", "set"], axis=1)

X = df_train.drop("label", axis=1)
y = df_train["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

### 2. Feature Selection

In [17]:
X_fs = X.drop('duration', axis=1).copy()
y_fs = y.copy()

dt = DecisionTreeClassifier()
sfs = SequentialFeatureSelector(dt, n_features_to_select=10)

sfs.fit(X_fs, y_fs)

selected_feature_indices = sfs.get_support(indices=True)
selected_feature_names = [X_fs.columns[i] for i in selected_feature_indices]


In [18]:
selected_feature_names

['acc_y_temp_mean_ws_5',
 'gyr_z_temp_std_ws_5',
 'gyr_r_temp_mean_ws_5',
 'acc_x_freq_0.0_Hz_ws_14',
 'acc_y_freq_0.0_Hz_ws_14',
 'acc_z_freq_0.0_Hz_ws_14',
 'acc_z_freq_2.143_Hz_ws_14',
 'gyr_z_freq_0.714_Hz_ws_14',
 'acc_r_freq_1.786_Hz_ws_14',
 'gyr_r_freq_0.0_Hz_ws_14']

### 3. Choose the model

In [20]:
params = [
  {
  "min_samples_leaf": [2, 10, 50, 100, 200],
  "n_estimators": [10, 50, 100],
  "criterion": ["gini", "entropy"],
}
]

rf_model = GridSearchCV(RandomForestClassifier(), params, cv=5, scoring="accuracy")

### 4. Train the model


In [21]:
rf_model.fit(X_train[selected_feature_names], y_train)

In [22]:
print(rf_model.best_params_)
print(rf_model.best_estimator_)

{'criterion': 'entropy', 'min_samples_leaf': 2, 'n_estimators': 100}
RandomForestClassifier(criterion='entropy', min_samples_leaf=2)


### 5. Evaluate the model


In [24]:
pred_y_train = rf_model.predict(X_train[selected_feature_names])
pred_y_test = rf_model.predict(X_test[selected_feature_names])

print('Training Performance')
print(f'\tAccuracy: {accuracy_score(y_train, pred_y_train).round(3)}')
print(f"\tF1-Score: {f1_score(y_train, pred_y_train, average='weighted').round(3)}")
print(f"\tRecall: {recall_score(y_train, pred_y_train, average='weighted').round(3)}")
print(f"\tPrecision: {precision_score(y_train, pred_y_train, average='weighted').round(3)}")

print('\nTesting Performance')
print(f'\tAccuracy: {accuracy_score(y_test, pred_y_test).round(4)}')
print(f"\tF1-Score: {f1_score(y_test, pred_y_test, average='weighted').round(4)}")
print(f"\tRecall: {recall_score(y_test, pred_y_test, average='weighted').round(4)}")
print(f"\tPrecision: {precision_score(y_test, pred_y_test, average='weighted').round(4)}")

Training Performance
	Accuracy: 1.0
	F1-Score: 1.0
	Recall: 1.0
	Precision: 1.0

Testing Performance
	Accuracy: 0.9907
	F1-Score: 0.9907
	Recall: 0.9907
	Precision: 0.9907


In [26]:
score_dict = {
    "Accuracy": [(accuracy_score(y_train, pred_y_train) * 100).round(2),
                 (accuracy_score(y_test, pred_y_test) * 100).round(2)],
    "F1-Score": [(f1_score(y_train, pred_y_train, average='weighted') * 100).round(2),
                 (f1_score(y_test, pred_y_test, average='weighted') * 100).round(2)],
    "Recall" : [(recall_score(y_train, pred_y_train, average='weighted') * 100).round(2),
                (recall_score(y_test, pred_y_test, average='weighted') * 100).round(2)],
    "Precision" : [(precision_score(y_train, pred_y_train, average='weighted') * 100).round(2),
                   (precision_score(y_test, pred_y_test, average='weighted') * 100).round(2)]
  }

score = pd.DataFrame(score_dict, index = ['Train', 'Test'])

score

Unnamed: 0,Accuracy,F1-Score,Recall,Precision
Train,99.97,99.97,99.97,99.97
Test,99.07,99.07,99.07,99.07


In [27]:
pickle.dump(rf_model, open('model.pkl', 'wb'))

In [28]:
pickled_model = pickle.load(open('model.pkl', 'rb'))
pickled_model.predict(X_train[selected_feature_names])

array(['row', 'dead', 'rest', ..., 'dead', 'dead', 'dead'], dtype=object)

### 4)hyperparameter tuning


### 5) final evaluation


### 6)Interpretation of model parameters


### 7) final model


# Model deployment

---
