In [1]:
import mediapipe as mp
import cv2
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_score, accuracy_score, f1_score, recall_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import CalibratedClassifierCV

import warnings
warnings.filterwarnings('ignore')

# Drawing helpers
mp_drawing = mp.solutions.drawing_utils
mp_pose = mp.solutions.pose

### 1. Train model

#### 1.1. Describe data and split dataset

In [2]:
def rescale_frame(frame, percent=50):
    '''
    Rescale a frame to a certain percentage compare to its original frame
    '''
    width = int(frame.shape[1] * percent/ 100)
    height = int(frame.shape[0] * percent/ 100)
    dim = (width, height)
    return cv2.resize(frame, dim, interpolation = cv2.INTER_AREA)


def describe_dataset(dataset_path: str):
    '''
    Describe dataset
    '''

    data = pd.read_csv(dataset_path)
    print(f"Headers: {list(data.columns.values)}")
    print(f'Number of rows: {data.shape[0]} \nNumber of columns: {data.shape[1]}\n')
    print(f"Labels: \n{data['label'].value_counts()}\n")
    print(f"Missing values: {data.isnull().values.any()}\n")
    
    duplicate = data[data.duplicated()]
    print(f"Duplicate Rows : {len(duplicate.sum(axis=1))}")

    return data


def round_up_metric_results(results) -> list:
    '''Round up metrics results such as precision score, recall score, ...'''
    return list(map(lambda el: round(el, 3), results))

In [3]:
# load dataset
df = describe_dataset("./train.csv")

# Categorizing label
df.loc[df["label"] == "UP", "label"] = 0
df.loc[df["label"] == "DOWN", "label"] = 1

Headers: ['label', 'left_shoulder_x', 'left_shoulder_y', 'left_shoulder_z', 'left_shoulder_v', 'right_shoulder_x', 'right_shoulder_y', 'right_shoulder_z', 'right_shoulder_v', 'left_elbow_x', 'left_elbow_y', 'left_elbow_z', 'left_elbow_v', 'right_elbow_x', 'right_elbow_y', 'right_elbow_z', 'right_elbow_v', 'left_wrist_x', 'left_wrist_y', 'left_wrist_z', 'left_wrist_v', 'right_wrist_x', 'right_wrist_y', 'right_wrist_z', 'right_wrist_v']
Number of rows: 1602 
Number of columns: 25

Labels: 
label
UP      834
DOWN    768
Name: count, dtype: int64

Missing values: False

Duplicate Rows : 0


In [4]:
sc = StandardScaler()

In [5]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load datasets
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Convert labels to numeric
train_df["label"] = train_df["label"].map({"UP": 0, "DOWN": 1})
test_df["label"] = test_df["label"].map({"UP": 0, "DOWN": 1})

# Split features and labels
X_train = train_df.drop(columns=["label"])
y_train = train_df["label"]

X_test = test_df.drop(columns=["label"])
y_test = test_df["label"]

# Apply Scaling (Fit on Train, Transform on Test)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Fit & Transform on Training
X_test_scaled = scaler.transform(X_test)        # Only Transform Test

# Save Scaler
with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

print("Data preprocessing complete! Scaling applied.")


Data preprocessing complete! Scaling applied.


In [6]:
# Define models
models = {
    "Logistic Regression": LogisticRegression(class_weight="balanced", max_iter=1000),
    "SVM": SVC(class_weight="balanced", probability=True),
    "Random Forest": RandomForestClassifier(n_estimators=100, class_weight="balanced", random_state=42)
}

best_accuracy = 0
best_model = None

# Train and evaluate models
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train_scaled, y_train)

    # Cross-validation
    cv = StratifiedKFold(n_splits=5)
    scores = cross_val_score(model, X_train_scaled, y_train, cv=cv, scoring="accuracy")
    
    y_pred = model.predict(X_test_scaled)

    # Evaluate Performance
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=1)
    recall = recall_score(y_test, y_pred, zero_division=1)
    f1 = f1_score(y_test, y_pred, zero_division=1)

    print(f"Model: {name}")
    print(f"Cross-Validation Accuracy: {scores.mean():.4f}")
    print(f"Test Accuracy: {acc:.4f} | Precision: {prec:.4f} | Recall: {recall:.4f} | F1 Score: {f1:.4f}")
    print("-" * 50)

    if acc > best_accuracy:
        best_accuracy = acc
        best_model = model

# Save Best Model
with open("best_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

print("Best Model Saved: best_model.pkl")


Training Logistic Regression...
Model: Logistic Regression
Cross-Validation Accuracy: 0.9819
Test Accuracy: 0.9906 | Precision: 0.9883 | Recall: 0.9922 | F1 Score: 0.9903
--------------------------------------------------
Training SVM...
Model: SVM
Cross-Validation Accuracy: 0.9632
Test Accuracy: 0.9888 | Precision: 0.9808 | Recall: 0.9961 | F1 Score: 0.9884
--------------------------------------------------
Training Random Forest...
Model: Random Forest
Cross-Validation Accuracy: 0.9750
Test Accuracy: 1.0000 | Precision: 1.0000 | Recall: 1.0000 | F1 Score: 1.0000
--------------------------------------------------
Best Model Saved: best_model.pkl


In [7]:
# Load Best Model & Scaler
with open("best_model.pkl", "rb") as f:
    model = pickle.load(f)
with open("scaler.pkl", "rb") as f:
    scaler = pickle.load(f)

# Transform Test Data
X_test_scaled = scaler.transform(X_test)

# Make Predictions with Probability Threshold (Improve Recall)
y_pred_prob = model.predict_proba(X_test_scaled)[:, 1]  # Get probability scores
y_pred = (y_pred_prob > 0.35).astype(int)  # Adjust threshold from 0.5 to 0.4

# Evaluate Performance
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=1)
recall = recall_score(y_test, y_pred, zero_division=1)
f1 = f1_score(y_test, y_pred, zero_division=1)
cm = confusion_matrix(y_test, y_pred)

print("✅ Model Evaluation on Test Data:")
print(f" Accuracy: {acc:.4f}")
print(f" Precision: {prec:.4f}")
print(f" Recall: {recall:.4f}")
print(f" F1 Score: {f1:.4f}")
print(f"Confusion Matrix:\n{cm}")

# Save Evaluation Results
eval_results = pd.DataFrame([[acc, prec, recall, f1, cm]], 
                            columns=["Accuracy", "Precision", "Recall", "F1 Score", "Confusion Matrix"])
eval_results.to_csv("evaluation.csv", index=False)

print("Evaluation Results Saved: evaluation.csv")


✅ Model Evaluation on Test Data:
 Accuracy: 0.9994
 Precision: 0.9987
 Recall: 1.0000
 F1 Score: 0.9993
Confusion Matrix:
[[833   1]
 [  0 768]]
Evaluation Results Saved: evaluation.csv


#### 1.2. Train model using Scikit-learn

In [8]:
algorithms =[("LR", LogisticRegression()),
         ("SVC", SVC(probability=True)),
         ('KNN',KNeighborsClassifier()),
         ("DTC", DecisionTreeClassifier()),
         ("SGDC", CalibratedClassifierCV(SGDClassifier())),
         ("NB", GaussianNB()),
         ('RF', RandomForestClassifier()),]

models = {}
final_results = []

for name, model in algorithms:
    trained_model = model.fit(X_train, y_train)
    models[name] = trained_model

    # Evaluate model
    model_results = model.predict(X_test)

    p_score = precision_score(y_test, model_results, average=None, labels=[0, 1])
    a_score = accuracy_score(y_test, model_results)
    r_score = recall_score(y_test, model_results, average=None, labels=[0, 1])
    f1_score_result = f1_score(y_test, model_results, average=None, labels=[0, 1])
    cm = confusion_matrix(y_test, model_results, labels=[0, 1])
    final_results.append(( name,  round_up_metric_results(p_score), a_score, round_up_metric_results(r_score), round_up_metric_results(f1_score_result), cm))

# Sort results by F1 score
final_results.sort(key=lambda k: sum(k[4]), reverse=True)
pd.DataFrame(final_results, columns=["Model", "Precision Score", "Accuracy score", "Recall Score", "F1 score", "Confusion Matrix"])

Unnamed: 0,Model,Precision Score,Accuracy score,Recall Score,F1 score,Confusion Matrix
0,DTC,"[1.0, 1.0]",1.0,"[1.0, 1.0]","[1.0, 1.0]","[[834, 0], [0, 768]]"
1,RF,"[1.0, 1.0]",1.0,"[1.0, 1.0]","[1.0, 1.0]","[[834, 0], [0, 768]]"
2,KNN,"[0.985, 0.99]",0.986891,"[0.99, 0.983]","[0.987, 0.986]","[[826, 8], [13, 755]]"
3,SGDC,"[0.977, 0.957]",0.966916,"[0.959, 0.975]","[0.968, 0.966]","[[800, 34], [19, 749]]"
4,SVC,"[0.935, 0.977]",0.953808,"[0.98, 0.926]","[0.957, 0.951]","[[817, 17], [57, 711]]"
5,LR,"[0.924, 0.929]",0.926342,"[0.935, 0.917]","[0.93, 0.923]","[[780, 54], [64, 704]]"
6,NB,"[0.883, 0.658]",0.73221,"[0.56, 0.919]","[0.685, 0.767]","[[467, 367], [62, 706]]"


#### 1.3. Dump models pickle

In [9]:
with open("./model/all_sklearn.pkl", "wb") as f:
    pickle.dump(models, f)

In [10]:
with open("./model/input_scaler.pkl", "wb") as f:
    pickle.dump(sc, f)