In [1]:
# @title Setup

competition = "Human Activity Recognition"  # @param
# @markdown ---

from google.colab import userdata
import json

# Get the Kaggle credentials from Colab's userdata
username = userdata.get("KAGGLE_USER")
key = userdata.get("KAGGLE_KEY")

# Echo the credentials into the kaggle.json file
!mkdir -p ~/.kaggle
!echo '{{"username":"{username}","key":"{key}"}}' > ~/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json

competition_id = "super-ai-engineer-5-human-activity-recognition"
!kaggle competitions download -c {competition_id}
!unzip /content/{competition_id}.zip

Archive:  /content/super-ai-engineer-5-human-activity-recognition.zip
  inflating: HAR/test/000ba64d-bc61-4e0f-b1f1-87cf4a8d97b3.csv  
  inflating: HAR/test/00830c34-f50e-4213-87dd-84235401ea61.csv  
  inflating: HAR/test/00944d21-19f4-4783-a7c4-69a9d063c8dd.csv  
  inflating: HAR/test/00ae23cd-1c86-465d-b108-e82b8267906e.csv  
  inflating: HAR/test/018b1306-c9b3-4a12-b393-ace3ec021e77.csv  
  inflating: HAR/test/026eff1c-8dc9-4907-8cc6-32fcaa28e329.csv  
  inflating: HAR/test/029b25d3-e91d-4e3f-91fb-70e2b29025da.csv  
  inflating: HAR/test/02cf5d17-63a1-4438-ae29-5c56e5bc3b86.csv  
  inflating: HAR/test/0301a2a7-34d3-4d76-8b4a-942eef98fade.csv  
  inflating: HAR/test/0328e96c-bb7e-42b4-8951-d5205c981dcc.csv  
  inflating: HAR/test/0363973d-c3f1-4fb4-bb31-5ac5d7bbcdfd.csv  
  inflating: HAR/test/041b7c07-a882-44d3-ab0a-e57ec40953c9.csv  
  inflating: HAR/test/04635ce0-0337-4cae-868f-e791994e05a5.csv  
  inflating: HAR/test/04653d7a-867f-4668-8e99-88a5e3c793e8.csv  
  inflating: HAR/tes

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from tqdm import tqdm

In [2]:
def prep_dataframe(directory):
    """Load every file under ``directory/<label>/`` into one labelled DataFrame.

    Each immediate sub-directory of ``directory`` is treated as one class; its
    name is written to a ``class`` column on every row read from the files
    inside it.

    Parameters
    ----------
    directory : str
        Folder containing one sub-directory per class label.

    Returns
    -------
    pandas.DataFrame or None
        The rows of all files stacked together, or ``None`` when no file was
        read. Labels and file names are visited in sorted order and the result
        lists the last-read file first. Each file keeps its own row index, so
        callers normally follow up with ``reset_index(drop=True)``.
    """
    frames = []
    for label in tqdm(sorted(os.listdir(directory))):
        label_dir = os.path.join(directory, label)
        if not os.path.isdir(label_dir):
            # Stray files sitting next to the class folders are not labels.
            continue

        for fname in sorted(os.listdir(label_dir)):
            df = pd.read_csv(os.path.join(label_dir, fname))
            df['class'] = label
            frames.append(df)

    if not frames:
        return None

    # Concatenate once instead of once per file: calling pd.concat inside the
    # loop re-copies everything accumulated so far on every iteration.
    # Reversing keeps the row order the earlier prepend-style loop produced.
    return pd.concat(frames[::-1])

# Build the training table and renumber its rows 0..n-1 in one step; the frame
# returned by prep_dataframe still carries each source file's own row index.
train_df = prep_dataframe('/content/HAR/train').reset_index(drop=True)
print(train_df.shape)
train_df.head()

100%|██████████| 9/9 [00:02<00:00,  3.26it/s]

(650644, 7)





Unnamed: 0,Acc_X,Acc_Y,Acc_Z,Gyr_X,Gyr_Y,Gyr_Z,class
0,1.882227,1.043709,-1.654054,0.062253,-0.486484,0.024318,C09
1,1.93033,1.046023,-1.638919,0.070935,-0.443838,0.02598,C09
2,1.965982,1.008996,-1.638919,0.079064,-0.420371,0.026708,C09
3,2.004465,0.995111,-1.636757,0.087191,-0.408412,0.025773,C09
4,2.025969,0.955769,-1.667027,0.093842,-0.400063,0.023902,C09


In [27]:
# Count the training rows in each class; the single count column is labelled 'N'.
pd.crosstab(train_df['class'], 'N')

col_0,N
class,Unnamed: 1_level_1
C01,92900
C02,80200
C03,59200
C04,95100
C05,58932
C06,85200
C07,70200
C08,68300
C09,40612


In [28]:
# Cap every class at 60000 rows: classes above the cap are randomly
# down-sampled (fixed seed 42), classes at or below it are kept in full.
class_parts = []
for label in train_df['class'].unique():
    filtered_df = train_df.loc[train_df['class'] == label]
    if len(filtered_df) > 60000:
        filtered_df = filtered_df.sample(n=60000, random_state=42)
    class_parts.append(filtered_df)

# Stack the parts with the last-visited class first, then renumber rows from 0.
df_prep = pd.concat(class_parts[::-1]).reset_index(drop=True)
df_prep.head()

Unnamed: 0,Acc_X,Acc_Y,Acc_Z,Gyr_X,Gyr_Y,Gyr_Z,class
0,7.55921,0.108539,2.182781,0.0,-0.003246,-0.001121,C01
1,8.169146,0.441556,0.176094,-0.000119,0.005903,0.0,C01
2,7.551211,0.122106,2.160441,0.000119,0.010034,-0.000897,C01
3,6.153358,0.270114,-3.021201,-0.000237,-0.003246,0.0,C01
4,7.539212,0.12704,2.214321,-0.000474,-0.006493,-0.000897,C01


## Model Selection

In [70]:
!pip install pycaret[full]

Collecting pycaret[full]
  Downloading pycaret-3.3.2-py3-none-any.whl.metadata (17 kB)
Collecting numpy<1.27,>=1.21 (from pycaret[full])
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas<2.2.0 (from pycaret[full])
  Downloading pandas-2.1.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting scipy<=1.11.4,>=1.6.1 (from pycaret[full])
  Downloading scipy-1.11.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib<1.4,>=1.2.0 (from pycaret[full])
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting pyod>=1.1.3 (from pycaret[full])
  Downloading pyod-2.0.3.tar.gz (169 kB)
[2K 

In [30]:
import torch
# Explicit names instead of `import *` so it is clear where each function comes
# from. evaluate_model and predict_model are imported here because later cells
# call them.
from pycaret.classification import (
    compare_models,
    create_model,
    evaluate_model,
    predict_model,
    setup,
)

# Configure the experiment on the balanced table: 'class' is the target, 85% of
# the rows go to training, and the GPU is used only when CUDA is available.
clf_setup = setup(df_prep, target='class', session_id=42,
                  train_size=0.85, use_gpu=torch.cuda.is_available())

# Compare the four candidate model families and keep the most accurate one.
best_model = compare_models(sort='Accuracy', include=['knn', 'dt', 'lightgbm', 'xgboost'])

# Fit the winning estimator again through create_model, which also displays its
# per-fold score table.
best_model = create_model(best_model)

[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bi

Unnamed: 0,Description,Value
0,Session id,42
1,Target,class
2,Target type,Multiclass
3,Target mapping,"C01: 0, C02: 1, C03: 2, C04: 3, C05: 4, C06: 5, C07: 6, C08: 7, C09: 8"
4,Original data shape,"(518744, 7)"
5,Transformed data shape,"(518744, 7)"
6,Transformed train set shape,"(440932, 7)"
7,Transformed test set shape,"(77812, 7)"
8,Numeric features,6
9,Preprocess,True


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.9831,0.9998,0.9831,0.9832,0.9831,0.981,0.981,4.75
lightgbm,Light Gradient Boosting Machine,0.9816,0.9997,0.9816,0.9816,0.9816,0.9793,0.9793,27.704
dt,Decision Tree Classifier,0.9643,0.9799,0.9643,0.9644,0.9643,0.9598,0.9598,6.428
knn,K Neighbors Classifier,0.9562,0.9914,0.9562,0.9564,0.9562,0.9506,0.9506,2.49


Processing:   0%|          | 0/21 [00:00<?, ?it/s]

[2025-03-22 10:30:16.152] [CUML] [info] Unused keyword parameter: n_jobs during cuML estimator initialization


  .applymap(highlight_cols, subset=["TT (Sec)"])


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9833,0.9998,0.9833,0.9833,0.9833,0.9811,0.9811
1,0.9836,0.9998,0.9836,0.9836,0.9836,0.9815,0.9815
2,0.9823,0.9998,0.9823,0.9823,0.9823,0.9801,0.9801
3,0.9838,0.9998,0.9838,0.9838,0.9838,0.9818,0.9818
4,0.9829,0.9997,0.9829,0.983,0.9829,0.9808,0.9808
5,0.9824,0.9997,0.9824,0.9824,0.9824,0.9802,0.9802
6,0.9837,0.9998,0.9837,0.9837,0.9837,0.9816,0.9816
7,0.9838,0.9998,0.9838,0.9838,0.9838,0.9818,0.9818
8,0.9831,0.9998,0.9831,0.9831,0.9831,0.981,0.981
9,0.9824,0.9997,0.9824,0.9825,0.9824,0.9802,0.9802


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [31]:
# Show PyCaret's interactive evaluation widget (plot-type selector) for the selected model.
evaluate_model(best_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

## Test set

In [8]:
test_dir = '/content/HAR/test/'

# Read every test file once, tagging its rows with the file name so the
# per-row predictions can later be grouped back into one answer per file.
test_frames = []
for file in sorted(os.listdir(test_dir)):
    filepath = os.path.join(test_dir, file)
    df = pd.read_csv(filepath)
    df['file_id'] = file
    test_frames.append(df)

# One concat at the end instead of one per file: concatenating inside the loop
# re-copies all previously loaded rows on every iteration. ignore_index=True
# renumbers the rows 0..n-1, replacing the separate reset_index step.
test_df = pd.concat(test_frames, ignore_index=True)
test_df.head()

Unnamed: 0,Acc_X,Acc_Y,Acc_Z,Gyr_X,Gyr_Y,Gyr_Z,file_id
0,4.91819,0.923496,-0.093004,0.193295,-0.031665,-0.031503,000ba64d-bc61-4e0f-b1f1-87cf4a8d97b3.csv
1,4.865548,0.98957,-0.091665,0.20391,-0.032893,-0.035193,000ba64d-bc61-4e0f-b1f1-87cf4a8d97b3.csv
2,4.819174,1.041814,-0.09329,0.215411,-0.035249,-0.040684,000ba64d-bc61-4e0f-b1f1-87cf4a8d97b3.csv
3,4.794107,1.095595,-0.094055,0.218507,-0.038367,-0.046624,000ba64d-bc61-4e0f-b1f1-87cf4a8d97b3.csv
4,4.781573,1.135547,-0.094342,0.215411,-0.042662,-0.051845,000ba64d-bc61-4e0f-b1f1-87cf4a8d97b3.csv


In [32]:
# Score only the feature columns. 'file_id' is bookkeeping, and 'pred' exists
# once this cell has run, so both are dropped (ignoring whichever is absent);
# this keeps the cell re-runnable without feeding old predictions to the model.
test_features = test_df.drop(columns=['file_id', 'pred'], errors='ignore')
holdout_pred = predict_model(best_model, data=test_features)

# Attach the per-row predicted label; assignment aligns on the shared row index.
test_df['pred'] = holdout_pred['prediction_label']
test_df.head()

Unnamed: 0,Acc_X,Acc_Y,Acc_Z,Gyr_X,Gyr_Y,Gyr_Z,file_id,pred
0,4.91819,0.923496,-0.093004,0.193295,-0.031665,-0.031503,000ba64d-bc61-4e0f-b1f1-87cf4a8d97b3.csv,C07
1,4.865548,0.98957,-0.091665,0.20391,-0.032893,-0.035193,000ba64d-bc61-4e0f-b1f1-87cf4a8d97b3.csv,C07
2,4.819174,1.041814,-0.09329,0.215411,-0.035249,-0.040684,000ba64d-bc61-4e0f-b1f1-87cf4a8d97b3.csv,C07
3,4.794107,1.095595,-0.094055,0.218507,-0.038367,-0.046624,000ba64d-bc61-4e0f-b1f1-87cf4a8d97b3.csv,C07
4,4.781573,1.135547,-0.094342,0.215411,-0.042662,-0.051845,000ba64d-bc61-4e0f-b1f1-87cf4a8d97b3.csv,C07


In [33]:
# Collapse the per-row predictions into one label per file by majority vote:
# value_counts() lists labels by descending frequency, and idxmax() returns the
# label holding the highest count.
votes_by_file = test_df.groupby('file_id')['pred']
submit_df = votes_by_file.agg(lambda labels: labels.value_counts().idxmax())
submit_df.head()

Unnamed: 0_level_0,pred
file_id,Unnamed: 1_level_1
000ba64d-bc61-4e0f-b1f1-87cf4a8d97b3.csv,C07
00830c34-f50e-4213-87dd-84235401ea61.csv,C03
00944d21-19f4-4783-a7c4-69a9d063c8dd.csv,C07
00ae23cd-1c86-465d-b108-e82b8267906e.csv,C07
018b1306-c9b3-4a12-b393-ace3ec021e77.csv,C05


## Submission

In [34]:
# Load the sample submission as the template to fill in.
submission = pd.read_csv('/content/sample_submission.csv')

# Look up each file's voted label by id, for rows 3 onward only.
# NOTE(review): rows 0-2 keep whatever 'class' values sample_submission.csv
# ships with — confirm that skipping them is intended.
voted_labels = submission.loc[3:, 'id'].map(submit_df)
submission.loc[3:, 'class'] = voted_labels
submission.head()

Unnamed: 0,id,class
0,000ba64d-bc61-4e0f-b1f1-87cf4a8d97b3.csv,C07
1,00830c34-f50e-4213-87dd-84235401ea61.csv,C03
2,00944d21-19f4-4783-a7c4-69a9d063c8dd.csv,C07
3,00ae23cd-1c86-465d-b108-e82b8267906e.csv,C07
4,018b1306-c9b3-4a12-b393-ace3ec021e77.csv,C05


In [37]:
# Write the submission file without the DataFrame's row index.
# NOTE(review): the file name says "xgboost", but best_model is whatever
# compare_models ranked first — rename the file if a different model wins.
submission.to_csv('HAR_xgboost_balanced.csv', index=False)