In [None]:
!pip install pycaret[full] cupy-cuda11x
!pip install rapidsai-cuml-cu11 --extra-index-url=https://pypi.nvidia.com

Collecting cupy-cuda11x
  Downloading cupy_cuda11x-13.4.0-cp311-cp311-manylinux2014_x86_64.whl.metadata (2.7 kB)
Collecting pycaret[full]
  Downloading pycaret-3.3.2-py3-none-any.whl.metadata (17 kB)
Collecting pandas<2.2.0 (from pycaret[full])
  Downloading pandas-2.1.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting scipy<=1.11.4,>=1.6.1 (from pycaret[full])
  Downloading scipy-1.11.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib<1.4,>=1.2.0 (from pycaret[full])
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting pyod>=1.1.3 (from pycaret[full])
  Downloading pyod-2.0.3.tar.gz (169 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m169.6/169.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
[31mERROR: Could not find a version that satisfies the requirement rapidsai-cuml-cu11 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for rapidsai-cuml-cu11[0m[31m
[0m

In [None]:
from google.colab import userdata
import json
from pathlib import Path

# Read the Kaggle credentials from Colab's userdata store.
username = userdata.get("KAGGLE_USER")
key = userdata.get("KAGGLE_KEY")

# Write kaggle.json from Python instead of `!echo`: json.dump escapes quotes and
# backslashes correctly, and the secret is never placed on a shell command line.
kaggle_dir = Path.home() / ".kaggle"
kaggle_dir.mkdir(parents=True, exist_ok=True)
kaggle_json = kaggle_dir / "kaggle.json"
with kaggle_json.open("w") as fh:
    json.dump({"username": username, "key": key}, fh)

# Owner read/write only (equivalent to `chmod 600`).
kaggle_json.chmod(0o600)

In [None]:
! kaggle competitions download -c superai5-pain-syndrome-classification
! unzip /content/superai5-pain-syndrome-classification.zip -d pain-syndrome

Downloading superai5-pain-syndrome-classification.zip to /content
  0% 0.00/2.42M [00:00<?, ?B/s]
100% 2.42M/2.42M [00:00<00:00, 111MB/s]
Archive:  /content/superai5-pain-syndrome-classification.zip
  inflating: pain-syndrome/X_test.pkl  
  inflating: pain-syndrome/X_train.pkl  
  inflating: pain-syndrome/sample_submission.csv  
  inflating: pain-syndrome/submission.csv  
  inflating: pain-syndrome/y_train.csv  


## Data Prep

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the labels with person_id as the index so they can be looked up per subject.
y_train = pd.read_csv(
    "/content/pain-syndrome/y_train.csv",
    index_col='person_id',
)
y_train.head()

Unnamed: 0_level_0,pain_version
person_id,Unnamed: 1_level_1
S025,2
S089,0
S029,1
S048,1
S036,1


In [None]:
# NOTE(review): unpickling can execute arbitrary code; acceptable here only
# because the file comes from the competition download above.
X_train = pd.read_pickle("/content/pain-syndrome/X_train.pkl")
# Explode every column except the first in lockstep: each list-like cell becomes
# one row per element, and the person_id index is repeated for those rows.
X_train = X_train.explode(column=X_train.columns[1:].tolist())
# Seeded so the preview shows the same rows on every run.
X_train.sample(10, random_state=42)

Unnamed: 0_level_0,pain_intensity,Delta,Theta,Alpha1,Alpha2,Beta1,Beta2,Gamma1,Gamma2,Attention,Meditation
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
S070,1.0,80070.0,212231.0,24517.0,38322.0,7959.0,11625.0,12620.0,9582.0,35.0,64.0
S048,2.0,107186.0,42368.0,6824.0,4998.0,7073.0,7830.0,11694.0,3345.0,61.0,43.0
S048,2.0,153051.0,69656.0,3223.0,9326.0,36297.0,11788.0,8948.0,4394.0,84.0,37.0
S075,5.0,1418374.0,70993.0,113889.0,181539.0,69753.0,21479.0,7893.0,6446.0,38.0,87.0
S048,2.0,309406.0,103843.0,6966.0,54422.0,11769.0,19714.0,9798.0,7271.0,30.0,57.0
S013,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
S074,3.0,101126.0,92013.0,21894.0,25379.0,15500.0,14065.0,5759.0,5976.0,29.0,64.0
S029,3.0,139716.0,89729.0,21886.0,5457.0,21822.0,17700.0,18644.0,10578.0,81.0,56.0
S064,3.0,50405.0,46137.0,18282.0,22013.0,8043.0,7626.0,8949.0,2627.0,40.0,51.0
S097,3.0,55566.0,12248.0,13915.0,3048.0,3853.0,1929.0,1347.0,1675.0,34.0,78.0


## Preprocessing

In [None]:
# pain_intensity is used as a categorical feature, not a numeric one.
X_train['pain_intensity'] = X_train['pain_intensity'].astype('category')

# explode() leaves the remaining columns with object dtype; cast them to float.
signal_cols = X_train.select_dtypes(include='object').columns
X_train = X_train.astype(dict.fromkeys(signal_cols, 'float'))

# Move person_id from the index into a column so rows get a unique index and
# can be grouped per subject.
X_train = X_train.reset_index()
person = X_train['person_id']

# Zeros in the signal columns are treated as missing readings (presumably sensor
# dropouts - TODO confirm) and imputed from neighbouring readings.
#   * Only the signal columns are touched, so pain_intensity is never rewritten.
#   * Filling is done per person, so one subject's readings never leak into the
#     start of the next subject's recording.
#   * A back-fill covers recordings that begin with missing readings.
signals = X_train[signal_cols].replace(0, np.nan)
signals = signals.groupby(person, sort=False).ffill()
signals = signals.groupby(person, sort=False).bfill()
X_train[signal_cols] = signals

# Attach each subject's label to all of their rows, then drop the identifier so
# it is not used as a feature.
X_train['pain_version'] = person.map(y_train['pain_version'])
X_train = X_train.drop(columns='person_id')
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87186 entries, 0 to 87185
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   pain_intensity  87186 non-null  category
 1   Delta           87186 non-null  float64 
 2   Theta           87186 non-null  float64 
 3   Alpha1          87186 non-null  float64 
 4   Alpha2          87186 non-null  float64 
 5   Beta1           87186 non-null  float64 
 6   Beta2           87186 non-null  float64 
 7   Gamma1          87186 non-null  float64 
 8   Gamma2          87186 non-null  float64 
 9   Attention       87186 non-null  float64 
 10  Meditation      87186 non-null  float64 
 11  pain_version    87186 non-null  int64   
dtypes: category(1), float64(10), int64(1)
memory usage: 7.4 MB


In [None]:
# Number of rows per pain_version class, as a one-column table named 'N'.
label_count = pd.crosstab(
    index=X_train['pain_version'],
    columns='N',
)
label_count

col_0,N
pain_version,Unnamed: 1_level_1
0,35667
1,29062
2,7926
3,14531


In [None]:
# Balance the classes by randomly down-sampling every pain_version class to the
# size of the rarest one. Grouping by the label avoids hard-coding which class
# is the minority, and the fixed seed makes the selection reproducible.
downsample = int(label_count['N'].min())

df_prep = (
    X_train
    .groupby('pain_version', group_keys=False)
    .sample(n=downsample, random_state=42)
)
df_prep['pain_version'].value_counts()

Unnamed: 0_level_0,count
pain_version,Unnamed: 1_level_1
2,7926
0,7926
1,7926
3,7926


## AutoML

In [None]:
import torch
# Import the PyCaret functions this notebook calls explicitly instead of
# `import *`, so it is clear where each name comes from.
from pycaret.classification import (
    compare_models,
    create_model,
    evaluate_model,
    finalize_model,
    predict_model,
    setup,
    tune_model,
)

# NOTE(review): df_prep has one row per reading and no person identifier, so the
# random 80/20 split (and the CV folds) can place rows of the same person on both
# sides. The reported scores may therefore be optimistic - consider a
# group-wise split. TODO confirm.
classification_setup = setup(
    data=df_prep,
    target='pain_version',
    train_size=0.8,
    session_id=42,
    # Use GPU-enabled estimators only when a CUDA device is present.
    use_gpu=torch.cuda.is_available(),
)

# Cross-validate the tree-based candidates and keep the most accurate one.
best_model = compare_models(
    sort='Accuracy',
    include=['dt', 'rf', 'et', 'gbc', 'xgboost', 'lightgbm', 'catboost'],
)
# Re-create the winner to display its per-fold metrics.
best_model = create_model(best_model)

[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bi

Unnamed: 0,Description,Value
0,Session id,42
1,Target,pain_version
2,Target type,Multiclass
3,Original data shape,"(31704, 12)"
4,Transformed data shape,"(31704, 16)"
5,Transformed train set shape,"(25363, 16)"
6,Transformed test set shape,"(6341, 16)"
7,Numeric features,10
8,Categorical features,1
9,Preprocess,True


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.7343,0.9125,0.7343,0.7256,0.7289,0.6457,0.6463,2.018
catboost,CatBoost Classifier,0.7341,0.9142,0.7341,0.7247,0.7281,0.6454,0.6462,7.491
rf,Random Forest Classifier,0.7337,0.9141,0.7337,0.7248,0.7282,0.6449,0.6455,4.912
lightgbm,Light Gradient Boosting Machine,0.7309,0.9142,0.7309,0.7213,0.7249,0.6412,0.6419,1.131
xgboost,Extreme Gradient Boosting,0.7299,0.9125,0.7299,0.7208,0.7241,0.6399,0.6407,0.909
gbc,Gradient Boosting Classifier,0.7287,0.0,0.7287,0.7215,0.7243,0.6382,0.6386,33.691
dt,Decision Tree Classifier,0.6605,0.7743,0.6605,0.6624,0.6613,0.5473,0.5474,0.603


Processing:   0%|          | 0/33 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7414,0.9185,0.7414,0.7304,0.7346,0.6552,0.656
1,0.7363,0.9092,0.7363,0.7258,0.7298,0.6484,0.6491
2,0.7473,0.9155,0.7473,0.7385,0.742,0.6631,0.6637
3,0.7256,0.9131,0.7256,0.7182,0.7209,0.6341,0.6346
4,0.7311,0.9133,0.7311,0.7237,0.7269,0.6414,0.6417
5,0.7228,0.9078,0.7228,0.7172,0.719,0.6304,0.6309
6,0.735,0.9141,0.735,0.7264,0.7298,0.6467,0.6473
7,0.7342,0.9111,0.7342,0.7241,0.7279,0.6456,0.6464
8,0.7287,0.9102,0.7287,0.7191,0.7223,0.6383,0.6392
9,0.7401,0.9122,0.7401,0.733,0.7361,0.6535,0.6538


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Hyper-parameter search with Optuna, optimising F1.
tuned_model = tune_model(
    best_model,
    search_library='optuna',
    optimize='F1',
)

# Finalize the selected model, then open PyCaret's interactive evaluation widget.
final_model = finalize_model(tuned_model)
evaluate_model(final_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7335,0.9153,0.7335,0.7231,0.7271,0.6447,0.6455
1,0.7355,0.9095,0.7355,0.7255,0.7293,0.6474,0.648
2,0.7304,0.9128,0.7304,0.7208,0.7244,0.6405,0.6413
3,0.7212,0.9124,0.7212,0.7113,0.7154,0.6283,0.6288
4,0.7161,0.9124,0.7161,0.7069,0.7106,0.6215,0.622
5,0.709,0.9074,0.709,0.7018,0.7044,0.612,0.6125
6,0.7259,0.9111,0.7259,0.7155,0.7196,0.6346,0.6353
7,0.7181,0.9087,0.7181,0.7071,0.7116,0.6241,0.6247
8,0.7232,0.907,0.7232,0.7127,0.717,0.6309,0.6315
9,0.733,0.9094,0.733,0.7252,0.7284,0.6441,0.6445


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

INFO:optuna_integration.sklearn.sklearn:Searching the best hyperparameters using 25363 samples...
INFO:optuna_integration.sklearn.sklearn:Finished hyperparameter search!


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

## Prediction

In [None]:
# NOTE(review): unpickling can execute arbitrary code; acceptable here only
# because the file comes from the competition download above.
X_test = pd.read_pickle("/content/pain-syndrome/X_test.pkl")
X_test = X_test.explode(column=X_test.columns[1:].tolist())

# Clean the test features the same way as the training features; otherwise the
# model is scored on raw zeros and object-typed columns it never saw in training.
X_test['pain_intensity'] = X_test['pain_intensity'].astype('category')
test_signal_cols = X_test.select_dtypes(include='object').columns
X_test = X_test.astype(dict.fromkeys(test_signal_cols, 'float'))

# Zeros in the signal columns are treated as missing and filled per person
# (the index), forward first and then backward for leading gaps.
test_signals = X_test[test_signal_cols].replace(0, np.nan)
test_signals = test_signals.groupby(level=0, sort=False).ffill()
test_signals = test_signals.groupby(level=0, sort=False).bfill()
# Assign by position: the person_id index is repeated, so label alignment is avoided.
X_test[test_signal_cols] = test_signals.to_numpy()

# Subjects in order of first appearance; used later to pair predictions with ids.
order_index = X_test.index.unique().tolist()
X_test.head()

Unnamed: 0_level_0,pain_intensity,Delta,Theta,Alpha1,Alpha2,Beta1,Beta2,Gamma1,Gamma2,Attention,Meditation
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
S021,2.0,9853.0,34478.0,4849.0,4705.0,8270.0,7846.0,4019.0,2831.0,48.700001,59.900002
S021,2.0,7177.0,13943.0,3988.0,9496.0,5530.0,11398.0,3555.0,2754.0,48.700001,59.900002
S021,2.0,10029.0,11892.0,2827.0,5288.0,8392.0,8381.0,4692.0,2556.0,48.700001,59.900002
S021,2.0,1926.0,18883.0,6338.0,8712.0,3870.0,4947.0,4444.0,2167.0,30.0,51.0
S021,2.0,77616.0,73244.0,24021.0,7529.0,10345.0,17467.0,8360.0,2746.0,27.0,54.0


In [None]:
from statistics import mode

# Score every row with the finalized model (the one built by finalize_model),
# not the intermediate best_model.
test_pred = predict_model(final_model, X_test.reset_index(drop=True))['prediction_label']

# Majority vote per person. Grouping by the person_id of each row replaces the
# hard-coded "1321 rows per person" stride, which silently misaligned or dropped
# rows whenever a recording had a different length. sort=False keeps the groups
# in order of first appearance, i.e. the same order as X_test.index.unique().
person_ids = X_test.index.to_numpy()
y_pred = (
    test_pred
    .groupby(person_ids, sort=False)
    .agg(lambda labels: mode(labels.tolist()))
    .tolist()
)

y_pred

[0, 3, 0, 1, 0, 3, 0, 3, 3, 1, 2, 2, 0, 1, 2, 1, 0]

In [None]:
# zip() stops at the shorter input, so a length mismatch would silently leave
# some persons without a prediction; fail loudly instead.
if len(order_index) != len(y_pred):
    raise ValueError(
        f"{len(order_index)} person ids but {len(y_pred)} predictions"
    )

# person_id -> predicted pain_version
submit = dict(zip(order_index, y_pred))
submit

{'S021': 0,
 'S093': 3,
 'S066': 0,
 'S056': 1,
 'S032': 0,
 'S046': 3,
 'S027': 0,
 'S031': 3,
 'S012': 3,
 'S037': 1,
 'S034': 2,
 'S091': 2,
 'S079': 0,
 'S042': 1,
 'S026': 2,
 'S099': 1,
 'S088': 0}

## Submission

In [None]:
# Load the submission template and fill pain_version by looking up each
# person_id in the prediction mapping.
submission = (
    pd.read_csv('/content/pain-syndrome/submission.csv')
    .assign(pain_version=lambda frame: frame['person_id'].map(submit))
)
submission

Unnamed: 0,person_id,pain_version
0,S021,0
1,S093,3
2,S066,0
3,S056,1
4,S032,0
5,S046,3
6,S027,0
7,S031,3
8,S012,3
9,S037,1


In [None]:
# Write the submission file without the positional index column.
submission.to_csv(
    path_or_buf='pycaret_2.csv',
    index=False,
)