# 데이터 로드

In [None]:
import pandas as pd

# Location of the raw CSV, named once so it is easy to repoint.
DATA_PATH = '/data/dataset.csv'

df = pd.read_csv(DATA_PATH)
# Preview the first rows to confirm the file parsed as expected.
display(df.head())

: 

# 데이터 분석

### Distribution of `gaze_lr` by `label`

In [None]:
# Count every `gaze_lr` value separately inside each `label` group.
gaze_lr_groups = df.groupby('label')['gaze_lr']
display(gaze_lr_groups.value_counts())

Unnamed: 0_level_0,Unnamed: 1_level_0,count
label,gaze_lr,Unnamed: 2_level_1
0,CENTER,1938
0,LEFT,201
0,RIGHT,15
1,CENTER,1400
1,LEFT,574
1,RIGHT,397


### Distribution of `gaze_ud` by `label`

In [None]:
# Count every `gaze_ud` value separately inside each `label` group.
gaze_ud_groups = df.groupby('label')['gaze_ud']
display(gaze_ud_groups.value_counts())

Unnamed: 0_level_0,Unnamed: 1_level_0,count
label,gaze_ud,Unnamed: 2_level_1
0,CENTER,1834
0,UP,287
0,DOWN,33
1,CENTER,1437
1,UP,803
1,DOWN,131


### Distribution of `final_lr` by `label`

In [None]:
# Count every `final_lr` value separately inside each `label` group.
final_lr_groups = df.groupby('label')['final_lr']
display(final_lr_groups.value_counts())

Unnamed: 0_level_0,Unnamed: 1_level_0,count
label,final_lr,Unnamed: 2_level_1
0,CENTER,1686
0,LEFT,406
0,RIGHT,62
1,CENTER,978
1,RIGHT,714
1,LEFT,679


### Distribution of `final_ud` by `label`

In [None]:
# Count every `final_ud` value separately inside each `label` group.
# NOTE(review): the counts printed for this cell are identical to the `gaze_ud`
# counts — check whether `final_ud` merely duplicates `gaze_ud` in this dataset.
final_ud_groups = df.groupby('label')['final_ud']
display(final_ud_groups.value_counts())

Unnamed: 0_level_0,Unnamed: 1_level_0,count
label,final_ud,Unnamed: 2_level_1
0,CENTER,1834
0,UP,287
0,DOWN,33
1,CENTER,1437
1,UP,803
1,DOWN,131


# 데이터 전처리

In [None]:
# Remove the `timestamp` column.
# NOTE: this rebinds `df`, so re-running the cell raises KeyError once the
# column is already gone; re-run the load cell first in that case.
df = df.drop('timestamp', axis=1)
display(df.head())

Unnamed: 0,pitch,yaw,roll,gaze_lr,gaze_ud,final_lr,final_ud,face_count,face_visible,label,video_id
0,-178.170501,-19.953031,-3.651417,CENTER,CENTER,CENTER,CENTER,1,1,0,normal_1
1,-178.707109,-19.018401,-3.676718,CENTER,UP,CENTER,UP,1,1,0,normal_1
2,-178.655547,-18.443058,-3.712478,CENTER,CENTER,CENTER,CENTER,1,1,0,normal_1
3,-178.790434,-19.79238,-3.44509,CENTER,CENTER,CENTER,CENTER,1,1,0,normal_1
4,-179.312939,-19.479203,-3.524443,CENTER,CENTER,CENTER,CENTER,1,1,0,normal_1


In [None]:
# Remove the `video_id` column (rebinds `df`; not safe to run twice in a row).
df = df.drop('video_id', axis=1)

In [None]:
# head() shows the first five rows by default.
df.head()

Unnamed: 0,pitch,yaw,roll,gaze_lr,gaze_ud,final_lr,final_ud,face_count,face_visible,label
0,-178.170501,-19.953031,-3.651417,CENTER,CENTER,CENTER,CENTER,1,1,0
1,-178.707109,-19.018401,-3.676718,CENTER,UP,CENTER,UP,1,1,0
2,-178.655547,-18.443058,-3.712478,CENTER,CENTER,CENTER,CENTER,1,1,0
3,-178.790434,-19.79238,-3.44509,CENTER,CENTER,CENTER,CENTER,1,1,0
4,-179.312939,-19.479203,-3.524443,CENTER,CENTER,CENTER,CENTER,1,1,0


# train/test 데이터 분리

In [None]:
# 1. 랜덤 고정
import os
import numpy as np
import random
import torch

def reset_seeds(seed=42):
    """Re-seed every random number generator used by this notebook.

    Seeds Python's `random`, NumPy's global generator, and PyTorch (CPU and
    CUDA), exports PYTHONHASHSEED, and asks cuDNN for deterministic kernels.

    Args:
        seed: Integer seed applied to all generators (default 42).
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    # setting it here presumably only affects child processes — confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,             # Python standard library RNG
        np.random.seed,          # NumPy global RNG
        torch.manual_seed,       # PyTorch CPU generator
        torch.cuda.manual_seed,  # PyTorch CUDA generator
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Ask cuDNN to use deterministic algorithms.
    torch.backends.cudnn.deterministic = True


In [None]:
reset_seeds()
# Class balance of the target: view `label` as a categorical and rename its
# categories in order (first -> "normal", second -> "cheat").
new_survived = pd.Categorical(df["label"]).rename_categories(["normal", "cheat"])

# Counts and relative frequencies per category.
new_survived.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
normal,2154,0.476022
cheat,2371,0.523978


In [None]:
from sklearn.model_selection import train_test_split

reset_seeds()

# 1. Separate the target from the features.
y = df['label']                  # target
X = df.drop(['label'], axis=1)   # features

# 2. Split train/test 8:2 (test_size=0.2), stratified on the label so both
#    splits keep roughly the same normal/cheat ratio as the full data.
#    random_state is passed explicitly (same value as reset_seeds' default) so
#    the split does not depend on whatever consumed the global NumPy RNG
#    between reset_seeds() and this call.
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
# NOTE(review): the rows look like consecutive frames of videos (`video_id` was
# dropped earlier). A row-wise random split can place frames of one video in
# both train and test and inflate the test score — consider a group-aware split.
X_tr.shape, X_te.shape, y_tr.shape, y_te.shape

((3620, 9), (905, 9), (3620,), (905,))

In [None]:
reset_seeds()
# Work on independent copies so later steps never modify X_tr / X_te.
train, test = X_tr.copy(), X_te.copy()

# 인코딩

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# List the feature columns present in the training split.
train.columns

Index(['pitch', 'yaw', 'roll', 'gaze_lr', 'gaze_ud', 'final_lr', 'final_ud',
       'face_count', 'face_visible'],
      dtype='object')

In [None]:
# Show dtypes and non-null counts; the object-typed columns are the ones to encode.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3620 entries, 3382 to 2445
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   pitch         3620 non-null   float64
 1   yaw           3620 non-null   float64
 2   roll          3620 non-null   float64
 3   gaze_lr       3620 non-null   object 
 4   gaze_ud       3620 non-null   object 
 5   final_lr      3620 non-null   object 
 6   final_ud      3620 non-null   object 
 7   face_count    3620 non-null   int64  
 8   face_visible  3620 non-null   int64  
dtypes: float64(3), int64(2), object(4)
memory usage: 282.8+ KB


In [None]:
# Categorical columns that get one-hot encoded.
enc_cols = ['gaze_lr', 'gaze_ud', "final_ud", "final_lr"]

# Every remaining column is passed through unchanged.
# Built by iterating `train.columns` instead of a set difference: a set has an
# arbitrary iteration order, which would make the feature layout (and thus the
# trained/exported model's expected column order) unstable between runs.
normal_cols = [col for col in train.columns if col not in enc_cols]
normal_cols

['face_count', 'roll', 'yaw', 'face_visible', 'pitch']

In [None]:
reset_seeds()
print(f'before: {train.shape} / {test.shape}')

# Learn the categories from the training split only; the test split is
# transformed with the already-fitted encoder.
enc = OneHotEncoder()
enc.fit(train[enc_cols])
onehot_names = enc.get_feature_names_out()

# train: pass-through columns + one-hot columns, aligned on a fresh 0..n-1 index
tmp_tr = pd.DataFrame(enc.transform(train[enc_cols]).toarray(), columns=onehot_names)
enc_tr = pd.concat([train[normal_cols].reset_index(drop=True), tmp_tr], axis=1)

# test: same layout, using the encoder fitted on train
tmp_te = pd.DataFrame(enc.transform(test[enc_cols]).toarray(), columns=onehot_names)
enc_te = pd.concat([test[normal_cols].reset_index(drop=True), tmp_te], axis=1)

print(f'after: {enc_tr.shape} / {enc_te.shape}')
enc_tr.head()

before: (3620, 9) / (905, 9)
after: (3620, 17) / (905, 17)


Unnamed: 0,face_count,roll,yaw,face_visible,pitch,gaze_lr_CENTER,gaze_lr_LEFT,gaze_lr_RIGHT,gaze_ud_CENTER,gaze_ud_DOWN,gaze_ud_UP,final_ud_CENTER,final_ud_DOWN,final_ud_UP,final_lr_CENTER,final_lr_LEFT,final_lr_RIGHT
0,1,-4.030438,25.083932,1,-175.966953,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,1,1.099981,-0.983234,1,-167.776439,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,1,1.249666,-18.289653,1,-170.645629,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,1,-167.292835,59.060206,1,-168.810316,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4,1,-8.647661,-36.970032,1,-167.904483,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


# Training

## RandomForestClassifier

In [None]:
# Total number of missing cells in the encoded train and test frames.
tuple(frame.isnull().sum().sum() for frame in (enc_tr, enc_te))

(np.int64(0), np.int64(0))

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest baseline; hyper-parameters are gathered first so they are easy to scan.
rf_params = {
    "n_estimators": 300,
    "max_depth": 16,
    "min_samples_split": 4,
    "random_state": 42,
    "n_jobs": -1,
}
clf = RandomForestClassifier(**rf_params)

# `forest` is bound to what fit() returns; later cells score and predict with it.
forest = clf.fit(enc_tr, y_tr)

In [None]:
# Score the forest on the training split and on the held-out split.
train_score = forest.score(enc_tr, y_tr)
test_score = forest.score(enc_te, y_te)
print(f'훈련용 평가지표: {train_score} / 테스트용 평가지표: {test_score}')

훈련용 평가지표: 0.9820441988950276 / 테스트용 평가지표: 0.8795580110497238


### 분석

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Predict the held-out split, then show the confusion matrix and per-class report.
y_pred = forest.predict(enc_te)

rf_confusion = confusion_matrix(y_te, y_pred)
rf_report = classification_report(y_te, y_pred)
print(rf_confusion)
print(rf_report)

[[390  41]
 [ 68 406]]
              precision    recall  f1-score   support

           0       0.85      0.90      0.88       431
           1       0.91      0.86      0.88       474

    accuracy                           0.88       905
   macro avg       0.88      0.88      0.88       905
weighted avg       0.88      0.88      0.88       905



## HistGradientBoostingClassifier

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier

# Histogram-based gradient boosting; max_depth is kept moderate on purpose.
gb = HistGradientBoostingClassifier(max_depth=5, learning_rate=0.1, max_iter=300, random_state=42)
gb.fit(enc_tr, y_tr)

# Score on the training split and on the held-out split.
gb_train_score = gb.score(enc_tr, y_tr)
gb_test_score = gb.score(enc_te, y_te)
print("GB train:", gb_train_score)
print("GB test :", gb_test_score)

GB train: 0.9715469613259669
GB test : 0.8861878453038674


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Predict the held-out split with the boosting model, then show the confusion
# matrix and per-class report.
y_pred = gb.predict(enc_te)

gb_confusion = confusion_matrix(y_te, y_pred)
gb_report = classification_report(y_te, y_pred)
print(gb_confusion)
print(gb_report)

[[393  38]
 [ 65 409]]
              precision    recall  f1-score   support

           0       0.86      0.91      0.88       431
           1       0.91      0.86      0.89       474

    accuracy                           0.89       905
   macro avg       0.89      0.89      0.89       905
weighted avg       0.89      0.89      0.89       905



# export

In [None]:
import joblib
# Persist the fitted gradient-boosting model.
# NOTE(review): `gb` was trained on the one-hot encoded frame `enc_tr`, but the
# fitted encoder `enc` is not saved here — confirm how inference rebuilds the
# same columns in the same order.
MODEL_PATH = "anti_cheat_hgb.pkl"
joblib.dump(gb, MODEL_PATH)

['anti_cheat_hgb.pkl']