In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/zzzs-lightweight-training-dataset-target/Zzzs_train.parquet
/kaggle/input/zzzs-lightweight-training-dataset-target/Zzzs_train_multi.parquet
/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet
/kaggle/input/child-mind-institute-detect-sleep-states/sample_submission.csv
/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv
/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet


In [2]:
import torch 
torch.cuda.is_available()

True

In [3]:
import gc
import warnings
warnings.filterwarnings("ignore")

In [4]:
from pandas.api.types import is_datetime64_ns_dtype

# Data

In [5]:
path='/kaggle/input/zzzs-lightweight-training-dataset-target/Zzzs_train.parquet'


# Feature Engineering

In [6]:
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` in place to save memory.

    Integer columns (signed or unsigned) are cast to the narrowest integer type
    of the same signedness whose range contains the column's min and max.
    Float columns are cast to float16 when their values fit, otherwise to
    float32 when they fit; a column is never widened. Columns of any other
    dtype (object, bool, datetime, category and other extension dtypes) are
    left untouched, as are columns whose min/max is NaN or infinite.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    int_candidates = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, tz-aware datetime, nullable ints, ...)
        # are not NumPy dtypes and must not be downcast.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind in int_candidates:
            candidates, limits = int_candidates[col_type.kind], np.iinfo
        elif col_type.kind == 'f':
            candidates, limits = float_candidates, np.finfo
        else:
            # object, bool, datetime64, timedelta64, ...
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            # Candidates are ordered by size: stop before widening the column.
            if np.dtype(candidate).itemsize > col_type.itemsize:
                break
            info = limits(candidate)
            # Inclusive bounds; comparisons are False for NaN/inf, which
            # leaves such columns unchanged.
            if info.min <= c_min and c_max <= info.max:
                df[col] = df[col].astype(candidate)
                break

    return df

In [7]:
def feature_programming(df):
    """Add hour-of-day and rolling-window features to ``df``, in place.

    ``df`` must provide ``timestamp``, ``series_id``, ``anglez`` and ``enmo``
    columns. The frame is sorted by timestamp and re-indexed on it. Then, for
    each window of 60, 360, 720 and 3600 rows, these columns are appended:

    * ``<signal>_roll_<w>``: centered rolling mean, edge gaps back/forward
      filled, stored as float16.
    * ``<signal>_momentum_<w>``: raw signal minus that rolling mean.
    * ``ratio_ang_<w>`` / ``ratio_enm_<w>``: momentum divided by the rolling
      mean, with inf and NaN replaced by 0, stored as float32.
    * ``<signal>_max_<w>`` / ``<signal>_min_<w>``: trailing rolling extrema,
      gaps filled, stored as float16.

    Returns the same, mutated DataFrame.
    """
    # Parse the timestamps, then strip the timezone element by element.
    parsed = pd.to_datetime(df['timestamp'])
    df['timestamp'] = parsed.apply(lambda ts: ts.tz_localize(None))
    df['hour'] = df['timestamp'].dt.hour

    # Order the series chronologically and index it by time.
    df['series_id'] = df['series_id'].astype('category')
    df.sort_values(['timestamp'], inplace=True)
    df.set_index('timestamp', inplace=True)

    signals = ('anglez', 'enmo')
    ratio_prefix = {'anglez': 'ratio_ang', 'enmo': 'ratio_enm'}

    for window in (60, 360, 720, 3600):
        # Smoothed level of each signal.
        for signal in signals:
            centered = df[signal].rolling(window=window, center=True)
            df[f'{signal}_roll_{window}'] = centered.mean().bfill().ffill().astype('float16')

        # Deviation of the raw signal from its smoothed level.
        for signal in signals:
            df[f'{signal}_momentum_{window}'] = df[signal] - df[f'{signal}_roll_{window}']

        # Deviation relative to the smoothed level; division blow-ups become 0.
        for signal in signals:
            ratio = df[f'{signal}_momentum_{window}'].div(df[f'{signal}_roll_{window}'])
            ratio = ratio.replace([np.inf, -np.inf], 0).fillna(0)
            df[f'{ratio_prefix[signal]}_{window}'] = ratio.astype('float32')

        # Envelope of each signal over the trailing window.
        for signal in signals:
            trailing = df[signal].rolling(window=window)
            df[f'{signal}_max_{window}'] = trailing.max().bfill().ffill().astype('float16')
            df[f'{signal}_min_{window}'] = trailing.min().bfill().ffill().astype('float16')

    gc.collect()
    return df

In [8]:
def feat_eng_by_id(idx):
    """Load one series from the training parquet and build its feature frame.

    Only the rows of ``path`` whose ``series_id`` equals ``idx`` are read. The
    ``awake`` label is cast to int8, then ``reduce_mem_usage`` and
    ``feature_programming`` are applied in that order.

    Returns the feature DataFrame for that series.
    """
    # Silence warnings from inside the function body.
    warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
    warnings.filterwarnings('ignore')

    # Read only the rows that belong to this series id.
    series_df = pd.read_parquet(path, filters=[('series_id', '=', idx)])
    series_df['awake'] = series_df['awake'].astype(np.int8)
    return feature_programming(reduce_mem_usage(series_df))

In [9]:

# Collect the distinct series ids so each series can be processed on its own.
# Only the `series_id` column is read, and the `path` constant is reused so this
# cell and feat_eng_by_id always read the same file.
series_id = list(pd.read_parquet(path, columns=['series_id'])['series_id'].unique())


In [10]:
%%time

# Build the feature frame of every series with 6 joblib workers.
# `train` ends up as a list holding one DataFrame per entry of `series_id`.
# `groupby` is imported here for the get_event helper defined further down.
from joblib import Parallel, delayed
from itertools import groupby

train=Parallel(n_jobs=6)(delayed(feat_eng_by_id)(i)for i in series_id)


CPU times: user 2.71 s, sys: 4.55 s, total: 7.26 s
Wall time: 3min 10s


In [11]:
# Stack the per-series frames into one table. ignore_index=True discards the
# timestamp index set by feature_programming and renumbers the rows from 0.
train=pd.concat(train,ignore_index=True)

In [12]:
# Keep every 60th row of the stacked table to shrink the training set.
# NOTE(review): this cell is not idempotent - running it twice subsamples twice.
train=train.iloc[::60]

# Model selection

In [13]:
train.columns

Index(['series_id', 'step', 'anglez', 'enmo', 'awake', 'hour',
       'anglez_roll_60', 'enmo_roll_60', 'anglez_momentum_60',
       'enmo_momentum_60', 'ratio_ang_60', 'ratio_enm_60', 'anglez_max_60',
       'anglez_min_60', 'enmo_max_60', 'enmo_min_60', 'anglez_roll_360',
       'enmo_roll_360', 'anglez_momentum_360', 'enmo_momentum_360',
       'ratio_ang_360', 'ratio_enm_360', 'anglez_max_360', 'anglez_min_360',
       'enmo_max_360', 'enmo_min_360', 'anglez_roll_720', 'enmo_roll_720',
       'anglez_momentum_720', 'enmo_momentum_720', 'ratio_ang_720',
       'ratio_enm_720', 'anglez_max_720', 'anglez_min_720', 'enmo_max_720',
       'enmo_min_720', 'anglez_roll_3600', 'enmo_roll_3600',
       'anglez_momentum_3600', 'enmo_momentum_3600', 'ratio_ang_3600',
       'ratio_enm_3600', 'anglez_max_3600', 'anglez_min_3600', 'enmo_max_3600',
       'enmo_min_3600'],
      dtype='object')

In [14]:
# Feature matrix: the raw signals and hour of day, followed by the ten derived
# columns that feature_programming creates for each rolling window.
feature_cols = ['anglez', 'enmo', 'hour']
for window in (60, 360, 720, 3600):
    feature_cols += [
        f'anglez_roll_{window}', f'enmo_roll_{window}',
        f'anglez_momentum_{window}', f'enmo_momentum_{window}',
        f'ratio_ang_{window}', f'ratio_enm_{window}',
        f'anglez_max_{window}', f'anglez_min_{window}',
        f'enmo_max_{window}', f'enmo_min_{window}',
    ]

X = train[feature_cols]
# Target column.
y = train['awake']
gc.collect()

31

In [15]:
# X and y have been extracted, so drop the `train` name and run the garbage collector.
del train
gc.collect()

0

# Random forest

In [16]:
from sklearn.model_selection import cross_validate

In [17]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest base learner for the stack; n_jobs=-1 is passed so fitting is parallelised.
rf = RandomForestClassifier(
    n_estimators=500,
    min_samples_leaf=300,
    random_state=42,
    n_jobs=-1,
)

# XGBoost 

In [18]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [19]:
# Hyper-parameters of the gradient-boosted-tree base learner.
# NOTE(review): 'gpu_hist' is deprecated from XGBoost 2.0 in favour of
# tree_method='hist' with device='cuda' - confirm against the installed version.
xgb_params = {
    'n_estimators': 920,
    'objective': "binary:logistic",
    'learning_rate': 0.02,
    'max_depth': 7,
    'subsample': 0.9,
    'colsample_bytree': 0.7,
    'random_state': 42,
    'tree_method': 'gpu_hist'}

# Build the estimator from the directly imported class. This assignment rebinds
# `xgb` from the module to the estimator, so going through `xgb.XGBClassifier`
# raised AttributeError whenever the cell was run a second time.
xgb = XGBClassifier(**xgb_params)

# LightGBM

In [20]:
!pip install  --upgrade pyarrow

Collecting pyarrow
  Obtaining dependency information for pyarrow from https://files.pythonhosted.org/packages/34/65/204f7c0d507056c37b56dddb3bd60f55744f2609c0f96a5e4ca91c67c42a/pyarrow-14.0.1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata
  Downloading pyarrow-14.0.1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Downloading pyarrow-14.0.1-cp310-cp310-manylinux_2_28_x86_64.whl (38.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.0/38.0 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 11.0.0
    Uninstalling pyarrow-11.0.0:
      Successfully uninstalled pyarrow-11.0.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cuml 23.8.0 requir

In [21]:
import lightgbm as lgb

# Hyper-parameters of the LightGBM base learner.
lgb_opt = {
    'boosting_type': 'gbdt',
    'num_leaves': 131,
    # NOTE(review): 'n_iter' and 'n_estimators' are both aliases of
    # num_iterations; keep only the one that is actually intended.
    'n_iter': 1500, 'verbose': -1, 'n_estimators': 850,
    # The target is a 0/1 label fed to LGBMClassifier, so train with the binary
    # log-loss objective. The previous 'l2' value is a regression loss whose
    # outputs are not probabilities.
    'objective': 'binary', 'learning_rate': 0.05670084478292278,
    # NOTE(review): 'min_child_samples' is an alias of 'min_data_in_leaf',
    # which is set below to a different value; keep only one of them.
    'min_child_samples': 20,
    'colsample_bytree': 0.6440444070196796, 'colsample_bynode': 0.637635804565811,
    'lambda_l1': 6.29090474401462, 'lambda_l2': 6.775341543233317,
    # NOTE(review): row subsampling only takes effect together with a non-zero
    # subsample_freq / bagging_freq - confirm whether bagging is wanted.
    'subsample': 0.9,
    'min_data_in_leaf': 95, 'max_depth': 39, 'max_bin': 630}

# From here on `lgb` names the estimator, not the lightgbm module.
lgb = lgb.LGBMClassifier(**lgb_opt)

# stacking

In [22]:
!pip install mlxtend



In [23]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from mlxtend.classifier import StackingCVClassifier

# Meta-learner stacked on top of the three base classifiers.
lr = LogisticRegression()

sclf = StackingCVClassifier(
    classifiers=[rf, lgb, xgb],
    meta_classifier=lr,
    random_state=42,
)

In [24]:
%%time
#output
for clf,label in zip([rf,lgb,xgb,sclf],['RF','Lgb','XGBoost','stackingClassifier']):
    scores=cross_val_score(clf,X,y,cv=5,scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
    gc.collect()

Accuracy: 0.96 (+/- 0.01) [RF]
Accuracy: 0.97 (+/- 0.00) [Lgb]
Accuracy: 0.97 (+/- 0.00) [XGBoost]
Accuracy: 0.96 (+/- 0.00) [stackingClassifier]
CPU times: user 3h 14min 15s, sys: 34.9 s, total: 3h 14min 50s
Wall time: 59min 48s


In [25]:
# Fit the stacking classifier on the full feature matrix; the inference cell below uses it.
sclf.fit(X,y)



# Inference and event extraction

In [26]:
def get_event(df):
    """Label the first and last row of every run of active ``smooth`` values.

    Rows are scanned in order and split into maximal consecutive runs that
    share the same ``series_id`` and the same "active" flag, where a row is
    active when its ``smooth`` value is neither 0 nor null. In an active run
    longer than one row, the first row is labelled ``'onset'`` and the last
    ``'wakeup'``. Every other row is labelled ``0``.

    Parameters
    ----------
    df : object with ``series_id`` and ``smooth`` attributes
        Both must be iterable and of equal length (e.g. a DataFrame).

    Returns
    -------
    list
        One label per input row, in input order.
    """
    def run_key(pair):
        sid, value = pair
        # bool() yields a plain Python bool even when `value` is a NumPy
        # scalar. The previous `v is False` identity test did not match
        # numpy.False_, which turned inactive runs into events.
        return sid, bool(value != 0 and not pd.isnull(value))

    labels = []
    for (_sid, active), run in groupby(zip(df.series_id, df.smooth), run_key):
        length = sum(1 for _ in run)
        if active and length > 1:
            labels.extend(['onset'] + [0] * (length - 2) + ['wakeup'])
        else:
            labels.extend([0] * length)
    return labels
#submission = test.loc[test["event"] != 0][["series_id","step","event","score"]].copy().reset_index(drop=True).reset_index(names="row_id")
#submission.to_csv('submission.csv', index=False)

In [27]:
cols_sub = ['series_id','step','event','score']
test_path = '/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet'

# Use exactly the columns the model was fitted on, in the same order.
feature_cols = list(X.columns)

# The median window must be odd. With an even window, the median of 0/1 labels
# is 0.5 whenever the window holds as many zeros as ones. That happens on every
# transition, so diff() yields two steps of 0.5 instead of one step of 1 and no
# event is ever labelled.
smooth_window = 361

series_id = pd.read_parquet(test_path, columns=['series_id'])
series_id = series_id.series_id.unique()

tests = []
for idx in series_id:
    # Same preprocessing as feat_eng_by_id applies to the training series.
    test = pd.read_parquet(test_path, filters=[('series_id','=',idx)])
    test = reduce_mem_usage(test)
    test = feature_programming(test)

    X_test = test[feature_cols].fillna(0)

    # Copy the slice so the column assignments below write to an independent frame.
    test = test.reset_index(drop=False)[['series_id', 'step', 'timestamp']].copy()

    test['probability'] = sclf.predict_proba(X_test)[:, 1]

    # Smooth the predicted labels so isolated flips do not create events.
    test['prediction'] = sclf.predict(X_test)
    test['prediction'] = test['prediction'].rolling(smooth_window, center=True).median()

    # Where the smoothed label is 0, score with the probability of class 0 instead.
    is_zero = test['prediction'] == 0
    test.loc[is_zero, 'probability'] = 1 - test.loc[is_zero, 'probability']

    test['score'] = test['probability'].rolling(360, center=True, min_periods=10).mean().bfill().ffill()

    # A 0 -> 1 step in the smoothed label is a wakeup, a 1 -> 0 step an onset.
    # Every other row maps to NaN.
    test['pred_diff'] = test['prediction'].diff()
    test['event'] = test['pred_diff'].map({1: 'wakeup', -1: 'onset'})

    # Keep the first wakeup and the last onset of each calendar day.
    test_wakeup = test[test['event']=='wakeup'].groupby(test['timestamp'].dt.date).agg('first')
    test_onset = test[test['event']=='onset'].groupby(test['timestamp'].dt.date).agg('last')
    test = pd.concat([test_wakeup, test_onset], ignore_index=True).sort_values('timestamp')

    out = test[cols_sub]
    tests.append(out)
    

# submission

In [28]:
# Stack the per-series event tables and expose the running row number as `row_id`.
submission = pd.concat(tests, ignore_index=True)
submission = submission.reset_index(names='row_id')
submission.to_csv('submission.csv', index=False)
submission



Unnamed: 0,row_id,series_id,step,event,score
