In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import gc
import warnings
# Suppress every warning category from here on (this also hides deprecation
# notices from the libraries used below).
warnings.filterwarnings("ignore")

In [None]:
from pandas.api.types import is_datetime64_ns_dtype

# Data

In [None]:
# Parquet file holding the training data; the same file is read further down
# to list the available series ids.
path='/kaggle/input/zzzs-lightweight-training-dataset-target/Zzzs_train.parquet'


# Feature Engineering

In [None]:
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` in place to save memory.

    Integer columns are converted to the narrowest integer type of the same
    signedness whose range contains the column's observed minimum and maximum
    (bounds inclusive). Floating-point columns are converted to ``float16``
    when their largest finite magnitude fits, otherwise to ``float32``; a
    column that fits neither is left as it is. Columns of any other dtype
    (object, boolean, datetime, categorical and other extension dtypes) are
    never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the call can be chained.
    """
    # Candidate target types, narrowest first, keyed by numpy dtype kind.
    int_candidates = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, tz-aware datetime, nullable, string)
        # are not numpy dtypes and must not be cast to a plain numeric type.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind in int_candidates:
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in int_candidates[col_type.kind]:
                bounds = np.iinfo(candidate)
                # An empty column yields NaN here, so no candidate matches
                # and the column keeps its dtype.
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type.kind == 'f':
            # Infinities are ignored when sizing: they survive any float cast.
            abs_max = df[col].abs().replace(np.inf, np.nan).max()
            for candidate in float_candidates:
                # "not >" (rather than "<=") lets all-NaN / empty columns
                # through to the narrowest type.
                if not abs_max > np.finfo(candidate).max:
                    df[col] = df[col].astype(candidate)
                    break

    return df

In [None]:
def feature_programming(df, delta_t=(60, 360, 720, 3600)):
    """Build rolling-window features for the ``anglez`` and ``enmo`` signals.

    The input frame is not modified; a new frame is returned, sorted by
    timestamp and indexed by it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``timestamp``, ``series_id``, ``anglez`` and
        ``enmo``. The rolling windows run over all rows of the frame in
        timestamp order.
    delta_t : iterable of int, optional
        Rolling window lengths, in rows. For every window ``w`` the columns
        ``anglez_roll_w``, ``enmo_roll_w``, ``anglez_momentum_w``,
        ``enmo_momentum_w``, ``ratio_ang_w``, ``ratio_enm_w``,
        ``anglez_max_w``, ``anglez_min_w``, ``enmo_max_w`` and ``enmo_min_w``
        are added, in that order.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with an ``hour`` column, ``series_id`` as category,
        the timestamp as index and the feature columns listed above.
    """
    # Drop the timezone while keeping the local wall-clock time.
    timestamps = pd.to_datetime(df['timestamp'])
    if pd.api.types.is_datetime64_any_dtype(timestamps):
        if timestamps.dt.tz is not None:
            timestamps = timestamps.dt.tz_localize(None)
    else:
        # Fallback when to_datetime did not produce a datetime column
        # (e.g. an object column of Timestamps): strip element by element.
        timestamps = timestamps.apply(lambda t: t.tz_localize(None))

    # assign() returns a new frame, so the caller's frame stays untouched.
    df = df.assign(timestamp=timestamps, hour=timestamps.dt.hour)
    df['series_id'] = df['series_id'].astype('category')
    df = df.sort_values('timestamp').set_index('timestamp')

    signals = ('anglez', 'enmo')
    ratio_names = {'anglez': 'ratio_ang_', 'enmo': 'ratio_enm_'}

    for window in delta_t:
        suffix = str(window)

        # Centered simple moving average; edge gaps are back-/forward-filled.
        for signal in signals:
            df[signal + '_roll_' + suffix] = (
                df[signal].rolling(window=window, center=True).mean()
                .bfill().ffill().astype('float16')
            )

        # Deviation of the raw signal from its moving average.
        for signal in signals:
            df[signal + '_momentum_' + suffix] = df[signal] - df[signal + '_roll_' + suffix]

        # Deviation relative to the moving average; undefined or infinite
        # ratios (zero average) are replaced by 0.
        for signal in signals:
            df[ratio_names[signal] + suffix] = (
                df[signal + '_momentum_' + suffix]
                .div(df[signal + '_roll_' + suffix])
                .fillna(0).replace([np.inf, -np.inf], 0).astype('float16')
            )

        # Trailing-window extremes.
        for signal in signals:
            trailing = df[signal].rolling(window=window)
            df[signal + '_max_' + suffix] = trailing.max().bfill().ffill().astype('float16')
            df[signal + '_min_' + suffix] = trailing.min().bfill().ffill().astype('float16')

    gc.collect()

    return df

In [None]:

def feat_eng_by_id(idx, file=None):
    """Load one series from a parquet file and build its features.

    Parameters
    ----------
    idx :
        Value of ``series_id`` to load; used as a row filter when reading.
    file : str or path-like, optional
        Parquet file to read. Defaults to the module-level ``path``.

    Returns
    -------
    pandas.DataFrame
        The result of ``feature_programming`` for the selected series, with
        ``awake`` stored as ``int8``.
    """
    if file is None:
        file = path

    # Set inside the function so the filter is also applied when the function
    # runs in a separate worker process (presumably why it is not set once at
    # module level - confirm).
    warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

    # Read only the rows that belong to this series id.
    df = pd.read_parquet(file, filters=[('series_id', '=', idx)])
    df['awake'] = df['awake'].astype(np.int8)
    df = reduce_mem_usage(df)
    df = feature_programming(df)

    return df


In [None]:
# Read only the series_id column and keep its distinct values as a plain list.
series_id = list(
    pd.read_parquet(
        '/kaggle/input/zzzs-lightweight-training-dataset-target/Zzzs_train.parquet',
        columns=['series_id'],
    )['series_id'].unique()
)

In [None]:

from joblib import Parallel, delayed
from itertools import groupby

# Build the features of every series in parallel (one task per series id) and
# stack the per-series frames into a single DataFrame, since the cells below
# use `train.columns` and `train[...]`.
train = pd.concat(
    Parallel(n_jobs=6)(delayed(feat_eng_by_id)(i, path) for i in series_id)
)

# Model selection

In [None]:
# Display the column labels of `train` (assumes `train` is a DataFrame here).
train.columns

In [None]:

# Name of the target column.
y='awake'

# Feature columns used by the model cells below (`train[X]`, `test[X]`).
# NOTE(review): `X` was referenced but never defined. Excluding the target and
# the identifier columns `series_id` / `step` is an assumption - confirm the
# intended feature set.
X = [col for col in train.columns if col not in (y, 'series_id', 'step')]

# RF

In [None]:
import scipy
import cudf as cu
from sklearn.ensemble import RandomForestClassifier 
RF = RandomForestClassifier(n_estimators=1000,
                            min_samples_leaf=300,
                            random_state=42,
                            n_jobs=-1)

# feature_importances_ only exists on a fitted estimator, so fit first.
RF.fit(train[X], train[y])

feat_labels = train[X].columns
importances = RF.feature_importances_

# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

for rank, feature_pos in enumerate(indices, start=1):
    print("%2d) %-*s %f" % (rank, 30, feat_labels[feature_pos], importances[feature_pos]))


In [None]:
# Create a DataFrame to store each feature's importance
#feature_importance=pd.DataFrame({'feature':train.columns,'importance':select.feature_importance_})

# XGBoost -gpu

In [None]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV


import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize']=12,4


#XGBoost models and cv
def modelfit(alg,x,y,useTrainCV=True,cv_folds=5,early_stopping_rounds=50):
    """Fit an XGBoost classifier and report its training-set performance.

    Parameters
    ----------
    alg : xgboost.XGBClassifier
        Estimator to fit; it is modified in place.
    x : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Binary target.
    useTrainCV : bool, optional
        When True, run ``xgboost.cv`` first and set ``n_estimators`` on ``alg``
        to the number of rounds kept by early stopping.
    cv_folds : int, optional
        Number of folds for that cross-validation.
    early_stopping_rounds : int, optional
        Early-stopping patience for that cross-validation.

    Returns
    -------
    None
        Prints training accuracy and AUC and draws a bar chart of feature
        importances.
    """
    # Imported locally: a later cell rebinds the global name `xgb` to a
    # classifier instance, and `metrics` is not imported anywhere in the file.
    import xgboost
    from sklearn import metrics

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgboost.DMatrix(x.values, label=y.values)
        cvresult = xgboost.cv(xgb_param, xgtrain,
                              num_boost_round=alg.get_params()['n_estimators'],
                              nfold=cv_folds, metrics='auc',
                              early_stopping_rounds=early_stopping_rounds)
        # One row per boosting round that survived early stopping.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data. No eval_set is passed, so an evaluation
    # metric would have nothing to score; the fit() keyword for it has also
    # been deprecated and dropped in recent XGBoost releases.
    alg.fit(x, y)

    # Predict on the training set.
    dtrain_predictions = alg.predict(x)
    dtrain_predprob = alg.predict_proba(x)[:, 1]

    # Print the model report.
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(y.values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(y, dtrain_predprob))

    # get_booster() is the current accessor; the old booster() method is gone.
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

In [None]:
#learning rate and tree_based
#initialization

# NOTE(review): this rebinds the name `xgb` (the xgboost module imported
# above) to a classifier instance; later cells refer to the classifier by
# this name.
# `n_jobs` / `random_state` are the documented scikit-learn-API names for the
# former `nthread` / `seed` keywords.
# NOTE(review): recent XGBoost releases prefer device="cuda" with
# tree_method="hist" over gpu_id / "gpu_hist" - confirm against the installed
# version before switching.
xgb = XGBClassifier(learning_rate=0.1,
                    n_estimators=1000,
                    max_depth=5,
                    min_child_weight=1,
                    gamma=0,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective='binary:logistic',
                    n_jobs=4,
                    scale_pos_weight=1,
                    random_state=42,
                    gpu_id=0,
                    tree_method="gpu_hist")


# lightgbm

In [None]:
!pip install  --upgrade pyarrow

In [None]:
import lightgbm as lgb
# LightGBM hyper-parameters. The device key must be spelled exactly 'device';
# a key with a stray quote character inside it is not a known parameter, so
# the GPU setting would not take effect.
lgb_opt = {
    'num_leaves': 204,
    'learning_rate': 0.076,
    'random_state': 42,
    'device': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
}

# NOTE(review): this rebinds `lgb` from the lightgbm module to the classifier
# instance; later cells refer to the classifier by this name.
lgb = lgb.LGBMClassifier(**lgb_opt)

# stacking

In [None]:
!pip install mlxtend

In [None]:
git clone --recursive [https://github.com/dmlc/xgboost.git](https://github.com/dmlc/xgboost.git)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from mlxtend.classifier import StackingCVClassifier
# Logistic regression combines the base models' predictions.
lr = LogisticRegression()

sclf = StackingCVClassifier(classifiers=[RF, lgb, xgb],
                            meta_classifier=lr,
                            random_state=42)

# 5-fold cross-validated accuracy of each base model and of the stack.
for clf, label in zip([RF, lgb, xgb, sclf], ['RF', 'Lgb', 'XGBoost', 'stackingClassifier']):
    scores = cross_val_score(clf, train[X], train[y], cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

# cross_val_score fits clones of the estimator and leaves the original
# unfitted, so fit the stack on the full training data before it is used for
# prediction.
sclf.fit(train[X], train[y])
    

# output

In [None]:
from sklearn.metrics import roc_curve, auc
raw_test = pd.read_parquet('/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet')

# Build the features one series at a time, with the same default windows as
# for training, so that the rolling statistics never span two series and the
# resulting columns match the ones the models were trained on.
test = pd.concat(
    feature_programming(series_df.copy())
    for _, series_df in raw_test.groupby('series_id', sort=False, observed=True)
)
X_test = test[X]
# Predicted probability of the positive class (column 1 of predict_proba).
test["score"] = sclf.predict_proba(X_test)[:,1]

test["not_awake"] = 1-test["score"]
# exponential smoothing of the predictions
test["smooth"] = test["not_awake"].ewm(span = 100).mean()
# re-binarize
test["smooth"] = test["smooth"].round()


def get_event(df):
    """Label the start and end of every run of non-zero ``smooth`` values.

    Rows are scanned in order and grouped into runs of consecutive rows that
    share the same ``series_id`` and the same state, where a row is "active"
    when its ``smooth`` value is neither null nor zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``series_id`` and ``smooth`` columns.

    Returns
    -------
    list
        One entry per row: ``'onset'`` for the first row and ``'wakeup'`` for
        the last row of each active run longer than one row, ``0`` elsewhere.
    """
    # Imported here so the function does not depend on an import made in
    # another notebook cell.
    from itertools import groupby

    def run_key(pair):
        series, value = pair
        # Test for null before comparing, so missing values never have to be
        # evaluated for truthiness.
        active = (not pd.isnull(value)) and value != 0
        return series, bool(active)

    events = []
    for (_, active), run in groupby(zip(df.series_id, df.smooth), key=run_key):
        run_length = sum(1 for _ in run)
        if active and run_length > 1:
            events.extend(['onset'] + [0] * (run_length - 2) + ['wakeup'])
        else:
            # Inactive runs, and active runs of a single row, carry no event.
            events.extend([0] * run_length)
    return events

test["event"] = get_event(test)


# Keep only the rows that carry an event, number them as row_id and write the
# submission file.
event_rows = test.loc[test["event"] != 0]
submission_columns = ["series_id","step","event","score"]
sample_submission = event_rows[submission_columns].copy()
sample_submission = sample_submission.reset_index(drop=True)
sample_submission = sample_submission.reset_index(names="row_id")
sample_submission.to_csv('submission.csv', index=False)
