In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the four competition CSV files; the unpacking order matches the order
# of the paths listed below.
train, test, submission, train_labels = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-apr-2022/train.csv",
        "../input/tabular-playground-series-apr-2022/test.csv",
        "../input/tabular-playground-series-apr-2022/sample_submission.csv",
        "../input/tabular-playground-series-apr-2022/train_labels.csv",
    )
)

In [None]:
def downcastMemoryUsage(dataFrame):
    """Reduce a DataFrame's memory footprint by narrowing column dtypes in place.

    Integer columns are cast to the first type of
    (uint8, uint16, uint32, uint64, int8, int16, int32, int64) whose range
    contains both the column minimum and maximum (bounds inclusive).  Float
    columns are handled the same way over (float16, float32, float64).
    Object columns are converted to ``category`` when fewer than half of
    their entries are distinct.  Memory usage before and after, and the
    relative saving, are printed.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame to optimise.  It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The very same object that was passed in.

    Notes
    -----
    The float check looks at the value *range* only, so a cast to
    float16/float32 can round the stored values.
    """
    startMemoryOptimization = dataFrame.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is: \033[1m{:.2f} MB\033[0m'.format(startMemoryOptimization))
    subTypeInt = ['uint8','uint16','uint32','uint64','int8','int16','int32','int64']
    subTypeFloat = ['float16','float32','float64']
    for column in dataFrame.columns:
        columnType = str(dataFrame[column].dtypes)
        if 'int' in columnType:
            candidateTypes, typeInfo = subTypeInt, np.iinfo
        elif 'float' in columnType:
            candidateTypes, typeInfo = subTypeFloat, np.finfo
        elif 'object' in columnType:
            numberOfUnique = len(dataFrame[column].unique())
            numberOfTotal = len(dataFrame[column])
            # An empty column has no ratio to compute; leave it untouched.
            if numberOfTotal and numberOfUnique / numberOfTotal < 0.5:
                dataFrame[column] = dataFrame[column].astype('category')
            continue
        else:
            continue
        # min/max are only meaningful (and only needed) for numeric columns;
        # computing them for object columns can raise on mixed values.
        minimumColumn = dataFrame[column].min()
        maximumColumn = dataFrame[column].max()
        for element in candidateTypes:
            limits = typeInfo(element)
            # Inclusive bounds: a column spanning exactly 0..255 fits uint8.
            # NaN extrema (empty / all-NaN column) fail every comparison, so
            # such columns keep their dtype.
            if minimumColumn >= limits.min and maximumColumn <= limits.max:
                dataFrame[column] = dataFrame[column].astype(element)
                break
    endMemoryOptimization = dataFrame.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: \033[1m{:.2f} MB\033[0m'.format(endMemoryOptimization))
    # Guard against a zero starting size so the summary never divides by zero.
    if startMemoryOptimization:
        compression = 100*(startMemoryOptimization - endMemoryOptimization) / startMemoryOptimization
    else:
        compression = 0.0
    print('Compressed by: \033[1m{:.2f} %\033[0m'.format(compression))
    return dataFrame

In [None]:
# Downcast both frames (train first, then test) and rebind the names to the
# returned frames.
train, test = (downcastMemoryUsage(frame) for frame in (train, test))

In [None]:
# Display the first rows of the test frame.
test.head()

In [None]:
# Count the missing values in each column of the training frame.
train.isnull().sum()

In [None]:
from sklearn.preprocessing import QuantileTransformer
from scipy.signal import wiener


# Raw sensor columns only: the derived *_q / *_wiefilt columns are skipped so
# that re-running this cell does not transform its own outputs.
derived_suffixes = ('_q', '_wiefilt')
cols = [col for col in train.columns
        if 'sensor_' in col and not col.endswith(derived_suffixes)]
q_cols = []
wiener_cols = []

for col in cols:
    # Fit the quantile transformer on the train+test values of this column.
    # Only the column itself is concatenated; concatenating the two whole
    # frames on every iteration copied every column each time.
    raw_vec = pd.concat([train[col], test[col]]).to_numpy().reshape(-1, 1)
    transformer = QuantileTransformer(n_quantiles = 9, random_state = 42, output_distribution = "normal")
    transformer.fit(raw_vec)
    train[col+'_q'] = transformer.transform(train[col].to_numpy().reshape(-1, 1)).ravel()
    test[col+'_q'] = transformer.transform(test[col].to_numpy().reshape(-1, 1)).ravel()
    # Run the Wiener filter in float64.  The columns may have been downcast
    # (possibly to float16) earlier in the notebook, and the filter's local
    # variance estimate squares the signal, which overflows in half precision
    # -- presumably the source of the NaNs that later cells fill.
    # NOTE(review): the filter runs over the whole column, not per sequence.
    train[col+'_wiefilt'] = wiener(train[col].to_numpy(dtype=np.float64))
    test[col+'_wiefilt'] = wiener(test[col].to_numpy(dtype=np.float64))
    wiener_cols.append(col+'_wiefilt')
    q_cols.append(col+'_q')

In [None]:
# Print the column dtypes, non-null counts and memory usage of the training frame.
train.info()

In [None]:
# Count the missing values in each column of the test frame.
test.isnull().sum()

In [None]:
# Median-impute the listed Wiener-filtered columns of the test frame, each
# with its own column median, in the order given.
for filtered_name in (
    'sensor_12_wiefilt',
    'sensor_11_wiefilt',
    'sensor_08_wiefilt',
    'sensor_06_wiefilt',
    'sensor_00_wiefilt',
    'sensor_01_wiefilt',
    'sensor_03_wiefilt',
    'sensor_07_wiefilt',
    'sensor_09_wiefilt',
):
    filtered = test[filtered_name]
    test[filtered_name] = filtered.fillna(filtered.median())

In [None]:
# Median-impute the listed Wiener-filtered columns of the training frame,
# each with its own column median, in the order given.
for filtered_name in (
    'sensor_12_wiefilt',
    'sensor_11_wiefilt',
    'sensor_08_wiefilt',
    'sensor_06_wiefilt',
    'sensor_00_wiefilt',
    'sensor_01_wiefilt',
    'sensor_03_wiefilt',
    'sensor_07_wiefilt',
    'sensor_09_wiefilt',
):
    filtered = train[filtered_name]
    train[filtered_name] = filtered.fillna(filtered.median())

In [None]:
# Lag-12 difference for sensors 00-11 of the training frame: each value minus
# the value 12 rows earlier.  The first 12 rows have no predecessor and are
# compared against 0.
# NOTE(review): the shift runs over the whole column rather than per
# sequence -- confirm that is intended.
for diff_name, sensor_name in (
    ("s0-12", "sensor_00"),
    ("s1-12", "sensor_01"),
    ("s2-12", "sensor_02"),
    ("s3-12", "sensor_03"),
    ("s4-12", "sensor_04"),
    ("s5-12", "sensor_05"),
    ("s6-12", "sensor_06"),
    ("s7-12", "sensor_07"),
    ("s8-12", "sensor_08"),
    ("s9-12", "sensor_09"),
    ("s10-12", "sensor_10"),
    ("s11-12", "sensor_11"),
):
    train[diff_name] = train[sensor_name] - train[sensor_name].shift(periods=12, fill_value=0)

In [None]:
# Lag-12 difference for sensors 00-11 of the test frame: each value minus the
# value 12 rows earlier.  The first 12 rows have no predecessor and are
# compared against 0.
# NOTE(review): the shift runs over the whole column rather than per
# sequence -- confirm that is intended.
for diff_name, sensor_name in (
    ("s0-12", "sensor_00"),
    ("s1-12", "sensor_01"),
    ("s2-12", "sensor_02"),
    ("s3-12", "sensor_03"),
    ("s4-12", "sensor_04"),
    ("s5-12", "sensor_05"),
    ("s6-12", "sensor_06"),
    ("s7-12", "sensor_07"),
    ("s8-12", "sensor_08"),
    ("s9-12", "sensor_09"),
    ("s10-12", "sensor_10"),
    ("s11-12", "sensor_11"),
):
    test[diff_name] = test[sensor_name] - test[sensor_name].shift(periods=12, fill_value=0)

In [None]:
def aggregated_features(df, aggregation_cols = ['sequence'], prefix = ''):
    """Aggregate the 13 raw sensor columns per group.

    For every column ``sensor_00`` .. ``sensor_12`` the mean, max, min,
    variance, mean absolute deviation, sum and median are computed within
    each group, together with the group size.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``aggregation_cols`` and ``sensor_00`` .. ``sensor_12``.
    aggregation_cols : list of str
        Columns to group by.  The default list is never modified.
    prefix : str
        Text prepended to every generated column name.

    Returns
    -------
    pandas.DataFrame
        One row per group, with columns: the grouping columns,
        ``<prefix>size``, then ``<prefix>sensor_XX_<stat>`` for each sensor
        and statistic.
    """
    def mad(values):
        # Mean absolute deviation around the mean.  The built-in 'mad'
        # aggregation was removed in pandas 2.0; naming this callable ``mad``
        # keeps the generated column names ('sensor_XX_mad') unchanged.
        return (values - values.mean()).abs().mean()

    statistics = ['mean', 'max', 'min', 'var', mad, 'sum', 'median']
    agg_strategy = {'sensor_{:02d}'.format(idx): list(statistics) for idx in range(13)}

    grouped = df.groupby(aggregation_cols)
    group = grouped.aggregate(agg_strategy)
    # Flatten the (sensor, statistic) column MultiIndex and apply the prefix.
    group.columns = [str(prefix) + '_'.join(col).strip() for col in group.columns]
    group = group.reset_index()

    # Group sizes go first so the column order stays: keys, size, statistics.
    temp = grouped.size().reset_index(name = str(prefix) + 'size')
    return pd.merge(temp, group, how = 'left', on = aggregation_cols)

In [None]:
# Per-(sequence, subject) sensor statistics for the train and test frames.
trn_merge_data, tst_merge_data = (
    aggregated_features(frame, aggregation_cols = ['sequence', 'subject'])
    for frame in (train, test)
)

In [None]:
# Per-subject sensor statistics; generated columns carry the 'subject_' prefix.
trn_subjects_merge_data, tst_subjects_merge_data = (
    aggregated_features(frame, aggregation_cols = ['subject'], prefix = 'subject_')
    for frame in (train, test)
)

In [None]:
# Left-join the labels onto the per-sequence training aggregates by 'sequence'.
trn_merge_data = pd.merge(trn_merge_data, train_labels, how = 'left', on = 'sequence')

In [None]:
# Print the column dtypes, non-null counts and memory usage of the merged training aggregates.
trn_merge_data.info()

In [None]:
# Left-join the per-subject statistics onto the per-sequence rows by 'subject',
# train with train and test with test.
trn_merge_data = pd.merge(trn_merge_data, trn_subjects_merge_data, how = 'left', on = 'subject')
tst_merge_data = pd.merge(tst_merge_data, tst_subjects_merge_data, how = 'left', on = 'subject')

In [None]:
# Display the merged training feature table.
trn_merge_data

In [None]:
# Swap infinite values in the training features for finite sentinels
# (-inf -> -9999, +inf -> 9999).
trn_merge_data = trn_merge_data.replace({-np.inf: -9999, np.inf: 9999})

In [None]:
# Swap infinite values in the test features for finite sentinels
# (-inf -> -9999, +inf -> 9999).
tst_merge_data = tst_merge_data.replace({-np.inf: -9999, np.inf: 9999})

In [None]:
# Every column of the merged training table is a model input except the two
# grouping keys and the target.
target_feature = 'state'
ignore = ['sequence', 'state', 'subject']
features = [
    column_name
    for column_name in trn_merge_data.columns
    if column_name not in ignore
]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the aggregated rows for validation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    trn_merge_data[features],
    trn_merge_data[target_feature],
    test_size = 0.2,
    random_state = 7575,
)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [None]:
# Train CatBoost on the GPU with early stopping against the hold-out split.
# The class name was misspelled ("CaCatBoostClassifier"), which raised a
# NameError.  CatBoost also takes its metrics at construction time, not in
# fit(): Logloss drives early stopping and AUC is tracked alongside it.
model = CatBoostClassifier(n_estimators = 10000,
                           task_type = "GPU",
                           eval_metric = 'Logloss',
                           custom_metric = ['AUC'])
model.fit(X_train, y_train,
        eval_set = [(X_test, y_test)],
        early_stopping_rounds = 64,
        verbose = 32)

In [None]:
from sklearn.metrics import roc_auc_score
# Hold-out ROC AUC of the CatBoost model, scored on column 1 of predict_proba.
preds = model.predict_proba(X_test)[:, 1]
score = roc_auc_score(y_true = y_test, y_score = preds)
print(score)

In [None]:
# CatBoost predictions for the test aggregates: column 1 of predict_proba.
preds_cat = model.predict_proba(tst_merge_data[features])[:, 1]

In [None]:
# Train XGBoost with early stopping against the hold-out split.
# eval_metric and early_stopping_rounds are set on the estimator: passing
# them to fit() was deprecated in XGBoost 1.6 and removed in 2.0.
# With several metrics, early stopping follows the last one listed (logloss).
# NOTE(review): tree_method='gpu_hist' is kept as written; newer XGBoost
# releases prefer tree_method='hist' with device='cuda' -- confirm against
# the installed version.
xgb = XGBClassifier(n_estimators = 10000,
                    tree_method = 'gpu_hist',
                    eval_metric = ['auc', 'logloss'],
                    early_stopping_rounds = 64)
xgb.fit(X_train, y_train,
        eval_set = [(X_test, y_test)],
        verbose = 32)

In [None]:
# Hold-out ROC AUC of the XGBoost model, scored on column 1 of predict_proba.
preds = xgb.predict_proba(X_test)[:, 1]
score = roc_auc_score(y_true = y_test, y_score = preds)
print(score)

In [None]:
# XGBoost predictions for the test aggregates: column 1 of predict_proba.
preds_xgb = xgb.predict_proba(tst_merge_data[features])[:, 1]

In [None]:
# Blend the two models' test predictions (70% XGBoost, 30% CatBoost) and
# write the submission file without the index column.
# NOTE(review): assumes the rows of tst_merge_data line up with the rows of
# the sample submission -- confirm.
submission['state'] = preds_xgb * 0.7 + preds_cat * 0.3
submission.to_csv('sub_blend3.csv', index = False)