In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Read the three competition CSV files: training data, test data and the
# sample submission that is filled in at the end of the notebook.
train = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-may-2022/train.csv'
)
test = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-may-2022/test.csv'
)
submission = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-may-2022/sample_submission.csv'
)

In [None]:
def downcastMemoryUsage(dataFrame):
    """Reduce a DataFrame's memory footprint by downcasting its columns in place.

    Integer columns are cast to the first dtype in ``subTypeInt`` whose range
    contains every value, float columns to the first dtype in ``subTypeFloat``
    whose range contains every value, and object columns are converted to
    ``category`` when fewer than half of their entries are distinct. Columns
    of any other dtype are left untouched. Memory usage before and after, and
    the relative saving, are printed.

    Args:
        dataFrame: pandas DataFrame to optimise. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    startMemoryOptimization = dataFrame.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is: \033[1m{:.2f} MB\033[0m'.format(startMemoryOptimization))
    # Candidates are tried in this order; the first one that fits wins.
    subTypeInt = ['uint8','uint16','uint32','uint64','int8','int16','int32','int64']
    # NOTE: the float check only looks at the value range, so float16 may be
    # chosen even though it stores values with reduced precision.
    subTypeFloat = ['float16','float32','float64']
    for column in dataFrame.columns:
        columnType = str(dataFrame[column].dtypes)
        if 'int' in columnType:
            candidateTypes, typeLimits = subTypeInt, np.iinfo
        elif 'float' in columnType:
            candidateTypes, typeLimits = subTypeFloat, np.finfo
        elif 'object' in columnType:
            numberOfUnique = len(dataFrame[column].unique())
            numberOfTotal = len(dataFrame[column])
            # Guard the ratio against an empty column (division by zero).
            if numberOfTotal and numberOfUnique / numberOfTotal < 0.5:
                dataFrame[column] = dataFrame[column].astype('category')
            continue
        else:
            # Other dtypes (bool, category, datetime, ...) are not downcast.
            continue
        # min/max are only computed for numeric columns: calling them on an
        # unordered categorical column raises TypeError, which made the
        # function fail when re-run on its own output.
        minimumColumn = dataFrame[column].min()
        maximumColumn = dataFrame[column].max()
        for element in candidateTypes:
            limits = typeLimits(element)
            # Inclusive bounds: a value equal to the dtype's min or max is
            # still representable (e.g. 0 fits uint8, -128 fits int8).
            if minimumColumn >= limits.min and maximumColumn <= limits.max:
                dataFrame[column] = dataFrame[column].astype(element)
                break
    endMemoryOptimization = dataFrame.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: \033[1m{:.2f} MB\033[0m'.format(endMemoryOptimization))
    # Avoid dividing by zero when the frame reports no memory usage at all.
    if startMemoryOptimization:
        compressedBy = 100*(startMemoryOptimization - endMemoryOptimization) / startMemoryOptimization
    else:
        compressedBy = 0.0
    print('Compressed by: \033[1m{:.2f} %\033[0m'.format(compressedBy))
    return dataFrame

In [None]:
# Shrink both frames' dtypes; the function mutates and returns its argument,
# so the names are simply rebound to the same (now smaller) frames.
train = downcastMemoryUsage(dataFrame=train)
test = downcastMemoryUsage(dataFrame=test)

In [None]:
# Preview the first rows of the training frame after downcasting.
train.head()

In [None]:
from sklearn.preprocessing import QuantileTransformer
from scipy.signal import wiener
from scipy.stats import skew, variation , kurtosis

# Base numeric features only: names of the form f_<digits>, excluding the
# string column f_27. Matching the exact pattern (instead of any name that
# merely contains 'f_') keeps derived columns such as 'f_00_q' out of the
# list, so re-running this cell does not stack transforms on earlier outputs.
cols = [col for col in train.columns
        if col.startswith('f_') and col[2:].isdigit() and col != 'f_27']
q_cols = []
wiener_cols = []
for col in cols:
    train_vec = train[col].values.reshape(-1, 1)
    test_vec = test[col].values.reshape(-1, 1)
    # The quantile mapping is fitted on train and test values together.
    # Only this one column is concatenated; concatenating the two full
    # (and steadily growing) frames on every iteration is not needed.
    transformer = QuantileTransformer(n_quantiles = 9, random_state = 42, output_distribution = "normal")
    transformer.fit(np.concatenate([train_vec, test_vec]))
    train[col+'_q'] = transformer.transform(train_vec).ravel()
    test[col+'_q'] = transformer.transform(test_vec).ravel()
    # Wiener filter applied along the row order of each frame.
    train[col+'_wiefilt'] = wiener(train[col])
    test[col+'_wiefilt'] = wiener(test[col])
    wiener_cols.append(col+'_wiefilt')
    q_cols.append(col+'_q')

# Distribution statistics.
# The previous `Series.apply(skew)` (and variation / kurtosis) form called
# scipy once per scalar cell: a statistic of a single number is degenerate
# (a constant or NaN) and millions of Python-level calls are very slow.
# NOTE(review): the statistics are now computed per row across the base
# feature columns, which is assumed to be the intended feature - confirm.
for frame in (train, test):
    base_values = frame[cols].to_numpy(dtype=np.float64)
    frame['row_skew'] = skew(base_values, axis=1)
    frame['row_kurtosis'] = kurtosis(base_values, axis=1)
    row_variation = variation(base_values, axis=1)
    # std/mean is infinite when a row mean is exactly zero; store NaN instead
    # so that no infinite values end up in the feature matrix.
    frame['row_variation'] = np.where(np.isfinite(row_variation), row_variation, np.nan)
skew_cols = ['row_skew']
kurtosis_cols = ['row_kurtosis']
variation_cols = ['row_variation']

In [None]:
# Preview the training frame again now that derived columns have been added.
train.head()

In [None]:
# Print the column list, dtypes and memory usage of the training frame.
train.info()

In [None]:
# Add the same hand-crafted features to both frames (in place).
for df in (train, test):
    # One integer column per character position 0-9 of the f_27 string:
    # the character's offset from 'A'.
    for i in range(10):
        df[f'ch{i}'] = df['f_27'].str.get(i).apply(ord) - ord('A')
    # Number of distinct characters in each f_27 string.
    df["unique_characters"] = df['f_27'].apply(lambda s: len(set(s)))
    # Ternary indicators built from feature sums: +1 above the upper
    # threshold, -1 below the lower threshold, 0 otherwise.
    sum_02_21 = df['f_21'] + df['f_02']
    df['i_02_21'] = (sum_02_21 > 5.2).astype(int) - (sum_02_21 < -5.3).astype(int)
    sum_05_22 = df['f_22'] + df['f_05']
    df['i_05_22'] = (sum_05_22 > 5.1).astype(int) - (sum_05_22 < -5.4).astype(int)
    i_00_01_26 = df['f_00'] + df['f_01'] + df['f_26']
    df['i_00_01_26'] = (i_00_01_26 > 5.0).astype(int) - (i_00_01_26 < -5.0).astype(int)

In [None]:
# Columns that must never be fed to the models: the row id, the label and the
# raw f_27 string (which is only used through the columns derived from it).
ignore = ['id', 'target', 'f_27']
# Build the feature list from `ignore` so there is a single place to edit;
# previously `ignore` was defined but the exclusions were repeated by hand.
features = [f for f in test.columns if f not in ignore]
target_feature = 'target'

In [None]:
# Display the final list of model input columns.
features

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the training rows for validation. Note that the held-out
# part is named X_test / y_test even though it comes from `train`.
X_train, X_test, y_train, y_test = train_test_split(
    train[features],
    train[target_feature],
    test_size=0.2,
    random_state=42,
)

In [None]:
from xgboost  import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [None]:
# Keyword arguments for the XGBoost classifier created later in the notebook.
params = dict(
    n_estimators=42096,
    objective='binary:logistic',
    tree_method='gpu_hist',
)

In [None]:
# Keyword arguments for the CatBoost classifier.
# NOTE(review): the fit call in this notebook also passes
# early_stopping_rounds=256 - confirm which of the two values is meant to apply.
cat_params = dict(
    iterations=11500,
    learning_rate=0.02,
    early_stopping_rounds=150,
    max_depth=5,
    eval_metric='Accuracy',
    loss_function='Logloss',
)

In [None]:
# Train CatBoost on the GPU, monitoring the held-out split for early stopping.
cat = CatBoostClassifier(task_type='GPU', **cat_params)
cat.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=256,
    verbose=250,
)

In [None]:
from sklearn.metrics import roc_auc_score
# ROC AUC of CatBoost on the held-out split, using the probability of class 1.
val_preds = cat.predict_proba(X_test[features])[:, 1]
roc_auc_score(y_true=y_test, y_score=val_preds)

In [None]:
# Train XGBoost with AUC-based early stopping on the held-out split.
xgb = XGBClassifier(**params)
xgb.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='auc',
    early_stopping_rounds=256,
    verbose=250,
)

In [None]:
from sklearn.metrics import roc_auc_score
# ROC AUC of XGBoost on the held-out split, using the probability of class 1.
val_preds = xgb.predict_proba(X_test[features])[:, 1]
roc_auc_score(y_true=y_test, y_score=val_preds)

In [None]:
# Print the column list, dtypes and memory usage of the training frame.
train.info()

In [None]:
# Display the test frame as this cell's output.
test

In [None]:
preds_lgb = lgb.predict_proba(test[features])[:, 1]
preds_cat = cat.predict_proba(test[features])[:, 1]

In [None]:
preds_lgb = lgb.predict_proba(test[features])[:, 1]
preds_xgb = xgb.predict_proba(test[features])[:, 1]

In [None]:
lgb = LGBMClassifier(n_estimators = 8000 , device = "gpu" , min_child_samples=80,num_leaves=127,subsample=0.85, subsample_freq=1)
lgb.fit(X_train, y_train,eval_metric = ['auc'], eval_set = [(X_test, y_test)],early_stopping_rounds = 256, verbose = 250)

In [None]:
from sklearn.metrics import roc_auc_score
val_preds = lgb.predict_proba(X_test[features])[:, 1]
roc_auc_score(y_test, val_preds)

In [None]:
# Average the test-set class-1 probabilities over two groups of models.
# NOTE(review): `models1` and `models2` are not defined anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel - confirm where
# they are supposed to come from, or remove the cell.
preds = []
for model in models1:
    pred = model.predict_proba(test[features])[:, 1]
    preds.append(pred)
# Element-wise mean over the models in the first group.
model1_pred = np.mean(preds, axis=0)
# Start a fresh list so the second average only covers models2.
preds = []
for model in models2:
    pred = model.predict_proba(test[features])[:, 1]
    preds.append(pred)
# Element-wise mean over the models in the second group.
model2_pred = np.mean(preds, axis=0)

In [None]:
# Display the list of per-model prediction arrays left in `preds`.
preds

In [None]:
# Print the averaged predictions of the second model group.
print(model2_pred)

In [None]:
# ROC AUC on the held-out split for `model`.
# NOTE(review): `model` is not assigned in this cell; it is whatever object a
# `for model in ...` loop last bound in an earlier cell - confirm that this is
# the model meant to be evaluated here.
val_preds = model.predict_proba(X_test[features])[:, 1]
roc_auc_score(y_test, val_preds)

In [None]:
# Display the submission frame before the target column is overwritten.
submission

In [None]:
# Weighted blend of the test predictions (70% LightGBM, 30% XGBoost),
# written out without the index column.
submission['target'] = 0.7 * preds_lgb + 0.3 * preds_xgb
submission.to_csv(path_or_buf='blend.csv', index=False)

In [None]:
# Display the submission frame with the blended target column.
submission