In [None]:
# Bootstrap a RAPIDS environment from a pre-packaged archive shipped as an input dataset.
import sys
# Copy the archive next to the other conda environments and unpack it there
# (tar's verbose file listing is discarded).
!cp ../input/rapids/rapids.0.16.0 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# Prepend the unpacked environment's directories to sys.path so that imports
# resolve to its packages before the kernel's own ones.
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
# Copy the environment's libxgboost.so into the base conda lib directory.
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

# Necessary imports

In [None]:
import gc
import numpy as np
import pandas as pd
import xgboost as xgb
from math import sqrt
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import KFold, GroupKFold
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, precision_score, recall_score
from cudf.core.dataframe import DataFrame as cu_df
from cudf.core.series import Series as cu_series

# Load train and test data

In [None]:
# Read the prepared train and test CSV files into pandas DataFrames.
train = pd.read_csv('../input/ion-switch-model-ready-data-frame-to-work-locally/train_ion_switch.csv')
test  = pd.read_csv('../input/ion-switch-model-ready-data-frame-to-work-locally/test_ion_switch.csv')

# Memory Reduction
Without this step the notebook crashes because of excessive memory usage.

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Each column whose dtype is one of int16/32/64 or float16/32/64 is cast to
    the smallest integer (int8..int64) or float (float16..float64) type whose
    representable range contains the column's minimum and maximum. Range
    checks are inclusive, so a value sitting exactly on a type's limit
    (e.g. 127 for int8) still selects that type.

    Note: the float path only checks the value *range*; casting to float16 or
    float32 can lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the resulting memory usage and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Candidates are ordered narrowest first; take the first that fits.
            # If none fits (e.g. min/max are NaN for an empty column) the
            # column is left untouched.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max are NaN/inf.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Shrink both frames (train first, then test) before any further processing.
train, test = [reduce_mem_usage(frame) for frame in (train, test)]

In [None]:
# Pull the target column out of the feature frame.
y = train.loc[:, 'open_channels']
train = train.drop(columns=['open_channels'])

In [None]:
# Sorted distinct target values; their count is the number of classes.
classes = np.unique(y)
print(classes.size)

# GroupKFold Cross-Validation

In [None]:
# Keep the `time` column aside and remove it from the feature frames.
id_train = train['time']
id_test  = test['time']

train = train.drop('time', axis = 1)
test  = test.drop( 'time', axis = 1)

# Number of cross-validation folds; used to configure the splitter below.
nfolds = 5
# One group label per training row, taken from the `signal` column: GroupKFold
# never places rows of the same group in both the training and validation part.
groups = np.array(train.signal.values)
folds = GroupKFold(n_splits=nfolds)

In [None]:
# XGBoost booster parameters for the multi-class model trained on the GPU.
param = {
    # Tree construction
    'min_child_weight': 7,
    'colsample_bytree': 0.7,
    'max_depth': 10,
    'eta': 0.2,
    'subsample': 0.6,
    'lambda': 2,
    'nthread': -1,
    'booster': 'gbtree',
    # `silent` is deprecated in XGBoost; `verbosity: 0` is its replacement.
    'verbosity': 0,
    'gamma': 0,
    'alpha': 1,
    # Objective / evaluation: per-class probabilities, scored with multi-class log loss.
    'eval_metric': 'mlogloss',
    'objective': 'multi:softprob',
    'tree_method': 'gpu_hist',
    'num_class': 11,
}

# Train the model

In [None]:
%%time
# mvalid: out-of-fold class probabilities for every training row.
# mfull:  class probabilities for the test rows, averaged over the folds.
mvalid = np.zeros([len(train), len(classes)])
mfull  = np.zeros([len(test), len(classes)])

print(np.shape(mvalid), np.shape(mfull))

num_round = 1000

# The test frame is the same for every fold, so move it to the GPU only once.
test_cudf = cu_df(test)

# GroupKFold only needs the number of samples and the group labels to split.
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, y, groups)):
    print('----')
    print("fold n°{}".format(fold_))

    # trn_idx / val_idx are positional, so index positionally on both frames.
    x0, y0 = train.iloc[trn_idx], y.iloc[trn_idx]
    x1, y1 = train.iloc[val_idx], y.iloc[val_idx]

    trn_data = xgb.DMatrix(cu_df(x0), label=cu_series(y0))
    val_data = xgb.DMatrix(cu_df(x1), label=cu_series(y1))

    clf = xgb.train(param, trn_data, num_round, evals=[(val_data, "val_data")],
                    verbose_eval=200, early_stopping_rounds=100)

    # `best_iteration` is zero-based; `best_ntree_limit` is the matching tree
    # count and is what `ntree_limit` expects (a limit of 0 would mean "all trees").
    mvalid[val_idx] = clf.predict(val_data, ntree_limit=clf.best_ntree_limit)
    # Arg-max over the whole matrix: rows of folds not yet processed are still
    # all-zero and therefore map to class 0 in this intermediate dump.
    mvalid_rounded = np.argmax(mvalid, axis=1)

    pd.DataFrame(mvalid_rounded).to_csv('xgb_val_preds_fold' + str(fold_) + '.csv', index=False)

    # Accumulate the fold's share of the averaged test probabilities.
    mfull += clf.predict(xgb.DMatrix(test_cudf), ntree_limit=clf.best_ntree_limit) / folds.n_splits
    mfull_rounded = np.argmax(mfull, axis=1)

    # Running (partial) average after this fold, not the fold's own prediction.
    pd.DataFrame(mfull).to_csv('xgb_preds_fold' + str(fold_) + '.csv', index=False)

# scikit-learn metrics take (y_true, y_pred) in that order; swapping them
# exchanges macro recall and macro precision.
print("Recall Score: " + str(recall_score(y, mvalid_rounded, average='macro')))
print("Accuracy Score: " + str(accuracy_score(y, mvalid_rounded)))
print("Precision Score: " + str(precision_score(y, mvalid_rounded, average='macro')))

# File submission

In [None]:
# The sample submission supplies the `time` values for the output rows.
sub = pd.read_csv("../input/liverpool-ion-switching/sample_submission.csv")

submission = pd.DataFrame()
submission['time']  = sub['time']
# mfull_rounded already holds the arg-max class index of the averaged class
# probabilities, so it is an integer label and needs no rounding.
submission['open_channels'] = np.asarray(mfull_rounded).astype(int)
# float_format only applies to float columns, i.e. `time`; the labels stay integers.
submission.to_csv('submission.csv', float_format='%0.4f', index = False)

In [None]:
# Display the last rows of the submission frame as a quick sanity check.
submission.tail()