In [None]:
import gc
import numpy as np
import pandas as pd
import xgboost as xgb
from math import sqrt
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error,f1_score, accuracy_score

In [None]:
# Load the test signal and tag every row with a group id (0-4).
# Rows that fall in none of the ranges below keep the sentinel value -1.
test_clean = pd.read_csv("../input/data-without-drift/test_clean.csv")
test_clean['group'] = -1
# One list of half-open [start, stop) positional row ranges per group id.
test_group_ranges = [
    [(0, 100000), (300000, 400000), (800000, 900000), (1000000, 2000000)],
    [(400000, 500000)],
    [(100000, 200000), (900000, 1000000)],
    [(200000, 300000), (600000, 700000)],
    [(500000, 600000), (700000, 800000)],
]
# Resolve the column position by name instead of hard-coding it, so the
# assignment always lands on 'group' whatever columns the CSV contains.
group_col = test_clean.columns.get_loc('group')
for group_id, row_ranges in enumerate(test_group_ranges):
    for start, stop in row_ranges:
        test_clean.iloc[start:stop, group_col] = group_id

# Same tagging for the training signal: one [start, stop) range per group id.
# Positions 500000-999999 are covered by no range and therefore stay at -1.
train_clean = pd.read_csv("../input/data-without-drift/train_clean.csv")
train_clean['group'] = -1
train_group_ranges = [
    (0, 500000),
    (1000000, 1500000),
    (1500000, 2000000),
    (2500000, 3000000),
    (2000000, 2500000),
]
group_col = train_clean.columns.get_loc('group')
for group_id, (start, stop) in enumerate(train_group_ranges):
    train_clean.iloc[start:stop, group_col] = group_id

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their value range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; every other column (strings, int8, unsigned, ...) is left
    untouched. The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are overwritten in place.
    verbose : bool, default True
        When true, print the resulting memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    The range checks are inclusive, so a column spanning exactly the limits of
    a type (e.g. -128..127) is stored in that type. Only the value *range* is
    checked for floats: a column cast to float16 keeps its range but loses
    precision.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Take the first (narrowest) integer type that holds [c_min, c_max].
            # If no comparison succeeds the column is left as it is.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (this includes the case where the
                # comparisons fail because min/max are NaN or infinite).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero starting size does not produce a NaN report.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
window_sizes = [10, 25, 50, 100, 500, 1000, 5000, 10000, 25000]


def add_rolling_features(df, windows):
    """Add trailing rolling-window statistics of ``df['signal']`` and clean the result.

    For every window size ``w`` the following columns are added, in this order:
    ``rolling_mean_w``, ``rolling_var_w``, ``rolling_min_w``, ``rolling_max_w``,
    ``rolling_min_max_ratio_w`` (min / max), ``rolling_min_max_diff_w``
    (max - min) and ``norm_w`` (the signal min-max scaled inside the window,
    multiplied by ``floor(max) - ceil(min)``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``signal`` column. The new columns are added to it in place.
    windows : iterable of int
        Rolling window lengths, in rows.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every +/-inf and every NaN, in all columns, has
        been replaced by 0. This covers the first ``w - 1`` rows of each
        rolling column and the divisions by zero in the ratio / norm columns.
    """
    # NOTE(review): the windows run over the whole frame, so they span the
    # boundaries between the groups assigned at load time - confirm intended.
    signal = df['signal']
    for window in windows:
        suffix = str(window)
        # Build the rolling object once and reuse it for every aggregate.
        roll = signal.rolling(window=window)
        roll_min = roll.min()
        roll_max = roll.max()

        df["rolling_mean_" + suffix] = roll.mean()
        df["rolling_var_" + suffix] = roll.var()
        df["rolling_min_" + suffix] = roll_min
        df["rolling_max_" + suffix] = roll_max

        df["rolling_min_max_ratio_" + suffix] = roll_min / roll_max
        df["rolling_min_max_diff_" + suffix] = roll_max - roll_min

        scaled = (signal - roll_min) / (roll_max - roll_min)
        df["norm_" + suffix] = scaled * (np.floor(roll_max) - np.ceil(roll_min))

    return df.replace([np.inf, -np.inf], np.nan).fillna(0)


train_clean = add_rolling_features(train_clean, window_sizes)
test_clean = add_rolling_features(test_clean, window_sizes)



In [None]:
# Per-group summary statistics of the signal, broadcast back onto every row,
# followed by the running total of the signal and its share of the grand total.
#
# `transform` returns a result aligned with the frame's own row index. A plain
# `groupby(...).median()` is indexed by group id instead, so assigning it to a
# column only fills the rows whose index label equals a group id and leaves
# NaN in every other row.
for frame in (train_clean, test_clean):
    signal_by_group = frame.groupby('group')['signal']
    for stat in ('median', 'mean', 'min', 'max'):
        frame['signal_' + stat] = signal_by_group.transform(stat)

    frame['cum_sum_signal'] = frame['signal'].cumsum()
    frame['cum_perc_signal'] = 100 * frame['cum_sum_signal'] / frame['signal'].sum()

In [None]:
# Downcast the numeric columns of both frames (train first, then test);
# each call prints its own memory report.
train_clean, test_clean = (reduce_mem_usage(frame) for frame in (train_clean, test_clean))

In [None]:
# Preview the first five rows of the engineered training frame.
train_clean.head(n=5)

In [None]:
# Preview the first five rows of the engineered test frame.
test_clean.head(n=5)

In [None]:
# Separate the target column from the training features; the test frame is used as-is.
train = train_clean.drop(columns=['open_channels'])
y = train_clean['open_channels']
test = test_clean

In [None]:
# Remove the names that are no longer needed, then trigger a garbage-collection pass.
del train_clean, test_clean
gc.collect()

In [None]:
# Keep the time stamps aside, then remove them from the feature matrices.
id_train = train['time']
id_test  = test['time']

train = train.drop(columns='time')
test  = test.drop(columns='time')

# `nfolds` is the single source of truth for the number of CV folds: the
# splitter is built from it so the two values cannot drift apart.
nfolds = 10
folds = KFold(n_splits=nfolds, shuffle=True, random_state=4590)

In [None]:
# LightGBM training parameters (L2 regression scored with RMSE).
# The former "min_child_samples": 24 entry was dropped: it is an alias of
# `min_data_in_leaf`, and when both are given LightGBM keeps the primary name
# and ignores the alias, so the entry had no effect and only misled readers.
param = {'num_leaves': 129,
         'min_data_in_leaf': 148,
         'objective': 'regression',
         'max_depth': 9,
         'learning_rate': 0.005,
         "boosting": "gbdt",
         "feature_fraction": 0.7202,
         "bagging_freq": 1,
         "bagging_fraction": 0.8125,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.3468,
         "verbosity": -1}

In [None]:
# Cross-validated LightGBM training.
#   feature_importance_df : (n_features, n_folds) importance of each feature per fold
#   mvalid                : out-of-fold prediction for every training row
#   mfull                 : test prediction averaged over the folds
# The importance buffer is sized from the splitter itself so it always matches
# the number of folds actually iterated below.
feature_importance_df = np.zeros((train.shape[1], folds.n_splits))
mvalid = np.zeros(len(train))
mfull  = np.zeros(len(test))

num_round = 2500
# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds` / `verbose_eval` keyword arguments of `lgb.train`
# are not accepted by recent LightGBM releases. `log_evaluation` is the
# current name of the logging callback; fall back to `print_evaluation` on
# releases that predate it.
_log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

# KFold only needs the number of rows, so pass the frame itself rather than
# materialising `train.values` twice.
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train)):
    print('----')
    print("fold n°{}".format(fold_))

    # The fold indices are positions, so index both features and target with .iloc.
    x0, y0 = train.iloc[trn_idx], y.iloc[trn_idx]
    x1, y1 = train.iloc[val_idx], y.iloc[val_idx]

    trn_data = lgb.Dataset(x0, label=y0)
    val_data = lgb.Dataset(x1, label=y1)

    clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data],
                    callbacks=[lgb.early_stopping(stopping_rounds=250),
                               _log_evaluation(period=100)])
    mvalid[val_idx] = clf.predict(x1, num_iteration=clf.best_iteration)

    feature_importance_df[:, fold_] = clf.feature_importance()

    mfull += clf.predict(test, num_iteration=clf.best_iteration) / folds.n_splits

# Out-of-fold RMSE (arguments in the documented y_true, y_pred order).
np.sqrt(mean_squared_error(y, mvalid))

In [None]:
# Average each feature's importance over the folds and plot them, largest first.
ximp = pd.DataFrame({
    'feature': train.columns,
    'importance': feature_importance_df.mean(axis=1),
})
ximp_sorted = ximp.sort_values(by="importance", ascending=False)

plt.figure(figsize=(14,14))
sns.barplot(x="importance", y="feature", data=ximp_sorted)
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()

In [None]:
# Build the submission: time stamps come from the sample file, predictions are
# the fold-averaged test outputs rounded to the nearest integer.
sub = pd.read_csv("../input/liverpool-ion-switching/sample_submission.csv")

submission = (
    pd.DataFrame({'time': sub['time']})
    .assign(open_channels=mfull)
    .assign(open_channels=lambda frame: frame['open_channels'].round(decimals=0).astype(int))
)
# float_format controls how floating-point values are written to the file.
submission.to_csv('submission.csv', float_format='%0.4f', index=False)

In [None]:
# Show the last five rows of the submission as a sanity check.
submission.tail(n=5)