# Importing Modules

In [None]:
import gc
import numpy as np
import pandas as pd
from math import sqrt
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold, GroupKFold
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, precision_score, recall_score

# Importing Data and Creating Bins

In [None]:
# Load the drift-corrected test signal and tag rows with a group id.
test_clean = pd.read_csv("../input/data-without-drift/test_clean.csv")
test_clean['group'] = -1  # rows not covered by any range below keep -1

# test_group_ranges[k] lists the half-open [start, stop) row ranges assigned to group k.
test_group_ranges = [
    [(0, 100000), (300000, 400000), (800000, 900000), (1000000, 2000000)],
    [(400000, 500000)],
    [(100000, 200000), (900000, 1000000)],
    [(200000, 300000), (600000, 700000)],
    [(500000, 600000), (700000, 800000)],
]
# Resolve the column position by name instead of hard-coding it, so the
# assignment cannot silently land in another column if the CSV layout changes.
group_col = test_clean.columns.get_loc('group')
for group_id, row_ranges in enumerate(test_group_ranges):
    for start, stop in row_ranges:
        test_clean.iloc[start:stop, group_col] = group_id

# Load the drift-corrected training signal and tag rows with a group id.
train_clean = pd.read_csv("../input/data-without-drift/train_clean.csv")
train_clean['group'] = -1  # rows outside every range below keep -1

# train_group_ranges[k] is the half-open [start, stop) row range assigned to group k.
train_group_ranges = [(0, 500000), (1000000, 1500000), (1500000, 2000000),
                      (2500000, 3000000), (2000000, 2500000)]
# Resolve the column position by name instead of hard-coding it, so the
# assignment cannot silently land in another column if the CSV layout changes.
group_col = train_clean.columns.get_loc('group')
for group_id, (start, stop) in enumerate(train_group_ranges):
    train_clean.iloc[start:stop, group_col] = group_id

# Reducing Memory
Otherwise the notebook will crash because it runs out of memory.

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to the narrowest fitting dtype.

    Integer columns are narrowed to int8/int16/int32/int64 and float columns to
    float16/float32/float64, chosen from each column's minimum and maximum.
    Columns whose dtype is not one of the listed numeric dtypes are left alone.

    Note: float16 carries only about three significant decimal digits, so float
    columns that fit its range lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are overwritten in place.
    verbose : bool, default True
        When true, print the final memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column whose extremes sit exactly on a dtype's
            # limits (e.g. 127 or -128) still fits that dtype.
            for target in int_targets:
                limits = np.iinfo(target)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Feature Engineering

In [None]:
# Window lengths (in samples) used for the rolling statistics.
window_sizes = [10, 25, 50, 100, 500, 1000, 5000, 10000, 25000]

for w in window_sizes:
    # One rolling view per window length, shared by all four aggregations.
    signal_window = train_clean['signal'].rolling(window=w)
    train_clean[f"rolling_mean_{w}"] = signal_window.mean()
    # Rolling std is not computed; only the variance is kept.
    train_clean[f"rolling_var_{w}"] = signal_window.var()
    train_clean[f"rolling_min_{w}"] = signal_window.min()
    train_clean[f"rolling_max_{w}"] = signal_window.max()

    lo = train_clean[f"rolling_min_{w}"]
    hi = train_clean[f"rolling_max_{w}"]
    train_clean[f"rolling_min_max_ratio_{w}"] = lo / hi
    train_clean[f"rolling_min_max_diff_{w}"] = hi - lo

    # Position of the sample inside the window's [min, max] span, rescaled by
    # the integer span floor(max) - ceil(min).
    rel_pos = (train_clean['signal'] - lo) / (hi - lo)
    train_clean[f"norm_{w}"] = rel_pos * (np.floor(hi) - np.ceil(lo))

# Infinite values (from divisions by zero) and every NaN in the frame become 0.
train_clean = train_clean.replace([np.inf, -np.inf], np.nan).fillna(0)

In [None]:
# Same rolling features as for the training frame, computed on the test signal.
for w in window_sizes:
    # One rolling view per window length, shared by all four aggregations.
    signal_window = test_clean['signal'].rolling(window=w)
    test_clean[f"rolling_mean_{w}"] = signal_window.mean()
    # Rolling std is not computed; only the variance is kept.
    test_clean[f"rolling_var_{w}"] = signal_window.var()
    test_clean[f"rolling_min_{w}"] = signal_window.min()
    test_clean[f"rolling_max_{w}"] = signal_window.max()

    lo = test_clean[f"rolling_min_{w}"]
    hi = test_clean[f"rolling_max_{w}"]
    test_clean[f"rolling_min_max_ratio_{w}"] = lo / hi
    test_clean[f"rolling_min_max_diff_{w}"] = hi - lo

    # Position of the sample inside the window's [min, max] span, rescaled by
    # the integer span floor(max) - ceil(min).
    rel_pos = (test_clean['signal'] - lo) / (hi - lo)
    test_clean[f"norm_{w}"] = rel_pos * (np.floor(hi) - np.ceil(lo))

# Infinite values (from divisions by zero) and every NaN in the frame become 0.
test_clean = test_clean.replace([np.inf, -np.inf], np.nan).fillna(0)


In [None]:
# Per-group signal statistics (median/mean/min/max by 'group') are disabled.
# NOTE(review): a groupby aggregate is indexed by group, so assigning it
# directly to a column would align on the row index; use .transform() if
# these features are ever re-enabled.

# Running total of the signal and its share (in percent) of the overall sum.
signal_total = train_clean['signal'].sum()
train_clean['cum_sum_signal'] = train_clean['signal'].cumsum()
train_clean['cum_perc_signal'] = 100 * train_clean['cum_sum_signal'] / signal_total

In [None]:
# Per-group signal statistics (median/mean/min/max by 'group') are disabled.
# NOTE(review): a groupby aggregate is indexed by group, so assigning it
# directly to a column would align on the row index; use .transform() if
# these features are ever re-enabled.

# Running total of the signal and its share (in percent) of the overall sum.
signal_total = test_clean['signal'].sum()
test_clean['cum_sum_signal'] = test_clean['signal'].cumsum()
test_clean['cum_perc_signal'] = 100 * test_clean['cum_sum_signal'] / signal_total

In [None]:
# Downcast the numeric columns of both frames; each call prints its saving.
train_clean = reduce_mem_usage(train_clean, verbose=True)
test_clean = reduce_mem_usage(test_clean, verbose=True)

In [None]:
# Preview the first rows of the training frame after feature engineering.
train_clean.head()

In [None]:
# Preview the first rows of the test frame after feature engineering.
test_clean.head()

In [None]:
# Separate the target from the training features; the test frame is used as-is.
y = train_clean['open_channels']
train = train_clean.drop(columns=['open_channels'])
test = test_clean  # same object as test_clean, not a copy
train.head()

In [None]:
# Distinct target values present in the training labels, and how many there are.
classes = np.unique(y)
print(classes.size)

In [None]:
# Preview the first rows of the test feature frame.
test.head()

# Collecting Garbage

In [None]:
# Drop the *_clean names, then run a garbage-collection pass.
# `test` refers to the same frame as `test_clean`, so that frame stays alive;
# only the name is removed.
del train_clean, test_clean
gc.collect()

# GroupKFold as the Cross-Validation Strategy

# Training and Testing Data

In [None]:
# Keep the timestamps aside and remove them from the feature matrices.
id_train = train['time']
id_test  = test['time']

train = train.drop(columns='time')
test  = test.drop(columns='time')

nfolds = 5
# NOTE(review): folds are grouped by the raw signal value, not by the 'group'
# column built when the data was loaded -- confirm this is intended.
groups = np.array(train.signal.values)
# Build the splitter from nfolds so it always agrees with arrays sized by nfolds.
folds = GroupKFold(n_splits=nfolds)

In [None]:
# LightGBM settings for the 11-class model.
# 'min_child_samples' was removed: it is an alias of 'min_data_in_leaf', and
# setting both with different values (148 vs 24) is ambiguous. LightGBM keeps
# the primary name's value, so 148 stays in effect.
param = {
    'num_leaves': 129,          # a depth-7 tree holds at most 2**7 = 128 leaves
    'min_data_in_leaf': 148,
    'objective': 'multiclass',
    'max_depth': 7,
    'learning_rate': 0.00987173774816051,
    'feature_fraction': 0.7202,
    'bagging_freq': 1,
    'bagging_fraction': 0.8125,
    'bagging_seed': 11,
    'metric': 'multi_logloss',
    'lambda_l1': 0.3468,
    'verbosity': -1,
    'num_class': 11,
}

In [None]:
def get_class_weight(classes, exp=1):
    '''
    Weight of the class is inversely proportional to the population of the class.
    There is an exponent for adding more weight.

    classes : array-like of integer labels; labels 0..10 are counted.
    exp     : exponent applied to each class count before it is inverted.

    Returns an array of 11 floats holding ``total / count**exp`` per class.
    A class with no samples gets weight 0 instead of an infinite weight.
    '''
    hist, _ = np.histogram(classes, bins=np.arange(12)-0.5)

    # Only divide where the class is present; empty classes keep weight 0.
    class_weight = np.zeros(hist.shape, dtype=float)
    present = hist > 0
    class_weight[present] = hist.sum() / np.power(hist[present].astype(float), exp)

    return class_weight

# Train the model

In [None]:
%%time
# One column of feature importances (LightGBM default: split counts) per fold.
feature_importance_df = np.zeros((train.shape[1], nfolds))
# Fold-averaged class probabilities for every test row.
preds = np.zeros((len(test), param['num_class']))

f1s = []
accuracies = []
precisions = []
recalls = []

# GroupKFold only needs the number of rows and the groups, so the frames are
# passed directly instead of materialising train.values twice.
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, y, groups)):
    print('----')
    print("fold n°{}".format(fold_))

    # The split yields positions, so index positionally on both objects.
    x0, y0 = train.iloc[trn_idx], y.iloc[trn_idx]
    x1, y1 = train.iloc[val_idx], y.iloc[val_idx]

    # Class weights come from the training fold and are looked up per sample.
    class_weight = get_class_weight(y0)

    trn_data = lgb.Dataset(x0, label=y0, weight=class_weight[y0.to_numpy()])
    val_data = lgb.Dataset(x1, label=y1, weight=class_weight[y1.to_numpy()])

    num_round = 500
    # NOTE(review): verbose_eval / early_stopping_rounds were removed from
    # lgb.train in LightGBM 4.0 in favour of callbacks; switch to
    # lgb.log_evaluation / lgb.early_stopping when upgrading.
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data],
                    verbose_eval=100, early_stopping_rounds = 100)
    feature_importance_df[:, fold_] = clf.feature_importance()

    gbc_pred = clf.predict(x1, num_iteration=clf.best_iteration)
    val_pred = np.argmax(gbc_pred, axis=1)  # predicted class per validation row

    f1 = f1_score(y1, val_pred, average='macro')
    precision = precision_score(y1, val_pred, average='macro')
    recall = recall_score(y1, val_pred, average='macro')
    accuracy = accuracy_score(y1, val_pred)

    print("F1 Score for LGBM: ", str(f1))
    print("Precision Score for LGBM: ", str(precision))
    print("Recall Score for LGBM: ", str(recall))
    print("Accuracy Score for LGBM: ", str(accuracy))

    f1s.append(f1)
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)

    preds += clf.predict(test, num_iteration=clf.best_iteration) / folds.n_splits

# Final class per test row, taken once from the fully averaged probabilities.
preds_rounded = np.argmax(preds, axis=1)

# Report the number of folds actually run (the loop index is one less).
n_folds_run = len(f1s)
print("F1 Scores over {} folds: {}".format(n_folds_run, f1s))
print("Accuracy Scores over {} folds: {}".format(n_folds_run, accuracies))
print("Precision Scores over {} folds: {}".format(n_folds_run, precisions))
print("Recall Scores over {} folds: {}".format(n_folds_run, recalls))

In [None]:
# Sample submission file; its 'time' column is reused when the output is built.
sub = pd.read_csv(filepath_or_buffer="../input/liverpool-ion-switching/sample_submission.csv")

# Prepare file for the submission
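The model is a multiclass classifier, so the predicted 'open_channels' value for each row is the class with the highest fold-averaged probability (arg-max). To make sure the column matches the submission format, I did:
1. The rounding of the predicted values (a safeguard; the arg-max output is already made of whole numbers).
2. Converting the datatype of 'open_channels' to int.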

In [None]:
# Timestamps come from the sample submission; labels come from the model.
submission = pd.DataFrame({'time': sub['time']})
submission['open_channels'] = preds_rounded
# preds_rounded already holds arg-max class labels, so rounding and the int
# cast only guarantee an integer column in the output.
submission['open_channels'] = submission['open_channels'].round(decimals=0).astype(int)
# float_format applies to float columns only (presumably just 'time' here),
# which are written with four decimals.
submission.to_csv('submission.csv', index=False, float_format='%.4f')

In [None]:
# Show the last rows of the submission frame as a final check.
submission.tail()