# 📖1. Dask Introduction

Dask is composed of two parts:

Dynamic task scheduling optimized for computation. This is similar to Airflow, Luigi, Celery, or Make, but optimized for interactive computational workloads.

“Big Data” collections like parallel arrays, dataframes, and lists that extend common interfaces like NumPy, Pandas, or Python iterators to larger-than-memory or distributed environments. These parallel collections run on top of dynamic task schedulers.

Tips and tricks for handling bigger datasets.

![dask](https://miro.medium.com/max/744/0*IZmDXucl3oksi6oF.png)

[10min to dask](https://docs.dask.org/en/latest/10-minutes-to-dask.html)

# 🔨 Loading Operation


>Let's check how long it takes to load the data into a pandas DataFrame

In [None]:
import dask.dataframe as dd
import pandas as pd
import numpy as np

# Relative paths of the train and test CSV files (Tabular Playground Series, Feb 2022 input folder).
TRAIN_DATA = '../input/tabular-playground-series-feb-2022/train.csv'
TEST_DATA = '../input/tabular-playground-series-feb-2022/test.csv'


In [None]:
%%timeit
df_pandas = pd.read_csv(TRAIN_DATA)
# Almost 16 seconds. pandas works eagerly: this call parses the whole CSV into memory.
# Next we have to test the lazy mode offered by Dask.

In [None]:
# Load the training data with pandas again (names assigned inside a %%timeit cell are not kept)
# and preview the first rows.
df_pandas = pd.read_csv(TRAIN_DATA)
df_pandas.head()

<h4>Let's check how long the same load takes with Dask</h4>

In [None]:
%%timeit
df_dask = dd.read_csv(TRAIN_DATA,blocksize=50e6) # 50 MB blocks
# 36 ms, more than 400 times faster!!
# NOTE: dd.read_csv is lazy - this timing covers setting up the work, not parsing the full file,
# so it is not directly comparable with the eager pandas timing.

In [None]:
# Create the lazy Dask DataFrame used by the following cells and preview the first rows.
df_dask = dd.read_csv(TRAIN_DATA,blocksize=50e6) # 50 MB blocks
df_dask.head()

In [None]:
# If you want to switch from lazy (Dask) to eager (pandas) mode:

#dask2pandas = df_dask.compute()

# I recommend calling compute() only at the final step: do the querying with Dask and compute only the results

In [None]:
%%timeit
# You can use Dask only for loading the data and converting it to pandas. It is still faster than reading with pandas!!
df_dask = dd.read_csv(TRAIN_DATA,blocksize=50e6) # 50 MB blocks
dask2pandas = df_dask.compute()

# pandas .read_csv takes 16 seconds
# Dask .read_csv and compute together take 11.4 seconds, which is around 30% faster
# You can play with the blocksize and share in the comments which blocksize is best for the Kaggle environment

# ⚙ Data Handling - Examples


In [None]:
# Dask code looks almost the same as pandas code.
# Use compute() wisely: it is where the work actually runs and it takes most of the time of a Dask script.
missing_data = df_dask['target'].value_counts().compute()

# Name both columns explicitly ('index' = class label, 'target' = number of rows).
# A bare reset_index() yields 'index'/'target' on pandas < 2.0 but 'target'/'count' on
# pandas >= 2.0, which would break the subtraction below and the plot that reads these columns.
missing_data = missing_data.rename_axis('index').reset_index(name='target')
# Rows each class is short of the most frequent class ("missing" rows).
missing_data['target'] = missing_data['target'].max() - missing_data['target']


In [None]:
import plotly.express as px

# Bar chart of missing_data: x is the 'index' column, y is the 'target' column
# (the difference between the largest count and each count).
fig = px.bar(missing_data, x='index', y='target',title="Missing data count")
fig.show()

In [None]:
%%timeit
# pandas baseline: value_counts() on the in-memory DataFrame returns the counts right away.
missing_data_pandas = df_pandas['target'].value_counts()
#print(missing_data_pandas)

In [None]:
%%timeit

# Let's check how the same example operation behaves in Dask:
missing_data_dask = df_dask['target'].value_counts()

# WOW around 30 times faster than pandas
# NOTE: nothing is counted yet - without .compute() this only records the operation to run later.

In [None]:
# Why are we using .compute()?
missing_data_dask = df_dask['target'].value_counts()

# The returned object is a Dask Series structure. We can chain multiple operations on it, but it only stores the "logic". When we want the actual values we have to call compute()
missing_data_dask

In [None]:
%%timeit
missing_data_dask.compute()
# compute() "generates" the data according to the logic we created.

# 🔭 Code Profiling 

>%%timeit is just a simple time measurement using Jupyter's magic functions.<br>
I will show you how to profile the code to analyse the ETL pipeline. This is useful for big datasets, especially those exceeding the RAM capacity.<br>
Our Kaggle dataset fits in RAM, but I will show techniques for handling very big datasets.

![profiling](https://static1.smartbear.co/smartbearbrand/files/13/139b726c-f193-4785-b1be-abd4be7980d7.png)

In [None]:
# We have to create some profiling tools

import time
import os
import psutil
import inspect


def elapsed_since(start):
    """Return the time elapsed since ``start`` as a short, human-readable string.

    ``start`` is a ``time.time()`` timestamp. The unit is chosen by magnitude:
    "ms" below one second, "s" below one minute, "min" below one hour and
    "hrs" otherwise. The value is rounded to two decimals.
    """
    seconds = time.time() - start
    if seconds < 1:
        value, unit = seconds * 1000, "ms"
    elif seconds < 60:
        value, unit = seconds, "s"
    elif seconds < 3600:
        value, unit = seconds / 60, "min"
    else:
        value, unit = seconds / 3600, "hrs"
    return str(round(value, 2)) + unit


def get_process_memory():
    """Return the ``(rss, vms, shared)`` memory figures of the current process.

    The values come from ``psutil.Process(os.getpid()).memory_info()``.
    psutil only reports the ``shared`` field on Linux, so on other platforms
    the third element falls back to ``0`` instead of raising
    ``AttributeError``.
    """
    mem = psutil.Process(os.getpid()).memory_info()
    return mem.rss, mem.vms, getattr(mem, "shared", 0)


def format_bytes(bytes):
    """Format a byte count (possibly negative) with a decimal unit suffix.

    Magnitudes below 1000 are printed unchanged with "B"; larger ones are
    divided by 1e3 / 1e6 / 1e9, rounded to two decimals and suffixed with
    "kB" / "MB" / "GB".
    """
    magnitude = abs(bytes)
    if magnitude < 1000:
        return str(bytes) + "B"
    # (exclusive upper bound, divisor, suffix) for the intermediate units
    for upper_bound, divisor, suffix in ((1e6, 1e3, "kB"), (1e9, 1e6, "MB")):
        if magnitude < upper_bound:
            return str(round(bytes / divisor, 2)) + suffix
    return str(round(bytes / 1e9, 2)) + "GB"


def profile(func, *args, **kwargs):
    """Report the memory and time consumed by calls to ``func``.

    Used as a decorator (``@profile``) on a function or any other callable,
    it returns a wrapper that runs the callable, prints one line with the
    change in RSS / VMS / shared memory and the elapsed time, and returns the
    callable's result unchanged.

    When ``func`` is a bound method it is profiled immediately:
    ``profile(obj.method, *args, **kwargs)`` calls the method with the given
    arguments and returns its result.

    Raises:
        TypeError: if ``func`` is not callable.
    """
    import functools  # local import keeps this helper self-contained

    if not callable(func):
        raise TypeError("profile() expects a callable, got {!r}".format(func))

    # Not every callable has __name__ (e.g. functools.partial objects).
    label = "<" + getattr(func, "__name__", type(func).__name__) + ">"

    @functools.wraps(func)  # keep the wrapped callable's name and docstring
    def wrapper(*call_args, **call_kwargs):
        rss_before, vms_before, shared_before = get_process_memory()
        start = time.time()
        result = func(*call_args, **call_kwargs)
        # Take the time first so the memory query is not included in it.
        elapsed_time = elapsed_since(start)
        rss_after, vms_after, shared_after = get_process_memory()
        print("Profiling: {:>20}  RSS: {:>8} | VMS: {:>8} | SHR {:>8} | time: {:>8}"
              .format(label,
                      format_bytes(rss_after - rss_before),
                      format_bytes(vms_after - vms_before),
                      format_bytes(shared_after - shared_before),
                      elapsed_time))
        return result

    if inspect.ismethod(func):
        return wrapper(*args, **kwargs)
    # Plain functions and every other callable (builtins, partials, callable
    # objects) get the wrapper back; previously the latter silently got None.
    return wrapper

In [None]:
@profile
def dask_loader(src):
    """Create a Dask DataFrame for the CSV at ``src`` (no ``compute()`` here)."""
    return dd.read_csv(src, blocksize=50e6)  # 50 MB blocks

@profile
def pandas_loader(src):
    """Read the CSV at ``src`` into a pandas DataFrame."""
    return pd.read_csv(src)

@profile
def dask_loader_compute(src):
    """Load ``src`` via ``dask_loader`` and return the result of ``compute()``."""
    lazy_frame = dask_loader(src)
    return lazy_frame.compute()

# The time will be slightly longer because the profiling is attached to each call.
# Run both loaders on the training file; the returned DataFrames are discarded.
    
pandas_loader(TRAIN_DATA)
_ = dask_loader_compute(TRAIN_DATA)

As we can see, the Dask loader consumes very little RAM before compute is called. We can perform multiple ETL steps before we finally generate the data.
I will also show how to store the results in a specific format to allow fast readout.

In [None]:
#!pip install dask_ml
#!pip install scikit-learn==0.23.1

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
#from dask_ml import preprocessing
#from dask_ml.model_selection import train_test_split



In [None]:
# Load both splits through Dask and materialize them as pandas DataFrames.
# Dask numbers the rows of every block starting from 0, so the computed frame
# carries duplicate index labels (and dropna leaves gaps). Reset to a unique
# 0..n-1 index so any later label-based alignment is unambiguous.
train_df = (
    dd.read_csv(TRAIN_DATA, blocksize=50e6)
    .dropna(how='any')
    .compute()
    .reset_index(drop=True)
)
test_df = dd.read_csv(TEST_DATA, blocksize=50e6).compute().reset_index(drop=True)

In [None]:
# Tip: do the preprocessing of the whole data in Dask and only compute at the last step.
# Encode the class labels as integers, then separate the target from the features.
target_encoder = LabelEncoder()
train_df["target"] = target_encoder.fit_transform(train_df["target"])

y = train_df["target"]
X = train_df.drop(columns=["target"])

# 🚅 Training Ensembled Meta Classifier

The ensemble technique works best when the base models are not correlated. There are 3 basic ensembling techniques:

**Max Voting**<br>
The prediction from each model is a vote. In max voting the final prediction is the class with the most votes.

* classifier 1 – class A
* classifier 2 – class B
* classifier 3 – class B
* Output:        class B

**Averaging**<br>
The final output is the average of all predictions (regression problems).

* regressor 1 – 200
* regressor 2 – 300 
* regressor 3 – 400
* Output:    300

**Weighted Averaging**<br>
The base model with higher predictive power is more important.


## ⚙ Training Configuration

In [None]:
# Training routine
# SEED is the random_state of the LightGBM model configured below.
SEED = 1992
# Keyword arguments for LGBMClassifier: multiclass objective, trained on the GPU device.
lgbm_params = {
    'metric': 'softmax',
    'n_estimators': 4000,
    'objective': 'multiclass',
    'random_state': SEED,
    'learning_rate': 0.025,
    'min_child_samples': 150,
    'reg_alpha': 3e-5,
    'reg_lambda': 9e-2,
    'num_leaves': 20,
    'max_depth': 16,
    'colsample_bytree': 0.8,
    'subsample': 0.65,
    'subsample_freq': 2,
    'max_bin': 240,
    'device':'gpu'
}


# Keyword arguments for ExtraTreesClassifier.
# NOTE(review): no random_state is set here, so this model is presumably not reproducible between runs - confirm.
ETC_params = {
    'bootstrap':True,
    'criterion': 'entropy',
    'max_features': 0.55,
    'min_samples_leaf': 8,
    'min_samples_split': 4,
    'n_estimators': 150
}

# Keyword arguments for CatBoostClassifier (GPU training, silent logging).
# The target has more than two classes (the LightGBM objective is 'multiclass'
# and the submission uses a 10-class probability matrix), so the loss must be
# 'MultiClass': 'Logloss' is CatBoost's binary-classification loss.
# NOTE(review): 'AUC' is kept as the evaluation metric - confirm it is the
# metric you want to monitor for a multiclass target.
# NOTE(review): 50000 trees with no eval set means no early stopping - confirm
# this budget is intended ('iterations': 5000 was the earlier value).
cat_params = {
    # 'iterations': 5000,
    'eval_metric': 'AUC',
    'loss_function': 'MultiClass',
    'od_type': 'Iter',
    'num_trees': 50000,
    'max_depth': 6,
    'l2_leaf_reg': 3,
    'bootstrap_type': 'Bayesian',
    'max_bin': 254,
    'grow_policy': "Lossguide",
    'random_seed': 314,
    'min_data_in_leaf': 64,
    'verbose': None,
    'logging_level': 'Silent',
    'task_type': 'GPU',
}

## 📁 Training Imports

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter('ignore')

from sklearn import model_selection
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from mlxtend.classifier import StackingCVClassifier,EnsembleVoteClassifier

## 🔨 Initialize Meta Classification

In [None]:
# Let's train 3 models (LightGBM, ExtraTrees, CatBoost) and combine them by hard voting

cl1 = LGBMClassifier(**lgbm_params)
cl2 = ExtraTreesClassifier(**ETC_params)
cl3 = CatBoostClassifier(**cat_params)
# Hard Voting Ensemble: every classifier gets the same weight
S_eclf = EnsembleVoteClassifier(clfs=[cl1, cl2, cl3],
                              weights=[1, 1, 1], voting='hard')


In [None]:
# Stratified 8-fold cross-validation of the voting ensemble.
y_probs = []  # class probabilities for test_df, one array per fold
scores = []   # validation accuracy, one value per fold

# Seed the shuffling so the folds (and therefore the scores) are reproducible.
folds = StratifiedKFold(n_splits=8, shuffle=True, random_state=SEED)

for fold, (train_id, test_id) in enumerate(folds.split(X, y)):
    # test_id holds the positions of this fold's validation rows.
    X_train, y_train = X.iloc[train_id], y.iloc[train_id]
    X_valid, y_valid = X.iloc[test_id], y.iloc[test_id]

    S_eclf.fit(X_train, y_train)

    valid_pred = S_eclf.predict(X_valid)
    valid_score = accuracy_score(y_valid, valid_pred)

    print("Fold:", fold + 1, "Accuracy:", valid_score)

    scores.append(valid_score)

    # Save predictions to later submit the mean values
    y_probs.append(S_eclf.predict_proba(test_df))
    

In [None]:
# Average validation accuracy over all folds
print("Mean accuracy score:", np.mean(scores))

# 📖 Submission

In [None]:
# Average the per-fold test probabilities into one matrix.
y_prob = sum(y_probs) / len(y_probs)
# Add a small per-class offset before picking the most likely class.
# The explanations for these numbers are in AMBROSM's code
y_prob = y_prob + np.array([0, 0, 0.01, 0.03, 0, 0, 0, 0, 0, 0])
# Map the winning class positions back to the original labels.
y_pred_tuned = target_encoder.inverse_transform(y_prob.argmax(axis=1))
# Share of each predicted class in percent (displayed as the cell output).
(
    pd.Series(y_pred_tuned, index=test_df.index)
    .value_counts()
    .sort_index()
    / len(test_df)
    * 100
)

In [None]:
# Read the sample submission, replace its target column with the tuned predictions and save it.
sub = (
    dd.read_csv("../input/tabular-playground-series-feb-2022/sample_submission.csv")
    .compute()
    .assign(target=y_pred_tuned)
)
sub.to_csv("submission.csv", index=False)

<h4> Notebook in progress Upvote if you like :) </h4>
