## Official LightAutoML github repository is [here](https://github.com/sberbank-ai-lab/LightAutoML)

## Upvote is the best motivator 👍

# Step 0.0. LightAutoML installation

This step can be used if you are working inside Google Colab/Kaggle kernels or want to install LightAutoML on your machine:

In [None]:
!pip install -U lightautoml

# Step 0.1. Import libraries

Here we will import the libraries we use in this kernel:
- Standard python libraries for timing, working with OS etc.
- Essential python DS libraries like numpy, pandas, scikit-learn and torch (the last we will use in the next cell)
- LightAutoML modules: presets for AutoML, task and report generation module

In [None]:
# Standard python libraries
import os
import time

# Essential DS libraries
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import torch
import matplotlib.pyplot as plt
import seaborn as sns

# LightAutoML presets, task and report generation
from lightautoml.automl.presets.text_presets import TabularNLPAutoML
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.report.report_deco import ReportDeco
from lightautoml.tasks import Task
from lightautoml.report.report_deco import ReportDecoNLP

# Step 0.2. Constants

Here we setup the constants to use in the kernel:
- `N_THREADS` - number of vCPUs for LightAutoML model creation
- `N_FOLDS` - number of folds in LightAutoML inner CV
- `RANDOM_STATE` - random seed for better reproducibility
- `TEST_SIZE` - holdout data part size (not set in this kernel)
- `TIMEOUT` - limit in seconds for model to train
- `TARGET_NAME` - target column name in dataset

In [None]:
# Kernel-wide settings shared by the cells below.
N_THREADS = 4  # thread count given to torch and to LightAutoML (cpu_limit / n_jobs)
N_FOLDS = 2  # inner CV folds; only referenced from commented-out settings in this kernel
RANDOM_STATE = 22  # seed for numpy's global random generator
TIMEOUT =  2*3600  # training time budget in seconds (2 hours)
TARGET_NAME = 'target'  # name of the target column

# Step 0.3. Imported models setup

For better reproducibility we fix the numpy random seed and limit the number of threads Torch may use (by default it usually tries to use all the threads on the server):

In [None]:
# Seed numpy's global RNG and cap the number of threads torch may use.
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)   

# Step 0.4. Data loading
Let's check the data we have:

In [None]:
# Load the train/test tables and the submission template from the Kaggle input directory.
train_data = pd.read_csv('../input/lightautoml-course-finals/train.csv')
test_data = pd.read_csv('../input/lightautoml-course-finals/test.csv')
sample_sub = pd.read_csv('../input/lightautoml-course-finals/sample_submission.csv')

# Data shift 

In [None]:

from scipy import stats
from scipy.stats import mannwhitneyu


def shift_info(train_data, test_data):
    """Split columns into categorical/numeric and flag train/test distribution shift.

    A column is treated as categorical when its number of unique values is
    below 1% of the number of train rows; the text columns ``text_0`` and
    ``text_1`` are never treated as categorical.  Every non-categorical column
    present in both frames (except the text columns and ``row_id``) is compared
    with a two-sample t-test and a Mann-Whitney U test, and is reported as
    shifted only when both p-values are below 1e-6.

    Categorical columns are skipped *before* the tests are run, so
    string-valued categoricals no longer crash the statistical tests.

    :param train_data: training dataframe.
    :param test_data: test dataframe.
    :return: tuple ``(cat_cols, num_cols, sig_diff_col)`` of column-name lists.
    """
    text_cols = ('text_0', 'text_1')

    # Select categorical and numeric columns.
    cat_cols = []
    num_cols = []
    for col in train_data.columns:
        # Share of unique values, in percent of the number of rows.
        score_cat = train_data[col].nunique() / len(train_data[col]) * 100
        if score_cat < 1 and col not in text_cols:
            cat_cols.append(col)
        else:
            num_cols.append(col)

    print(len(cat_cols))

    cat_lookup = set(cat_cols)
    excluded = set(text_cols) | {'row_id'}
    sig_diff_col = []

    for col in train_data.columns:
        if col not in test_data.columns or col in excluded:
            continue

        print(col)
        if col in cat_lookup:
            # Categorical columns are never reported, so do not test them.
            continue

        train_values = train_data[col].values
        test_values = test_data[col].values
        p = stats.ttest_ind(train_values, test_values)[1]
        p_2 = mannwhitneyu(train_values, test_values)[1]

        if p < 0.000001 and p_2 < 0.000001:
            print(col, 'p:', p, 'p_2:', p_2)
            print('mean',train_data[col].mean(),'std:', train_data[col].std())
            print('mean',test_data[col].mean(),'std:', test_data[col].std())

            sig_diff_col.append(col)

    return cat_cols, num_cols, sig_diff_col
       
# cat_cols, num_cols, sig_diff_col = shift_info(train_data, test_data) 

In [None]:
# shift_info(train_data, test_data)

In [None]:
import sys

def optimize_dataframe(df):
    """Return a memory-optimized copy of a pandas dataframe.

    - Integer and float columns are downcast with ``pd.to_numeric``.
    - Object columns whose unique/total ratio is below 0.5 (i.e. each value
      repeats at least twice on average) are converted to ``category``.
    - Every other column is carried over unchanged.

    The original column order and index are preserved, and an empty frame is
    handled without dividing by zero.

    :param df: dataframe to optimize (not modified).
    :return: new dataframe with the same columns in the same order.
    """
    int_types = ['int', 'int16', 'int32', 'int64']
    float_types = ['float', 'float16', 'float32', 'float64']

    old_size = sys.getsizeof(df)
    n_rows = len(df)

    optimized = {}
    for col_name in df.columns:
        column = df[col_name]
        col_type = column.dtype

        if col_type in int_types:
            column = pd.to_numeric(column, downcast='integer')
        elif col_type in float_types:
            column = pd.to_numeric(column, downcast='float')
        elif col_type == 'object' and n_rows and column.nunique() / n_rows < 0.5:
            column = column.astype('category')

        optimized[col_name] = column

    # A dict keeps insertion order, so columns stay where they were.
    df_opt = pd.DataFrame(optimized, index=df.index)

    new_size = sys.getsizeof(df_opt)
    print('optimize dataframe ({} to {}, ratio: {})'.format(old_size, new_size, round(old_size/new_size, 2)))

    return df_opt

In [None]:
# Quick look at the raw training data: its shape and first three rows.
print(train_data.shape)
train_data.head(3)

# Basic Feature generation

In [None]:
def new_features(data):
    """Add hand-crafted combinations of numeric ``feat_*`` columns.

    New columns (absolute differences, a signed difference, a product and
    row-wise means of small column groups) are written into ``data`` in place.

    :param data: dataframe holding the ``feat_*`` columns used below.
    :return: the same dataframe object, with the new columns appended.
    """
    f84, f85, f86 = data['feat_84'], data['feat_85'], data['feat_86']

    data['feat_6_7_diff'] = (data['feat_6'] - data['feat_7']).abs()

    data['feat_84__feat_86'] = f84 - f86
    data['feat_84_85_86_mean'] = (f84 + f85 + f86) / 3
    data['feat_86_85_84_multi'] = f86 * f85 * f84

    data['feat_90__feat_91'] = (data['feat_90'] - data['feat_91']).abs()

    # Row-wise means of neighbouring feature groups: new column -> source columns.
    mean_groups = [
        ('feat_90_91_mean', ['feat_90', 'feat_91']),
        ('feat_93_94_95_mean', ['feat_93', 'feat_94', 'feat_95']),
        ('feat_96_98_mean', ['feat_96', 'feat_98']),
        ('feat_99_100_101_mean', ['feat_99', 'feat_100', 'feat_101']),
        ('feat_102_103_104_mean', ['feat_102', 'feat_103', 'feat_104']),
    ]
    for new_name, source_cols in mean_groups:
        data[new_name] = data[source_cols].mean(axis=1)

    data['feat_84_85_86___96_98_diff'] = data['feat_96_98_mean'] - data['feat_84_85_86_mean']

    return data
    
    
# Add the same hand-crafted numeric features to both train and test.
train_data = new_features(train_data)
test_data = new_features(test_data)

In [None]:
from tqdm import tqdm


def add_featreus_1(data):
    """Add whitespace-token length features for ``text_0`` and ``text_1``.

    Adds, in place, the token count and distinct-token count of each text
    column plus the ``text_0`` minus ``text_1`` difference of both counts.

    :param data: dataframe with string columns ``text_0`` and ``text_1``.
    :return: the same dataframe object, with six ``*_len*`` columns appended.
    """
    def n_tokens(text):
        return len(text.split())

    def n_distinct_tokens(text):
        return len(np.unique(text.split()))

    first, second = data['text_0'], data['text_1']

    data['text_0_len'] = first.apply(n_tokens)
    data['text_1_len'] = second.apply(n_tokens)
    data['text_len_diff'] = data['text_0_len'] - data['text_1_len']

    data['text_1_len_unique'] = second.apply(n_distinct_tokens)
    data['text_0_len_unique'] = first.apply(n_distinct_tokens)
    data['text_len_unique_diff'] = data['text_0_len_unique'] - data['text_1_len_unique']
    return data

def add_features_2(data):
    """Add whitespace-token count features for ``text_0`` and ``text_1``.

    Computes the same statistics as ``add_featreus_1`` but stores them under
    ``*_count*`` column names: token count, distinct-token count and the
    ``text_0`` minus ``text_1`` difference of each.  Columns are added in place.

    :param data: dataframe with string columns ``text_0`` and ``text_1``.
    :return: the same dataframe object, with six ``*_count*`` columns appended.
    """
    def count_words(text):
        return len(text.split())

    def count_distinct_words(text):
        return len(np.unique(text.split()))

    for side in ('0', '1'):
        data['text_' + side + '_count'] = data['text_' + side].apply(count_words)
    data['text_count_diff'] = data['text_0_count'] - data['text_1_count']

    # text_1 is processed before text_0 here to keep the resulting column order.
    for side in ('1', '0'):
        data['text_' + side + '_count_unique'] = data['text_' + side].apply(count_distinct_words)
    data['text_count_unique_diff'] = data['text_0_count_unique'] - data['text_1_count_unique']
    return data
 

def add_common_feat(data):
    """Add token-overlap features between ``text_0`` and ``text_1``.

    Each text is split on a double space and square brackets are stripped
    from the tokens.  With ``A`` and ``B`` the token sets of ``text_0`` and
    ``text_1``, the following columns are added in place:

    - ``text_common``: ``|A & B|``
    - ``text_diff``: ``|A | B|`` (size of the union, despite the name)
    - ``text_internal``: ``|(A | B) - (A & B)|`` (symmetric difference)
    - ``text_0_internal`` / ``text_1_internal``: tokens only in A / only in B
    - ``text_0_diff_uniq`` / ``text_1_diff_uniq``: number of repeated tokens
    - ``text_0_uniq_part`` / ``text_1_uniq_part``: ``(|A & B| + 1) / |A|`` and
      ``(|A & B| + 1) / |B|``

    The set expressions are parenthesised explicitly: ``-`` binds tighter than
    ``&``, which binds tighter than ``|``, so the unparenthesised forms made
    the ``*_internal`` columns always 0 and ``text_internal`` equal to the
    union size.

    :param data: dataframe with string columns ``text_0`` and ``text_1``.
    :return: the same dataframe object, with nine columns appended.
    """
    text_common_list, text_diff_list = [], []
    text_internal, text_0_internal, text_1_internal = [], [], []
    text_0_diff_uniq, text_1_diff_uniq = [], []
    text_0_uniq_part, text_1_uniq_part = [], []

    for x, y in tqdm(data[['text_0', 'text_1']].values):
        tokens_0 = [i.replace('[', '').replace(']', '') for i in x.split('  ')]
        tokens_1 = [i.replace('[', '').replace(']', '') for i in y.split('  ')]

        # Build each set once per row instead of once per feature.
        set_0, set_1 = set(tokens_0), set(tokens_1)
        common = set_0 & set_1
        union = set_0 | set_1

        text_common_list.append(len(common))
        text_diff_list.append(len(union))

        text_0_diff_uniq.append(len(tokens_0) - len(set_0))
        text_1_diff_uniq.append(len(tokens_1) - len(set_1))

        # str.split always yields at least one token, so the sets are never empty.
        text_0_uniq_part.append((len(common) + 1) / len(set_0))
        text_1_uniq_part.append((len(common) + 1) / len(set_1))

        text_internal.append(len(union - common))
        text_0_internal.append(len(set_0 - common))
        text_1_internal.append(len(set_1 - common))

    data['text_common'] = text_common_list
    data['text_diff'] = text_diff_list
    data['text_internal'] = text_internal
    data['text_0_internal'] = text_0_internal
    data['text_1_internal'] = text_1_internal
    data['text_0_diff_uniq'] = text_0_diff_uniq
    data['text_1_diff_uniq'] = text_1_diff_uniq
    data['text_0_uniq_part'] = text_0_uniq_part
    data['text_1_uniq_part'] = text_1_uniq_part

    return data


# Length features are computed on the raw text, before it is rewritten below.
train_data = add_featreus_1(train_data)
test_data = add_featreus_1(test_data)


# Drop the first and last character of every text and replace commas with spaces.
# NOTE(review): presumably the raw texts are bracketed, comma-separated lists -
# confirm against the data.
for col in ['text_0', 'text_1']:
    train_data[col] = train_data[col].map(lambda x: ' '.join(x[1:-1].split(',')))
    test_data[col] = test_data[col].map(lambda x: ' '.join(x[1:-1].split(',')))

train_data.head()

# Advanced Feature Engineering

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.decomposition import PCA

# TF-IDF vectorizer (at most 2000 features) fitted on both text columns of train and test.
vectorizer = TfidfVectorizer(max_features=2000)
vectorizer.fit(list(train_data['text_0'].values) + list(train_data['text_1'].values) + list(test_data['text_0'].values) + list(test_data['text_1'].values))
# Unfitted reducers passed to add_features_bags, which fits them in place when train=True.
pca = PCA(n_components = 6)
kbest = SelectKBest(chi2, k=1100)



def add_features_bags(data, pca_, kbest_, col = 'text_0', train = True, acc = 5):
    """Add TF-IDF summary statistics and PCA components for one text column.

    The module-level ``vectorizer`` turns ``data[col]`` into a dense TF-IDF
    matrix.  Its row-wise mean, max and std are stored in ``data`` in place;
    the matrix is then reduced with ``kbest_`` followed by ``pca_`` and the
    rounded components are appended as ``<col>_component_<i>`` columns.

    With ``train=True`` both ``kbest_`` (using ``data['target']``) and ``pca_``
    are fitted **in place**, so callers must pass a separate pair of
    estimators for every text column.  With ``train=False`` they are only
    applied.

    :param data: dataframe holding ``col`` (and ``target`` when ``train``).
    :param pca_: PCA-like estimator with ``fit`` / ``transform``.
    :param kbest_: feature selector with ``fit_transform`` / ``transform``.
    :param col: name of the text column to process.
    :param train: fit the estimators when True, only transform otherwise.
    :param acc: number of decimals kept in the PCA components.
    :return: tuple ``(data, pca_, kbest_)``; ``data`` is a new dataframe with
        a fresh 0..n-1 index.
    """
    print(len(data), 'data 1')
    bags = vectorizer.transform(list(data[col].values)).toarray()
    print('Извлек фичи')
    means = bags.mean(axis=1)
    maxs = bags.max(axis=1)
    stds = bags.std(axis=1)

    if train:
        bags = kbest_.fit_transform(bags, data['target'])
        pca_.fit(bags)
    else:
        bags = kbest_.transform(bags)

    components = np.round(pca_.transform(bags), acc)
    del bags

    # Name the columns after however many components the PCA actually
    # produced, and reuse the index of ``data`` so that the concat below pairs
    # rows by position even when ``data`` does not have a default index.
    component_names = [col + '_component_' + str(i + 1) for i in range(components.shape[1])]
    components = pd.DataFrame(components, index=data.index, columns=component_names)

    assert len(data) == len(means)
    data[col + '_' + 'bag_of_words_mean'] = means
    data[col + '_' + 'bag_of_words_max'] = maxs
    data[col + '_' + 'bag_of_words_std'] = stds
    del stds, maxs, means

    assert len(data) == len(components)
    data = pd.concat([data, components], axis=1).reset_index(drop = True)
    assert len(data) == len(components)

    print(data.shape, 'data 2')
    return data, pca_, kbest_

In [None]:
# Shrink both frames before building the dense TF-IDF matrices.
train_data = optimize_dataframe(train_data) # Optimize the numeric storage format of the pandas dataframe
test_data = optimize_dataframe(test_data)

In [None]:
from sklearn.base import clone

# add_features_bags fits the estimators it receives in place and returns the
# same objects.  Each text column therefore gets its own unfitted copies;
# sharing one pair would leave pca_f_0 / kbest_0 holding the text_1 fit.
print(train_data.shape)
train_data, pca_f_0, kbest_0 = add_features_bags(train_data, pca_ = clone(pca), kbest_ = clone(kbest), col = 'text_0')
print(train_data.shape)
train_data, pca_f_1, kbest_1 = add_features_bags(train_data, pca_ = clone(pca), kbest_ = clone(kbest), col = 'text_1')
print(train_data.shape)
train_data.head()

In [None]:
# Re-optimize after the new train columns were added.
train_data = optimize_dataframe(train_data)
test_data = optimize_dataframe(test_data)

In [None]:
# Apply the estimators returned by the train step to the test set (train=False: transform only).
print(test_data.shape)
test_data, _, _= add_features_bags(test_data, pca_ = pca_f_0, kbest_ = kbest_0, col = 'text_0', train = False) 
print(test_data.shape)
test_data, _, _= add_features_bags(test_data, pca_ = pca_f_1, kbest_ = kbest_1, col = 'text_1', train = False) 
print(test_data.shape)
# test_data.head()

In [None]:
# Re-optimize, then add token-count and token-overlap features on the cleaned texts.
train_data = optimize_dataframe(train_data)
test_data = optimize_dataframe(test_data)


train_data = add_features_2(train_data)
test_data = add_features_2(test_data)


train_data= add_common_feat(train_data)
test_data= add_common_feat(test_data)

train_data.head()

In [None]:
# ``scipy.stats.stats`` is a deprecated private namespace; import from the public module.
from scipy.stats import pearsonr
from scipy.spatial import distance

# Rebind the module-level vectorizer to a larger TF-IDF model (at most 3000
# features), again fitted on both text columns of train and test.
vectorizer = TfidfVectorizer(max_features=3000)
vectorizer.fit(list(train_data['text_0'].values) + list(train_data['text_1'].values) + list(test_data['text_0'].values) + list(test_data['text_1'].values))

def add_common_feat_adv(data):
    """Add similarity features between the TF-IDF vectors of the two texts.

    ``text_0`` and ``text_1`` are vectorized with the module-level
    ``vectorizer`` into dense matrices ``a`` and ``b``.  Columns added to
    ``data`` in place:

    - ``v_mean_mean`` / ``v_mean_max`` / ``v_mean_std``: row statistics of
      ``(a + b) / 2``
    - ``v_mean``: row mean of the signed difference ``a - b``
    - ``v_mean_squre``: row mean of ``(a - b) ** 2``
    - ``corr_pear_coef``: Pearson correlation of the two rows
    - ``corr_pear_p``: log of the Pearson p-value (offset to avoid log(0))
    - ``coss_dist_mean``: cosine distance of the two rows

    The difference matrix is built once and squared in place rather than
    being computed twice.

    :param data: dataframe with string columns ``text_0`` and ``text_1``.
    :return: the same dataframe object, with the new columns appended.
    """
    a = vectorizer.transform(list(data['text_0'].values)).toarray()
    b = vectorizer.transform(list(data['text_1'].values)).toarray()

    print('Извлек фичи')
    v_mean = (a + b)/2  # shape: rows x vectorizer features
    v_mean_mean = v_mean.mean(axis=1)
    v_mean_max = v_mean.max(axis=1)
    v_mean_std = v_mean.std(axis=1)

    assert len(data) == len(v_mean_mean) == len(v_mean_max)
    data['v_mean_mean'] = v_mean_mean
    data['v_mean_max'] = v_mean_max
    data['v_mean_std'] = v_mean_std
    del v_mean_mean, v_mean_max, v_mean_std, v_mean

    diff = a - b
    # NOTE(review): this is the mean of the *signed* differences, not a
    # Manhattan distance (no absolute value) - confirm that is intended.
    manht_dist = diff.mean(axis=1)

    # Square in place: same values as (a - b)**2 without a second big matrix.
    np.square(diff, out=diff)
    evclid_dist = diff.mean(axis=1)
    del diff

    assert len(data) == len(manht_dist) == len(evclid_dist)
    data['v_mean'] = manht_dist
    data['v_mean_squre'] = evclid_dist
    del manht_dist, evclid_dist

    # Per-row statistics computed in a Python loop (flagged by the original
    # author as the heaviest part).
    corr_pear_coef_list, corr_pear_p_list, coss_dist_list = [], [], []
    for row_a, row_b in tqdm(zip(a, b), total=len(a)):
        per = pearsonr(row_a, row_b)
        corr_pear_coef_list.append(per[0])
        # The tiny offset keeps the logarithm finite when the p-value is 0.
        corr_pear_p_list.append(np.log(per[1] +(1/10)**100))
        coss_dist_list.append(distance.cosine(row_a, row_b))

    assert len(data) == len(corr_pear_p_list) == len(coss_dist_list)
    data['corr_pear_coef'] = corr_pear_coef_list
    data['corr_pear_p'] = corr_pear_p_list
    data['coss_dist_mean'] = coss_dist_list

    return data

# Re-optimize both frames before the similarity features are computed.
train_data = optimize_dataframe(train_data)
test_data = optimize_dataframe(test_data)

def make_by_pice(data, func, step = 50000):
    """Apply ``func`` to ``data`` in consecutive row chunks and re-assemble.

    Processing by chunks bounds the memory ``func`` needs at any one time.
    ``func`` is called once per non-empty chunk of at most ``step`` rows.  An
    empty ``data`` still yields a single call so that ``func`` can add its
    columns.  Each chunk is passed as a copy, so ``func`` may add columns to
    it without touching ``data``.

    :param data: dataframe to process.
    :param func: callable taking a dataframe chunk and returning a dataframe
        with the same number of rows.
    :param step: maximum number of rows per chunk.
    :return: concatenation of all processed chunks, in the original order.
    """
    # range(0, len, step) visits exactly the chunk starts that hold rows;
    # max(..., 1) keeps one (empty) chunk for an empty frame.
    chunk_starts = range(0, max(len(data), 1), step)
    processed = [func(data.iloc[start:start + step].copy()) for start in chunk_starts]

    full_data = pd.concat(processed)

    print(len(full_data), len(data))
    assert len(full_data) == len(data)
    return full_data



In [None]:
# Compute the TF-IDF similarity features chunk by chunk (default chunk size of make_by_pice).
print(train_data.shape)
train_data = make_by_pice(train_data, add_common_feat_adv)
print(train_data.shape)
test_data = make_by_pice(test_data, add_common_feat_adv)
train_data.head(3)

In [None]:
# Drop the feature-engineering models, which are not used in the cells below.
del pca_f_1, pca_f_0, pca, vectorizer, _, kbest_0, kbest_1

In [None]:
# ## sub_cor = train_data[cat_cols].corr()


# plt.figure(figsize=(30,30))
# print('plot heat map')
# g=sns.heatmap(train_data[num_cols + ['target']].corr(), annot=True, cmap="RdYlGn", vmin=0.1)
# #Verdict : Glucose, BMI and Age are having high correlation with diabetes

# Step 0.5. Pseudolabelling
Let's import the predictions from the NLP benchmark and use them in pseudolabelling technique:

In [None]:
# Predictions of an earlier run, used below as pseudolabels for the test rows.
pseudolabels = pd.read_csv('../input/pseudalabels/lightautoml_NLP_benchmark_with_pseudolabelling.csv')
pseudolabels.head()

In [None]:
# Assigned positionally (.values): assumes pseudolabels rows are in the same
# order as test_data - TODO confirm.
test_data[TARGET_NAME] = pseudolabels[TARGET_NAME].values
del pseudolabels
train_data = optimize_dataframe(train_data)
test_data = optimize_dataframe(test_data)

In [None]:
# Stack train and pseudolabelled test rows into one training frame (train rows first).
print(train_data.shape, test_data.shape)
ALL_DF = pd.concat([train_data, test_data], axis=0).reset_index(drop = True)
print(ALL_DF.shape)

# Train rows get a weight just above 1 and pseudolabelled rows just below 1;
# f1_metric uses `sample_weight > 1` to score the real train rows only.
ALL_DF['weight'] = [1.001] * len(train_data) + [0.999] * len(test_data)
ALL_DF = optimize_dataframe(ALL_DF)
# del train_data, test_data

In [None]:
# Column dtypes and memory footprint of the combined frame.
ALL_DF.info()

# =============== LightAutoML model building ===============


# Step 1. Task setup

In the cell below we create the Task object - the class that sets up which task the LightAutoML model should solve, with a specific loss and metric if necessary (more info can be found [here](https://lightautoml.readthedocs.io/en/latest/generated/lightautoml.tasks.base.Task.html#lightautoml.tasks.base.Task) in our documentation):

In [None]:
def f1_metric(y_true, y_pred, sample_weight=None, **kwargs):
    """F1 score at a 0.5 threshold, computed on rows with weight above 1.

    In this kernel rows with ``sample_weight > 1`` are the real train rows,
    while pseudolabelled rows carry a weight below 1 and are ignored by the
    metric.  When no weights are supplied, every row is scored.

    :param y_true: array of true binary labels.
    :param y_pred: array of predicted probabilities.
    :param sample_weight: optional array of row weights, or None.
    :param kwargs: extra keyword arguments forwarded to ``f1_score``.
    :return: F1 score of the thresholded predictions.
    """
    y_label = (y_pred > 0.5).astype(int)
    if sample_weight is None:
        return f1_score(y_true, y_label, **kwargs)

    mask = (sample_weight > 1)
    return f1_score(y_true[mask], y_label[mask], **kwargs)

# Binary classification task scored with the custom F1 metric defined above (higher is better).
task = Task('binary', metric = f1_metric, greater_is_better = True)

# Step 2. Feature roles setup

To solve the task, we need to setup columns roles. The **only role you must setup is target role**, everything else (drop, numeric, categorical, group, weights etc.) is up to user - LightAutoML models have automatic columns typization inside:

In [None]:
# Column roles for LightAutoML: target, raw text columns, per-row weights and
# the columns excluded from training.
roles = {'target': 'target',
         'text': ['text_0', 'text_1'],
         'weights': 'weight',  # for pseudolabelling
         'drop' : ['row_id',  'feat_52', 'feat_53', 'feat_54', 'feat_55', 'feat_66', 'feat_81' , 'feat_69' ,
                   'feat_41', 'feat_39', 'feat_67', 'feat_82', 'feat_68', 'feat_83', 
                   'feat_27', 'feat_13', 'feat_11', 'feat_26', 'feat_12', 
                   'feat_51', 'feat_36', 'feat_40', 'feat_38', 'feat_63', 'feat_65','feat_56', "feat_80"
                  # 'feat_54', 'feat_77'
                   
                   # 'feat_9', 'feat_6', 'feat_20'  # IMHO not needed, starting from feat_9
                   
                  ]  # + sig_diff_col_to_drop # sig_diff_col # 77 80 54 99 (45 61) ---38 25 79 65 27  # feat_40 (Artem)
         }

# Step 3. LightAutoML model creation - TabularAutoML preset

In the next cell we are going to create a LightAutoML model with the `TabularAutoML` class - a preset with a default model structure like in the image below:

<img src="https://github.com/sberbank-ai-lab/lightautoml-datafest-workshop/raw/master/imgs/tutorial_blackbox_pipeline.png" alt="TabularAutoML preset pipeline" style="width:70%;"/>

in just several lines. Let's discuss the params we can setup:
- `task` - the type of the ML task (the only **must have** parameter)
- `timeout` - time limit in seconds for model to train
- `cpu_limit` - vCPU count for model to use
- `reader_params` - parameter change for Reader object inside preset, which works on the first step of data preparation: automatic feature typization, preliminary almost-constant features, correct CV setup etc. For example, we setup `n_jobs` threads for typization algo, `cv` folds and `random_state` as inside CV seed.
- `general_params` - we use `use_algos` key to setup the model structure to work with (Linear and LGBM model on the first level and their weighted composition creation on the second). This setup is only to speedup the kernel, you can remove this `general_params` setup if you want the whole LightAutoML model to run.

**Important note**: `reader_params` key is one of the YAML config keys, which is used inside `TabularAutoML` preset. [More details](https://github.com/sberbank-ai-lab/LightAutoML/blob/master/lightautoml/automl/presets/tabular_config.yml) on its structure with explanation comments can be found on the link attached. Each key from this config can be modified with user settings during preset object initialization. To get more info about different parameters setting (for example, ML algos which can be used in `general_params->use_algos`) please take a look at our [article on TowardsDataScience](https://towardsdatascience.com/lightautoml-preset-usage-tutorial-2cce7da6f936).

Moreover, to receive the automatic report for our model we will use `ReportDeco` decorator and work with the decorated version in the same way as we do with usual one. 

In [None]:
# NLP-aware preset: tuned LightGBM and CatBoost on the first level.  The reader
# seed comes from RANDOM_STATE so the kernel has a single place to change it.
automl = TabularNLPAutoML(task = task, 
                           timeout = TIMEOUT,
                           cpu_limit = N_THREADS,
                           general_params = {'nested_cv':False, 'use_algos': [['lgb_tuned', 'cb_tuned']]}, # linear_l2
                           selection_params = {'mode': 1},
                           reader_params = {'n_jobs': N_THREADS, 'random_state': RANDOM_STATE}, # 'cv': N_FOLDS, 
                           #nested_cv_params= {'cv': N_FOLDS,'n_folds': True , 'inner_tune': True }, #refit_tuner: True
                           linear_pipeline_params = {'text_features': "tfidf"},
                           gbm_pipeline_params = {'text_features': "embed"},
                           text_params = {'lang': 'multi'}
                          )

# Fit on train + pseudolabelled test rows and collect out-of-fold predictions.
oof_pred = automl.fit_predict(ALL_DF, roles = roles)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))

In [None]:
# ALL_DF was built with the train rows first, so the first len(train_data)
# predictions belong to the real train rows.
oof_pred_train = oof_pred.data[:len(train_data), 0]

In [None]:
# Out-of-fold F1 on the train rows at the default 0.5 threshold.
print('OOF score: {}'.format(f1_score(train_data[TARGET_NAME].values, (oof_pred_train > 0.5).astype(int))))

# Step 5. Threshold optimization

Threshold 0.5 is not always the best option. Let's find the best one by OOF predictions and use it for holdout and test:

In [None]:
# Grid search over decision thresholds on the OOF predictions; the first
# threshold reaching the highest F1 is kept (strict `>` comparison).
best_sc = -1
best_w = None
for w in np.arange(0.35, 1.01, 0.0015):
    sc = f1_score(train_data[TARGET_NAME].values, (oof_pred_train > w).astype(int))
    if sc > best_sc:
        best_sc = sc
        best_w = w
        # The threshold is printed rounded to 2 decimals although the grid
        # step is 0.0015; best_w itself keeps the exact value.
        print(best_sc, round(best_w, 2))

In [None]:
# Out-of-fold F1 on the train rows at the threshold found above.
print('Check score with optimized threshold...')
print('OOF score: {}'.format(f1_score(train_data[TARGET_NAME].values, (oof_pred_train > best_w).astype(int))))

# Step 6. Prediction on test dataset

In the cell below we predict with the fitted `automl` model directly: we don't have the true target variable for the test dataset, so there is no point in including it in the report:

In [None]:
# Predict on the test rows with the fitted model.
test_pred = automl.predict(test_data)
print('Prediction for test data:\n{}\nShape = {}'
              .format(test_pred, test_pred.shape))

# Step 7. Create submission file

In [None]:
# Binarize the test predictions with the optimized threshold and write the submission file.
sample_sub[TARGET_NAME] = (test_pred.data[:, 0] > best_w).astype(int)
sample_sub.to_csv('lightautoml_NLP_benchmark_with_pseudolabelling.csv', index = False)

In [None]:
# Wrap the already fitted model in the report decorator; the wrapped model is
# reachable as automl_rd.model.
RD = ReportDeco(output_path = 'tabularAutoML_model_report')
automl_rd = RD(automl)

In [None]:
# Fast feature importances calculation
fast_fi = automl_rd.model.get_feature_scores('fast')
fast_fi.set_index('Feature')['Importance'].plot.bar(figsize = (30, 10), grid = True)\

In [None]:
# Reverse the row and column order of the importance table and show the
# 'Feature' values of the first 20 rows, i.e. the last 20 rows of fast_fi.
# NOTE(review): presumably these are the least important features - confirm
# the sort order returned by get_feature_scores.
fast_fi.iloc[::-1,::-1][:20]['Feature'].array

In [None]:
# accurate_fi = automl_rd.model.get_feature_scores('accurate')
# accurate_fi.set_index('Feature')['Importance'].plot.bar(figsize = (30, 10), grid = True)

# Additional materials

- [Official LightAutoML github repo](https://github.com/sberbank-ai-lab/LightAutoML)
- [LightAutoML documentation](https://lightautoml.readthedocs.io/en/latest)
- [Pseudolabelling in a nutshell](https://www.kaggle.com/c/tabular-playground-series-apr-2021/discussion/231738#1268903)

## Do not forget to upvote if you like the kernel 👍