In [None]:
%%capture
# Install/upgrade LightAutoML; %%capture hides the installer output.
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the package is importable by the cells below.
%pip install -U lightautoml

In [None]:
import os
import time

# Essential DS libraries
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import torch

from sklearn.preprocessing import LabelEncoder

# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task
from lightautoml.automl.base import AutoML
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.ml_algo.tuning.optuna import OptunaTuner
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector,ModelBasedImportanceEstimator
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task
from lightautoml.automl.blend import WeightedBlender

In [None]:
# ---- Run configuration: every tunable constant of the notebook ----

# Data and split settings
TARGET_NAME = 'Target'   # column holding the class label
TEST_SIZE = 0.2          # fraction of the labelled rows held out for a local check
RANDOM_STATE = 42        # seed for the hold-out split and the reader

# Training resources
N_THREADS = 4            # passed to every LightGBM model as `num_threads`
N_FOLDS = 20             # passed to the reader as `cv`
TIMEOUT = 5 * 60 * 60    # 18000; not referenced by the cells visible in this notebook

In [None]:
# Labelled rows: source of both the training part and the local hold-out part.
train = pd.read_csv(
    '/kaggle/input/playground-series-s4e6/train.csv',
)
# Rows that are scored at the end of the notebook to build the submission.
test = pd.read_csv(
    '/kaggle/input/playground-series-s4e6/test.csv',
)

In [None]:
%%time
# Hold out TEST_SIZE of the labelled rows for a local accuracy check.
# Stratifying on the target keeps the class proportions equal in both parts;
# RANDOM_STATE makes the split reproducible.
train_data, test_data = train_test_split(
    train,
    random_state=RANDOM_STATE,
    stratify=train[TARGET_NAME],
    test_size=TEST_SIZE,
)

In [None]:
# Problem definition shared by every model in the stack.
task = Task('multiclass')

# Reader for pandas input; cross-validation uses N_FOLDS folds, seeded for
# reproducibility.
reader = PandasToPandasReader(
    task,
    random_state=RANDOM_STATE,
    cv=N_FOLDS,
)

In [None]:
 %%time
# Feature pre-selection stage: a LightGBM model is fitted on simple features
# and its importances drive an importance-cutoff selector (cutoff=0).
pipe0 = LGBSimpleFeatures()
model0 = BoostLGBM(
    default_params={
        'learning_rate': 0.04999,
        'num_leaves': 78,
        'seed': 42,
        'num_threads': N_THREADS,
    }
)
mbie = ModelBasedImportanceEstimator()
selector = ImportanceCutoffSelector(pipe0, model0, mbie, cutoff=0)

In [None]:
# ---- Level 1 of the stack: two LightGBM models on the selected features ----
pipe = LGBSimpleFeatures()

# Optuna budget for model1: n_trials=100 and timeout=3000
# (timeout is presumably in seconds -- confirm against the OptunaTuner docs).
params_tuner1 = OptunaTuner(n_trials=100, timeout=3000)

# model1 is tuned by Optuna starting from these defaults.
model1 = BoostLGBM(
    default_params={
        'learning_rate': 0.05,
        'num_leaves': 128,
        'seed': 1,
        'num_threads': N_THREADS,
    }
)
# model2 is trained with its default parameters only (no tuner attached).
model2 = BoostLGBM(
    default_params={
        'learning_rate': 0.025,
        'num_leaves': 64,
        'seed': 2,
        'num_threads': N_THREADS,
    }
)

pipeline_lvl1 = MLPipeline(
    [
        (model1, params_tuner1),
        model2,
    ],
    pre_selection=selector,
    features_pipeline=pipe,
    post_selection=None,
)

In [None]:
%%time
# ---- Level 2 of the stack: a single LightGBM model, no feature selection ----
pipe1 = LGBSimpleFeatures()

# freeze_defaults=True is passed so the parameters below are used as given
# (see the BoostLGBM documentation for the exact semantics).
model = BoostLGBM(
    default_params={
        'learning_rate': 0.05,
        'num_leaves': 66,
        'max_bin': 1024,
        'seed': 3,
        'num_threads': N_THREADS,
    },
    freeze_defaults=True,
)

pipeline_lvl2 = MLPipeline(
    [model],
    pre_selection=None,
    features_pipeline=pipe1,
    post_selection=None,
)

In [None]:
 %%time
# Assemble the two-level stack: each inner list is one level of pipelines.
# NOTE(review): skip_conn=False presumably means level 2 does not receive the
# raw level-1 input features -- confirm against the AutoML documentation.
automl = AutoML(
    reader,
    [
        [pipeline_lvl1],  # first level
        [pipeline_lvl2],  # second level
    ],
    skip_conn=False,
)

In [None]:
 %%time
# Fit the whole stack on the training part. TARGET_NAME is the label column and
# the 'id' column is dropped from the features. The result holds the
# out-of-fold predictions for the training rows.
oof_pred = automl.fit_predict(
    train_data,
    roles={
        'target': TARGET_NAME,
        'drop': ['id'],
    },
)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))


In [None]:
%%time
# Report feature importances for the selector and for individual algorithms of
# the fitted stack. Each entry pairs a message template with a lazy getter so
# the scores are fetched right before they are printed, one report at a time.
importance_reports = (
    ('Feature importances of selector:\n{}',
     lambda: selector.get_features_score()),
    ('Feature importances of top level algorithm:\n{}',
     lambda: automl.levels[-1][0].ml_algos[0].get_features_score()),
    ('Feature importances of lowest level algorithm - model 0:\n{}',
     lambda: automl.levels[0][0].ml_algos[0].get_features_score()),
    ('Feature importances of lowest level algorithm - model 1:\n{}',
     lambda: automl.levels[0][0].ml_algos[1].get_features_score()),
)
for report_template, fetch_scores in importance_reports:
    print(report_template.format(fetch_scores()))
    print('=' * 70)


In [None]:
# print(automl.create_model_str_desc())

In [None]:
%%time
# Score the local hold-out part (test_data comes from the train/hold-out split,
# not from the competition test file) and show the raw prediction object.
test_pred = automl.predict(
    test_data
)
test_pred

In [None]:
%%time
# Build {probability-column index: original label} from the fitted reader
# instead of hard-coding the column order.
# NOTE(review): relies on `reader.class_mapping` being {original label: index}
# for multiclass targets, as described in the LightAutoML reader docs --
# confirm for the installed version.
fitted_label_encoding = getattr(automl.reader, 'class_mapping', None)
if fitted_label_encoding:
    class_mapping = {
        int(column_index): label
        for label, column_index in fitted_label_encoding.items()
    }
else:
    # No encoding recorded on the reader: keep the previously assumed order.
    class_mapping = {0: 'Graduate', 1: 'Dropout', 2: 'Enrolled'}

# Hold-out part: most probable class per row, decoded to its label.
te_pred = np.argmax(np.array(test_pred.data), axis=1)
final_test_pred = [class_mapping[value] for value in te_pred]

# Out-of-fold predictions on the training part, decoded the same way.
oof_pred_ = np.argmax(np.array(oof_pred.data), axis=1)
final_oof_pred = [class_mapping[value] for value in oof_pred_]

In [None]:
 %%time
# Show the raw hold-out predictions (computed in an earlier cell), then compare
# the decoded labels with the true ones. "TEST" here is the local hold-out
# part, not the competition test file.
print('Prediction for test data:\n{}\nShape = {}'.format(test_pred, test_pred.shape))
print('Check scores...')

oof_accuracy = accuracy_score(train_data[TARGET_NAME].values, final_oof_pred)
print('OOF score: {}'.format(oof_accuracy))

holdout_accuracy = accuracy_score(test_data[TARGET_NAME].values, final_test_pred)
print('TEST score: {}'.format(holdout_accuracy))


## Final Prediction on test set

In [None]:
%%time
# Predict class probabilities for the rows of the competition test file.
preds = automl.predict(
    test
)

In [None]:
%%time
# Index of the highest-scoring class for every row of the prediction matrix.
final_preds = np.array(preds.data).argmax(axis=1)

In [None]:
%%time
# Translate each predicted class index into its label via class_mapping.
final_prediction = [
    class_mapping[class_index]
    for class_index in final_preds
]

In [None]:
%%time
# Use the sample submission as the template for the output file.
# NOTE(review): assumes its rows are in the same order as test.csv -- confirm.
submit = pd.read_csv('/kaggle/input/playground-series-s4e6/sample_submission.csv')

# Item assignment always writes the column. Attribute assignment
# (`submit.Target = ...`) only updates a column that already exists and would
# otherwise silently set a plain Python attribute, leaving the file unchanged.
submit['Target'] = final_prediction

submit.to_csv('submission.csv', index=False)

# Preview the first rows instead of rendering the whole frame.
submit.head()